From ec01e1d677354c832d6738c01ef94554e43b7051 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 30 Jan 2020 14:03:17 -0500 Subject: [PATCH 001/171] Add function to load demo data --- greenguard/demo.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 greenguard/demo.py diff --git a/greenguard/demo.py b/greenguard/demo.py new file mode 100644 index 0000000..369422b --- /dev/null +++ b/greenguard/demo.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- + +import logging +import os + +import pandas as pd + +LOGGER = logging.getLogger(__name__) + +S3_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/' +DEMO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'demo') + + +def _load_or_download(filename, dates): + filename += '.csv.gz' + file_path = os.path.join(DEMO_PATH, filename) + if os.path.exists(file_path): + return pd.read_csv(file_path, compression='gzip', parse_dates=[dates]) + + os.makedirs(DEMO_PATH, exist_ok=True) + url = S3_URL + filename + + LOGGER.info('Downloading %s from %s', filename, url) + data = pd.read_csv(url, compression='gzip', parse_dates=[dates]) + data.to_csv(file_path, index=False, compression='gzip') + + return data + + +def load_demo(): + """Load the demo included in the GreenGuard project. + The first time that this function is executed, the data will be downloaded + and cached inside the `greenguard/demo` folder. + Subsequent calls will load the cached data instead of downloading it again. + """ + target_times = _load_or_download('target_times', 'cutoff_time') + readings = _load_or_download('readings', 'timestamp') + + return target_times, readings From bfc24cd2abde6fd9181c374ad2cac6b38b86ece1 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 30 Jan 2020 14:04:24 -0500 Subject: [PATCH 002/171] Add functions to work with target_times --- greenguard/__init__.py | 5 +- greenguard/data.py | 225 ----------------------------------------- greenguard/targets.py | 155 ++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 228 deletions(-) delete mode 100644 greenguard/data.py create mode 100644 greenguard/targets.py diff --git a/greenguard/__init__.py b/greenguard/__init__.py index fbc6e9a..1eab417 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -8,7 +8,7 @@ import os -from greenguard.data import extract_readings, make_targets +from greenguard.demo import load_demo from greenguard.pipeline import GreenGuardPipeline, get_pipelines _BASE_PATH = os.path.abspath(os.path.dirname(__file__)) @@ -18,6 +18,5 @@ __all__ = ( 'GreenGuardPipeline', 'get_pipelines', - 'extract_readings', - 'make_targets' + 'load_demo' ) diff --git a/greenguard/data.py b/greenguard/data.py deleted file mode 100644 index 21e0b7d..0000000 --- a/greenguard/data.py +++ /dev/null @@ -1,225 +0,0 @@ -"""Data module. - -This module contains functions to work directly with GreenGuard data in raw format. - -This raw format has the following characteristics: - - * All the data from all the turbines is inside a single folder. - * Inside the data folder, a folder exists for each turbine. - This folders are named exactly like each turbine id, and inside it one or more - CSV files can be found. The names of these files is not relevant. - * Each CSV file will have the the following columns: - - * timestamp: timestemp of the reading. - * signal: name or id of the signal. - * value: value of the reading. 
-""" - -import logging -import os -import warnings -from datetime import datetime - -import numpy as np -import pandas as pd - -LOGGER = logging.getLogger(__name__) - - -def make_targets(target_times, window_size, target, new_targets=None): - target_times = target_times.sort_values('cutoff_time', ascending=True) - cutoff_times = target_times.cutoff_time - window_size = pd.to_timedelta(window_size) - original_size = len(target_times) - current_size = original_size - new_targets = new_targets or current_size - - for index in range(len(cutoff_times) - 1): - timestamp = cutoff_times.iloc[index] - next_time = cutoff_times.iloc[index + 1] - - if timestamp + (window_size * 2) >= next_time: - continue - - span_start = timestamp + window_size - span_end = next_time - window_size - span_length = (span_end - span_start).total_seconds() - - delay = pd.to_timedelta(np.random.randint(span_length), unit='s') - cutoff_time = span_start + delay - - target_times = target_times.append(pd.Series({ - 'turbine_id': target_times.iloc[index].turbine_id, - 'cutoff_time': cutoff_time, - 'target': target - }), ignore_index=True) - - current_size = len(target_times) - if current_size == original_size + new_targets: - return target_times.sort_values('cutoff_time', ascending=True) - - if current_size == original_size: - warnings.warn('There is no space left between to add more targets.') - return target_times - - new_targets = new_targets - (current_size - original_size) - return make_targets(target_times, window_size, target, new_targets) - - -def _filter_by_filename(target_times, filenames): - max_csv = target_times.end.dt.strftime('%Y-%m-.csv') - min_csv = target_times.start.dt.strftime('%Y-%m-.csv') - - for filename in filenames: - if ((min_csv <= filename) & (filename <= max_csv)).any(): - yield filename - - -def _load_readings_file(turbine_file): - LOGGER.info('Loading file %s', turbine_file) - data = pd.read_csv(turbine_file) - data.columns = data.columns.str.lower() - data.rename(columns={'signal': 'signal_id'}, inplace=True) - - if 'unnamed: 0' in data.columns: - # Someone forgot to drop the index before - # storing the DataFrame as a CSV - del data['unnamed: 0'] - - LOGGER.info('Loaded %s readings from file %s', len(data), turbine_file) - - return data - - -def _filter_by_signal(data, signals): - if signals is not None: - LOGGER.info('Filtering by signal') - data = data[data.signal_id.isin(signals.signal_id)] - - LOGGER.info('Selected %s readings by signal', len(data)) - - return data - - -def _filter_by_timestamp(data, target_times): - LOGGER.info('Parsing timestamps') - timestamps = pd.to_datetime(data['timestamp'], format='%m/%d/%y %H:%M:%S') - data['timestamp'] = timestamps - - LOGGER.info('Filtering by timestamp') - - related = [False] * len(timestamps) - for row in target_times.itertuples(): - related |= (row.start <= timestamps) & (timestamps <= row.end) - - data = data[related] - - LOGGER.info('Selected %s readings by timestamp', len(data)) - - return data - - -def _load_turbine_readings(readings_path, target_times, signals): - turbine_id = target_times.turbine_id.iloc[0] - turbine_path = os.path.join(readings_path, turbine_id) - filenames = sorted(os.listdir(turbine_path)) - filenames = _filter_by_filename(target_times, filenames) - - readings = list() - for readings_file in filenames: - readings_file_path = os.path.join(turbine_path, readings_file) - data = _load_readings_file(readings_file_path) - data = _filter_by_signal(data, signals) - data = _filter_by_timestamp(data, target_times) - - 
readings.append(data) - - if readings: - readings = pd.concat(readings) - else: - readings = pd.DataFrame(columns=['timestamp', 'signal_id', 'value', 'turbine_id']) - - LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id) - - return readings - - -def _get_times(target_times, window_size): - cutoff_times = target_times.cutoff_time - if window_size: - window_size = pd.to_timedelta(window_size) - min_times = cutoff_times - window_size - else: - min_times = [datetime.min] * len(cutoff_times) - - return pd.DataFrame({ - 'turbine_id': target_times.turbine_id, - 'start': min_times, - 'end': cutoff_times, - }) - - -def _load_readings(readings_path, target_times, signals, window_size): - turbine_ids = target_times.turbine_id.unique() - - target_times = _get_times(target_times, window_size) - - readings = list() - for turbine_id in sorted(turbine_ids): - turbine_target_times = target_times[target_times['turbine_id'] == turbine_id] - LOGGER.info('Loading turbine %s readings', turbine_id) - turbine_readings = _load_turbine_readings(readings_path, turbine_target_times, signals) - turbine_readings['turbine_id'] = turbine_id - readings.append(turbine_readings) - - return pd.concat(readings) - - -def extract_readings(readings_path, target_times, signals=None, window_size=None): - """Extract raw readings data for the given target_times. - - The ``target_times`` table is examined to decide from which turbines found - in the ``reading_pathp`` which data to load. - - And the output is a ``pandas.DataFrame`` containing: - - * `turbine_id`: Unique identifier of the turbine which this reading comes from. - * `signal_id`: Unique identifier of the signal which this reading comes from. - * `timestamp`: Time where the reading took place, as an ISO formatted datetime. - * `value`: Numeric value of this reading. - - Args: - readings_path (str): - Path to the folder containing all the readings data. - target_times (pd.DataFrame or str): - target_times DataFrame or path to the target_times CSV file. - signals (list): - List of signals to load from the readings files. If not given, load - all the signals available. - window_size (str): - Rule indicating how long back before the cutoff times we have to go - when loading the data. - - Returns: - pandas.DataFrame - """ - if isinstance(target_times, pd.DataFrame): - target_times = target_times.copy() - else: - target_times = pd.read_csv(target_times) - - target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time']) - - without_duplicates = target_times.drop_duplicates(subset=['cutoff_time', 'turbine_id']) - if len(target_times) != len(without_duplicates): - raise ValueError("Duplicate rows found in target_times") - - if isinstance(signals, list): - signals = pd.DataFrame({'signal_id': signals}) - elif isinstance(signals, str): - signals = pd.read_csv(signals) - - readings = _load_readings(readings_path, target_times, signals, window_size) - LOGGER.info('Loaded %s turbine readings', len(readings)) - - return readings diff --git a/greenguard/targets.py b/greenguard/targets.py new file mode 100644 index 0000000..18106b7 --- /dev/null +++ b/greenguard/targets.py @@ -0,0 +1,155 @@ +"""Targets module. + +This module contains functions to work with target_times. 
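+
+The main input to these functions is a ``target_times`` table. As a minimal
+sketch (the values below are illustrative only), such a table can be built
+with pandas like this:
+
+    import pandas as pd
+
+    target_times = pd.DataFrame([
+        {'turbine_id': 'T001', 'cutoff_time': pd.Timestamp('2013-01-12'), 'target': 0},
+        {'turbine_id': 'T001', 'cutoff_time': pd.Timestamp('2013-03-04'), 'target': 1},
+    ])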
+"""
+
+import logging
+import warnings
+
+import numpy as np
+import pandas as pd
+from tqdm.auto import trange
+
+LOGGER = logging.getLogger(__name__)
+
+
+def make_targets(target_times, window_size, target, new_targets=None):
+    target_times = target_times.sort_values('cutoff_time', ascending=True)
+    cutoff_times = target_times.cutoff_time
+    window_size = pd.to_timedelta(window_size)
+    original_size = len(target_times)
+    current_size = original_size
+    new_targets = new_targets or current_size
+
+    for index in trange(len(cutoff_times) - 1):
+        timestamp = cutoff_times.iloc[index]
+        next_time = cutoff_times.iloc[index + 1]
+
+        if timestamp + (window_size * 2) >= next_time:
+            continue
+
+        span_start = timestamp + window_size
+        span_end = next_time - window_size
+        span_length = (span_end - span_start).total_seconds()
+
+        delay = pd.to_timedelta(np.random.randint(span_length), unit='s')
+        cutoff_time = span_start + delay
+
+        target_times = target_times.append(pd.Series({
+            'turbine_id': target_times.iloc[index].turbine_id,
+            'cutoff_time': cutoff_time,
+            'target': target
+        }), ignore_index=True)
+
+        current_size = len(target_times)
+        if current_size == original_size + new_targets:
+            return target_times.sort_values('cutoff_time', ascending=True)
+
+    if current_size == original_size:
+        warnings.warn('There is no space left between targets to add more targets.')
+        return target_times
+
+    new_targets = new_targets - (current_size - original_size)
+    return make_targets(target_times, window_size, target, new_targets)
+
+
+def _to_timedelta(specification):
+    if isinstance(specification, int):
+        specification = '{}s'.format(specification)
+
+    return pd.to_timedelta(specification)
+
+
+def make_target_times(failure_dates, step, start=None, end=None, forecast_window=0,
+                      prediction_window=0, before=0, after=0, offset=0, max_true=None,
+                      max_false=None, shuffle=True):
+
+    step = _to_timedelta(step)
+    start = start or failure_dates.date.min()
+
+    forecast_window = _to_timedelta(forecast_window)
+    prediction_window = _to_timedelta(prediction_window)
+    before = _to_timedelta(before)
+    after = _to_timedelta(after)
+    offset = _to_timedelta(offset)
+
+    target_times = pd.DataFrame()
+    turbines = failure_dates.turbine_id.unique()
+    failures = failure_dates.set_index(['turbine_id', 'date'])
+
+    for turbine in turbines:
+        turbine_failures = failures.loc[turbine]
+
+        min_failure_date = turbine_failures.index.min() - before
+        last_failure_date = turbine_failures.index.max() + after
+        turbine_targets = list()
+        while min_failure_date < last_failure_date:
+            max_failure_date = min_failure_date + prediction_window
+            day_failures = turbine_failures.loc[min_failure_date:max_failure_date]
+
+            # Record the target before advancing the window, so that the
+            # cutoff_time refers to the window that was just evaluated.
+            turbine_targets.append({
+                'turbine_id': turbine,
+                'target': int(bool(len(day_failures))),
+                'cutoff_time': min_failure_date - forecast_window
+            })
+
+            # Advance by ``step``: advancing by ``offset``, which defaults to 0,
+            # would never terminate.
+            min_failure_date = min_failure_date + step
+
+        turbine_targets = pd.DataFrame(turbine_targets)
+        failed = turbine_targets[turbine_targets.target == 1]
+        if max_true is not None:
+            failed = failed.sample(min(max_true, len(failed)))
+
+        target_times = target_times.append(failed)
+
+        non_failed = turbine_targets[turbine_targets.target == 0]
+        if max_false is not None:
+            non_failed = non_failed.sample(min(max_false, len(non_failed)))
+
+        target_times = target_times.append(non_failed)
+
+    if shuffle:
+        target_times = target_times.sample(len(target_times))
+
+    return target_times
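+
+# Example usage (a sketch with assumed inputs: ``failures`` is a hypothetical
+# table with the ``turbine_id`` and ``date`` columns that this function
+# indexes on; the window arguments accept Timedelta specifications):
+#
+#     failures = pd.DataFrame({
+#         'turbine_id': ['T001', 'T001'],
+#         'date': pd.to_datetime(['2013-01-10', '2013-02-20']),
+#     })
+#     target_times = make_target_times(failures, step='1d', forecast_window='1d',
+#                                      prediction_window='2d', max_false=10)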
+def _valid_targets(timestamps):
+    def apply_function(row):
+        cutoff = row.cutoff_time
+        try:
+            times = timestamps.loc[row.turbine_id]
+        except KeyError:
+            return False
+
+        return times['min'] < cutoff < times['max']
+
+    return apply_function
+
+
+def select_valid_targets(target_times, readings, window_size):
+    """Filter out target_times without enough data for this window_size.
+
+    The target_times table is scanned and checked against the readings table
+    considering the window_size. All the target times entries that do not
+    have enough data are dropped.
+
+    Args:
+        target_times (pandas.DataFrame):
+            Target times table, with at least turbine_id and cutoff_time fields.
+        readings (pandas.DataFrame):
+            Readings table, with at least turbine_id, signal_id and timestamp fields.
+        window_size (str or pandas.Timedelta):
+            Timedelta specification that indicates the length of the training window.
+
+    Returns:
+        pandas.DataFrame:
+            New target_times table without the invalid targets.
+    """
+    timestamps = readings.groupby('turbine_id').timestamp.agg(['min', 'max'])
+    timestamps['min'] += pd.to_timedelta(window_size)
+
+    valid = target_times.apply(_valid_targets(timestamps), axis=1)
+    valid_targets = target_times[valid].copy()
+
+    LOGGER.info('Dropped %s invalid targets', len(target_times) - len(valid_targets))
+
+    return valid_targets

From e8c65f8c2b1652c22431fd455e2a6347d9b1ae8b Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 30 Jan 2020 14:06:01 -0500
Subject: [PATCH 003/171] Add docstrings and small robustness and usability improvements

---
 greenguard/loaders/__init__.py |  30 +----
 greenguard/loaders/csv.py      |  84 ++++++++++++--
 greenguard/pipeline.py         | 196 ++++++++++++++++++++++++++++++++-
 3 files changed, 271 insertions(+), 39 deletions(-)

diff --git a/greenguard/loaders/__init__.py b/greenguard/loaders/__init__.py
index a4011fb..169c687 100644
--- a/greenguard/loaders/__init__.py
+++ b/greenguard/loaders/__init__.py
@@ -1,28 +1,6 @@
-import logging
+from greenguard.loaders.csv import CSVLoader
 
-LOGGER = logging.getLogger(__name__)
-
-
-def _valid_targets(timestamps):
-    def apply_function(row):
-        cutoff = row.cutoff_time
-        try:
-            times = timestamps.loc[row.turbine_id]
-        except KeyError:
-            return False
-
-        return times['min'] < cutoff < times['max']
-
-    return apply_function
-
-
-def select_valid_targets(target_times, readings, window_size):
-    timestamps = readings.groupby('turbine_id').timestamp.agg(['min', 'max'])
-    timestamps['min'] += window_size
-
-    valid = target_times.apply(_valid_targets(timestamps), axis=1)
-    valid_targets = target_times[valid].copy()
-
-    LOGGER.info('Dropped %s invalid targets', len(target_times) - len(valid_targets))
-
-    return valid_targets
+__all__ = (
+    'CSVLoader',
+)
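To see what the relocated select_valid_targets does end to end, here is a
minimal sketch with toy data (illustrative values; it keeps only the targets
whose cutoff time leaves a full window of readings before it):

    import pandas as pd

    from greenguard.targets import select_valid_targets

    readings = pd.DataFrame({
        'turbine_id': ['T001'] * 4,
        'signal_id': ['S01'] * 4,
        'timestamp': pd.date_range('2013-01-01', periods=4, freq='12H'),
        'value': [1.0, 2.0, 3.0, 4.0],
    })
    target_times = pd.DataFrame({
        'turbine_id': ['T001', 'T001'],
        'cutoff_time': pd.to_datetime(['2013-01-02 06:00:00', '2013-01-01 06:00:00']),
        'target': [0, 1],
    })

    # Only the first target survives: it is the only one with a full day
    # of readings before its cutoff time.
    valid = select_valid_targets(target_times, readings, window_size='1d')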
diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py
index b032ba0..1d4bf9c 100644
--- a/greenguard/loaders/csv.py
+++ b/greenguard/loaders/csv.py
@@ -4,16 +4,49 @@
 import dask
 import pandas as pd
 
+from greenguard.targets import select_valid_targets
+
 LOGGER = logging.getLogger(__name__)
 
 
 class CSVLoader:
-
-    def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=True):
+    """Load the required readings from CSV files.
+
+    The CSVLoader class is responsible for analyzing the target_times table
+    and then loading the required readings from CSV files.
+
+    Also, optionally, it can perform a resampling aggregation while loading
+    the data, reducing the memory requirements.
+
+    The CSVLoader class uses Dask to parallelize all the IO and resampling
+    computation and reduce loading times.
+
+    Args:
+        readings_path (str):
+            Path to the readings folder, where a folder exists for each turbine.
+        rule (str):
+            Resampling rule, as expected by ``DataFrame.resample``. The rule is a
+            string representation of a TimeDelta, which includes a number and a
+            unit. For example: ``3d``, ``1w``, ``6h``.
+            If ``None``, resampling is disabled.
+        aggregation (str):
+            Name of the aggregation to perform during the resampling.
+        unstack (bool):
+            Whether to unstack the resampled data, generating one column per signal.
+            Only used when resampling. Defaults to ``False``.
+    """
+
+    DEFAULT_DATETIME_FMT = '%Y-%m-%dT%H:%M:%S'
+    DEFAULT_FILENAME_FMT = '%Y-%m-.csv'
+
+    def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=False,
+                 datetime_fmt=DEFAULT_DATETIME_FMT, filename_fmt=DEFAULT_FILENAME_FMT):
         self._readings_path = readings_path
         self._rule = rule
         self._aggregation = aggregation
         self._unstack = unstack
+        self._datetime_fmt = datetime_fmt
+        self._filename_fmt = filename_fmt
 
     @dask.delayed
     def __filter_by_signal(self, readings, signals):
@@ -28,7 +61,7 @@ def __filter_by_signal(self, readings, signals):
     @dask.delayed
     def __filter_by_timestamp(self, readings, timestamps):
         LOGGER.debug('Parsing timestamps')
-        readings_ts = pd.to_datetime(readings['timestamp'], format='%m/%d/%y %H:%M:%S')
+        readings_ts = pd.to_datetime(readings['timestamp'], format=self._datetime_fmt)
         readings['timestamp'] = readings_ts
 
         LOGGER.debug('Filtering by timestamp')
@@ -76,10 +109,9 @@ def __consolidate(self, readings, turbine_id):
 
         return readings
 
-    @staticmethod
-    def _get_filenames(turbine_path, timestamps):
-        min_csv = timestamps.start.dt.strftime('%Y-%m-.csv')
-        max_csv = timestamps.stop.dt.strftime('%Y-%m-.csv')
+    def _get_filenames(self, turbine_path, timestamps):
+        min_csv = timestamps.start.dt.strftime(self._filename_fmt)
+        max_csv = timestamps.stop.dt.strftime(self._filename_fmt)
 
         for filename in sorted(os.listdir(turbine_path)):
             if ((min_csv <= filename) & (filename <= max_csv)).any():
@@ -138,7 +170,31 @@ def _get_timestamps(target_times, window_size):
             'stop': cutoff_times,
         })
 
-    def load(self, target_times, window_size, signals=None, debug=False):
+    def load(self, target_times, window_size, signals=None, debug=False, select_valid=True):
+        """Load the readings needed for the given target_times and window_size.
+
+        Optionally filter the signals that are loaded and discard the rest.
+
+        Args:
+            target_times (str or pandas.DataFrame):
+                target_times ``DataFrame`` or path to the corresponding CSV file.
+                The table must have three columns, ``turbine_id``, ``target`` and
+                ``cutoff_time``.
+            window_size (str):
+                Amount of data to load before each cutoff time, specified as a string
+                representation of a TimeDelta, which includes a number and a
+                unit. For example: ``3d``, ``1w``, ``6h``.
+            signals (list or pandas.DataFrame):
+                List of signal names or table that has a ``signal_id`` column to
+                use as the signal names list.
+            debug (bool):
+                Force single thread execution for easy debugging. Defaults to ``False``.
+            select_valid (bool):
+                Whether to drop the target_times that do not have enough readings
+                within the given window_size. Defaults to ``True``.
+
+        Returns:
+            pandas.DataFrame:
+                Table of readings for the target times, including the columns ``turbine_id``,
+                ``signal_id``, ``timestamp`` and ``value``. If ``select_valid`` is ``True``,
+                a tuple with the filtered ``target_times`` and the readings is returned
+                instead.
+        """
         if isinstance(target_times, str):
             target_times = pd.read_csv(target_times)
             target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time'])
 
         dask_scheduler = 'single-threaded' if debug else None
         computed = dask.compute(*readings, scheduler=dask_scheduler)
-        readings = pd.concat((c for c in computed if len(c)), ignore_index=True, sort=False)
+
+        found_readings = [c for c in computed if len(c)]
+        if not found_readings:
+            msg = 'No readings found for the given target times in {}'.format(self._readings_path)
+            raise ValueError(msg)
+
+        readings = pd.concat(found_readings, ignore_index=True, sort=False)
         LOGGER.info('Loaded %s turbine readings', len(readings))
 
+        if select_valid:
+            target_times = select_valid_targets(target_times, readings, window_size)
+            return target_times, readings
+
         return readings
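With these additions, loading resampled readings becomes a short snippet; a
usage sketch (the paths and folder layout are hypothetical):

    from greenguard.loaders import CSVLoader

    loader = CSVLoader('path/to/readings', rule='600s', aggregation='mean')
    target_times, readings = loader.load('path/to/target_times.csv', window_size='1d')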
diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index b4db17f..509f766 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -22,18 +22,103 @@
 PIPELINES_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'pipelines'))
 
 
-def get_pipelines():
+def get_pipelines(pattern='', path=False):
+    """Get the list of available pipelines.
+
+    Optionally filter the names using a pattern or obtain
+    the paths to the pipelines alongside their names.
+
+    Args:
+        pattern (str):
+            Pattern to search for in the pipeline names.
+        path (bool):
+            Whether to return a dictionary containing the pipeline
+            paths instead of only a list with the names.
+
+    Returns:
+        list or dict:
+            List of available and matching pipeline names.
+            If `path=True`, return a dict containing the pipeline
+            names as keys and their absolute paths as values.
+    """
     pipelines = dict()
     for filename in os.listdir(PIPELINES_DIR):
-        if filename.endswith('.json'):
+        if filename.endswith('.json') and pattern in filename:
             name = os.path.basename(filename)[:-len('.json')]
-            path = os.path.join(PIPELINES_DIR, filename)
-            pipelines[name] = path
+            pipeline_path = os.path.join(PIPELINES_DIR, filename)
+            pipelines[name] = pipeline_path
+
+    if not path:
+        pipelines = list(pipelines)
 
     return pipelines
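+
+# Example (illustrative; the actual names depend on the JSON files present
+# in the pipelines folder):
+#
+#     get_pipelines('dfs')
+#     # ['resample_600s_unstack_dfs_1d_xgb_classifier', ...]
+#
+#     get_pipelines('dfs', path=True)
+#     # {'resample_600s_unstack_dfs_1d_xgb_classifier': '/path/to/it.json', ...}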
 
 
 class GreenGuardPipeline(object):
+    """Main Machine Learning component in the GreenGuard project.
+
+    The ``GreenGuardPipeline`` represents the abstraction of a Machine
+    Learning pipeline architecture specialized on the GreenGuard data
+    format.
+
+    In order to use it, an MLBlocks pipeline template needs to be given,
+    alongside information about how to evaluate its performance using
+    cross validation.
+
+    Attributes:
+        template (MLPipeline):
+            MLPipeline instance used as the template for tuning.
+        template_name (str):
+            Name of the template being used.
+        fitted (bool):
+            Whether this GreenGuardPipeline has already been fitted or not.
+        steps (list):
+            List of primitives that compose this template.
+        preprocessing (list):
+            List of preprocessing steps. These steps have no learning stage
+            and are executed only once on the complete training dataset, before
+            partitioning it for cross validation.
+        static (list):
+            List of static steps. These are all the steps in the pipeline that
+            come after the preprocessing ones but have no hyperparameters.
+            These are executed on each cross validation split only once, when
+            the data is partitioned, and their output is cached to be reused
+            later on at every tuning iteration.
+        tunable (list):
+            List of steps that have hyperparameters and will be tuned during
+            the tuning loop.
+
+    Args:
+        template (str or MLPipeline):
+            Template to use. If a ``str`` is given, load the corresponding
+            ``MLPipeline``.
+        metric (str or function):
+            Metric to use. If a ``str`` is given, it must be one of the metrics
+            defined in the ``greenguard.metrics.METRICS`` dictionary.
+        cost (bool):
+            Whether the metric is a cost function (the lower the better) or not.
+            Defaults to ``False``.
+        init_params (dict):
+            Initial parameters to pass to the underlying MLPipeline if something
+            other than the defaults needs to be used.
+            Defaults to ``None``.
+        stratify (bool):
+            Whether to stratify the data when partitioning for cross validation.
+            Defaults to ``True``.
+        cv_splits (int):
+            Number of cross validation folds to use. Defaults to ``5``.
+        shuffle (bool):
+            Whether to shuffle the data when partitioning for cross validation.
+            Defaults to ``True``.
+        random_state (int or RandomState):
+            Random state to use for the cross validation partitioning.
+            Defaults to ``0``.
+        preprocessing (int):
+            Number of steps to execute during the preprocessing stage.
+            The number of preprocessing steps cannot be higher than the
+            number of static steps in the given template.
+            Defaults to ``0``.
+    """
 
     template = None
     template_name = None
@@ -82,6 +167,12 @@ def _update_params(old, new):
                     block_params[param] = value
 
     def set_init_params(self, init_params):
+        """Set new init params for the template and pipeline.
+
+        Args:
+            init_params (dict):
+                New init_params to use.
+        """
         template_params = self.template['init_params']
         self._update_params(template_params, init_params)
         self._build_pipeline()
@@ -140,9 +231,23 @@ def __repr__(self):
         )
 
     def get_hyperparameters(self):
+        """Get the current hyperparameters.
+
+        Returns:
+            dict:
+                Current hyperparameters.
+        """
         return deepcopy(self._hyperparameters)
 
     def set_hyperparameters(self, hyperparameters):
+        """Set new hyperparameters for this pipeline instance.
+
+        The template ``init_params`` remain unmodified.
+
+        Args:
+            hyperparameters (dict):
+                New hyperparameters to use.
+        """
         self._update_params(self._hyperparameters, hyperparameters)
         self._build_pipeline()
 
@@ -185,6 +290,35 @@ def _generate_splits(self, X, y, readings):
         return splits
 
     def cross_validate(self, X=None, y=None, readings=None, params=None):
+        """Compute the cross validation score using the given data.
+
+        If the splits have not been previously computed, compute them now.
+        During this computation, the data is partitioned using the indicated
+        cross validation parameters and later on processed using the
+        pipeline static steps.
+
+        The results of the fit and produce executions are cached and reused
+        in subsequent calls to this method.
+
+        Args:
+            X (pandas.DataFrame):
+                ``target_times`` data, without the ``target`` column.
+                Only needed if the splits have not been previously computed.
+            y (pandas.Series or numpy.ndarray):
+                ``target`` vector corresponding to the passed ``target_times``.
+                Only needed if the splits have not been previously computed.
+            readings (pandas.DataFrame):
+                ``readings`` table. Only needed if the splits have not been
+                previously computed.
+            params (dict):
+                Hyperparameter values to use.
+
+        Returns:
+            float:
+                Computed cross validation score. This score is the average
+                of the scores obtained across all the cross validation folds.
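+
+        Example (a sketch; assumes ``target_times`` and ``readings`` have
+        already been loaded, for instance with ``greenguard.demo.load_demo``,
+        and that ``pipeline`` is a ``GreenGuardPipeline`` instance):
+
+            X = target_times[['turbine_id', 'cutoff_time']]
+            y = target_times['target']
+            score = pipeline.cross_validate(X, y, readings)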
+ """ + if self._splits is None: LOGGER.info('Running static steps before cross validation') self._splits = self._generate_splits(X, y, readings) @@ -272,6 +406,21 @@ def _get_tuner(self): return tuner def tune(self, X=None, y=None, readings=None, iterations=10): + """Tune this pipeline for the indicated number of iterations. + + Args: + X (pandas.DataFrame): + ``target_times`` data, without the ``target`` column. + Only needed if the splits have not been previously computed. + y (pandas.Series or numpy.ndarray): + ``target`` vector corresponding to the passed ``target_times``. + Only needed if the splits have not been previously computed. + readings (pandas.DataFrame): + ``readings`` table. Only needed if the splits have not been + previously computed. + iterations (int): + Number of iterations to perform. + """ if not self._tuner: LOGGER.info('Scoring the default pipeline') self.cv_score = self.cross_validate(X, y, readings) @@ -303,20 +452,59 @@ def tune(self, X=None, y=None, readings=None, iterations=10): i + 1, failed) def fit(self, X, y, readings): + """Fit this pipeline to the given data. + + Args: + X (pandas.DataFrame): + ``target_times`` data, without the ``target`` column. + y (pandas.Series or numpy.ndarray): + ``target`` vector corresponding to the passed ``target_times``. + readings (pandas.DataFrame): + ``readings`` table. + """ self._pipeline.fit(X, y, readings=readings) self.fitted = True def predict(self, X, readings): + """Make predictions using this pipeline. + + Args: + X (pandas.DataFrame): + ``target_times`` data, containing the ``turbine_id`` and + the ``cutoff_time`` column. + readings (pandas.DataFrame): + ``readings`` table. + + Returns: + numpy.ndarray: + Vector of predictions. + """ if not self.fitted: raise NotFittedError() return self._pipeline.predict(X, readings=readings) def save(self, path): + """Serialize and save this pipeline using cloudpickle. + + Args: + path (str): + Path to the file where the pipeline will be saved. + """ with open(path, 'wb') as pickle_file: cloudpickle.dump(self, pickle_file) @classmethod def load(cls, path): + """Load a previously saved pipeline from a file. + + Args: + path (str): + Path to the file where the pipeline is saved. + + Returns: + GreenGuardPipeline: + Loaded GreenGuardPipeline instance. 
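+
+        Example (illustrative round trip through the filesystem):
+
+            pipeline.save('my_pipeline.pkl')
+            pipeline = GreenGuardPipeline.load('my_pipeline.pkl')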
+ """ with open(path, 'rb') as pickle_file: return cloudpickle.load(pickle_file) From 1726d77b4bb160ea22d9da3fc54d9383dd5adffd Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 30 Jan 2020 14:06:39 -0500 Subject: [PATCH 004/171] Upgrade MLPrimitives and improve pipelines --- .../{ => disabled}/dfs_xgb_classifier.json | 0 .../normalize_dfs_xgb_classifier.json | 0 .../resample_dfs_xgb_classifier.json | 0 ...resample_normalize_dfs_xgb_classifier.json | 4 +- .../resample_unstack_dfs_xgb_classifier.json | 0 ...ack_double_lstm_timeseries_classifier.json | 0 ...le_unstack_lstm_timeseries_classifier.json | 8 +- ..._unstack_normalize_dfs_xgb_classifier.json | 0 ...unstack_24_lstm_timeseries_classifier.json | 119 ++++++++++++++++++ ..._double_24_lstm_timeseries_classifier.json | 119 ++++++++++++++++++ ..._600s_normalize_dfs_1d_xgb_classifier.json | 65 ++++++++++ ...nstack_144_lstm_timeseries_classifier.json | 119 ++++++++++++++++++ ...le_600s_unstack_dfs_1d_xgb_classifier.json | 78 ++++++++++++ ...double_144_lstm_timeseries_classifier.json | 119 ++++++++++++++++++ ...stack_normalize_dfs_1d_xgb_classifier.json | 69 ++++++++++ .../unstacked_dfs_xgb_classifier.json | 0 ...ked_double_lstm_timeseries_classifier.json | 0 .../unstacked_lstm_timeseries_classifier.json | 0 ...nstacked_normalize_dfs_xgb_classifier.json | 0 setup.py | 7 +- 20 files changed, 697 insertions(+), 10 deletions(-) rename greenguard/pipelines/{ => disabled}/dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => disabled}/normalize_dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => disabled}/resample_dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => disabled}/resample_normalize_dfs_xgb_classifier.json (96%) rename greenguard/pipelines/{ => disabled}/resample_unstack_dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => disabled}/resample_unstack_double_lstm_timeseries_classifier.json (100%) rename greenguard/pipelines/{ => disabled}/resample_unstack_lstm_timeseries_classifier.json (97%) rename greenguard/pipelines/{ => disabled}/resample_unstack_normalize_dfs_xgb_classifier.json (100%) create mode 100644 greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json create mode 100644 greenguard/pipelines/resample_3600s_unstack_double_24_lstm_timeseries_classifier.json create mode 100644 greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json create mode 100644 greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json create mode 100644 greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json create mode 100644 greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json create mode 100644 greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json rename greenguard/pipelines/{ => unstacked}/unstacked_dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => unstacked}/unstacked_double_lstm_timeseries_classifier.json (100%) rename greenguard/pipelines/{ => unstacked}/unstacked_lstm_timeseries_classifier.json (100%) rename greenguard/pipelines/{ => unstacked}/unstacked_normalize_dfs_xgb_classifier.json (100%) diff --git a/greenguard/pipelines/dfs_xgb_classifier.json b/greenguard/pipelines/disabled/dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/dfs_xgb_classifier.json rename to greenguard/pipelines/disabled/dfs_xgb_classifier.json diff --git a/greenguard/pipelines/normalize_dfs_xgb_classifier.json 
b/greenguard/pipelines/disabled/normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/normalize_dfs_xgb_classifier.json rename to greenguard/pipelines/disabled/normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/resample_dfs_xgb_classifier.json b/greenguard/pipelines/disabled/resample_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/resample_dfs_xgb_classifier.json rename to greenguard/pipelines/disabled/resample_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/resample_normalize_dfs_xgb_classifier.json b/greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json similarity index 96% rename from greenguard/pipelines/resample_normalize_dfs_xgb_classifier.json rename to greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json index bf32034..3d7d4d2 100644 --- a/greenguard/pipelines/resample_normalize_dfs_xgb_classifier.json +++ b/greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json @@ -10,7 +10,7 @@ ], "init_params": { "pandas.DataFrame.resample#1": { - "rule": "1h", + "rule": "600s", "on": "timestamp", "groupby": [ "turbine_id", @@ -46,7 +46,7 @@ "copy": true, "verbose": true, "n_jobs": 1, - "training_window": "3d" + "training_window": "1d" } }, "input_names": { diff --git a/greenguard/pipelines/resample_unstack_dfs_xgb_classifier.json b/greenguard/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/resample_unstack_dfs_xgb_classifier.json rename to greenguard/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/resample_unstack_double_lstm_timeseries_classifier.json b/greenguard/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/resample_unstack_double_lstm_timeseries_classifier.json rename to greenguard/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/resample_unstack_lstm_timeseries_classifier.json b/greenguard/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json similarity index 97% rename from greenguard/pipelines/resample_unstack_lstm_timeseries_classifier.json rename to greenguard/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json index de2d1ce..e33e83b 100644 --- a/greenguard/pipelines/resample_unstack_lstm_timeseries_classifier.json +++ b/greenguard/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json @@ -14,7 +14,7 @@ ], "init_params": { "pandas.DataFrame.resample#1": { - "rule": "3600s", + "rule": "600s", "on": "timestamp", "groupby": [ "turbine_id", @@ -50,7 +50,7 @@ "key": "timestamp" }, "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 72, + "window_size": 144, "cutoff_time": "cutoff_time", "time_index": "timestamp" }, @@ -58,8 +58,8 @@ "epochs": 35, "verbose": true, "input_shape": [ - 72, - 97 + 144, + 26 ] } }, diff --git a/greenguard/pipelines/resample_unstack_normalize_dfs_xgb_classifier.json b/greenguard/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/resample_unstack_normalize_dfs_xgb_classifier.json rename to greenguard/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json 
b/greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json new file mode 100644 index 0000000..7e494d5 --- /dev/null +++ b/greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json @@ -0,0 +1,119 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.LSTMTimeSeriesClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.LSTMTimeSeriesClassifier": { + "epochs": 35, + "verbose": false + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/resample_3600s_unstack_double_24_lstm_timeseries_classifier.json b/greenguard/pipelines/resample_3600s_unstack_double_24_lstm_timeseries_classifier.json new file mode 100644 index 0000000..7f4e8a6 --- /dev/null +++ b/greenguard/pipelines/resample_3600s_unstack_double_24_lstm_timeseries_classifier.json @@ -0,0 +1,119 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.DoubleLSTMTimeSeriesClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + 
"groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.DoubleLSTMTimeSeriesClassifier": { + "epochs": 35, + "verbose": false + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json b/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json new file mode 100644 index 0000000..3d7d4d2 --- /dev/null +++ b/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json @@ -0,0 +1,65 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.normalize_entity", + "featuretools.EntitySet.normalize_entity", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": true + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.normalize_entity#1": { + "base_entity_id": "readings", + "new_entity_id": "turbines", + "index": "turbine_id", + "make_time_index": false + }, + "featuretools.EntitySet.normalize_entity#2": { + "base_entity_id": "readings", + "new_entity_id": "signals", + "index": "signal_id", + "make_time_index": false + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": true, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": 
"readings" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json b/greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json new file mode 100644 index 0000000..b54702b --- /dev/null +++ b/greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json @@ -0,0 +1,119 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.LSTMTimeSeriesClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 144, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.LSTMTimeSeriesClassifier": { + "epochs": 35, + "verbose": false + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json b/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json new file mode 100644 index 0000000..60be686 --- /dev/null +++ b/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json @@ -0,0 +1,78 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.add_relationship", + "featuretools.dfs", + 
"mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.entity_from_dataframe#2": { + "entity_id": "turbines", + "index": "turbine_id", + "make_index": false + }, + "featuretools.EntitySet.add_relationship#1": { + "parent": "turbines", + "parent_column": "turbine_id", + "child": "readings", + "child_column": "turbine_id" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": true, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#2": { + "dataframe": "turbines" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json b/greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json new file mode 100644 index 0000000..368dd4d --- /dev/null +++ b/greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json @@ -0,0 +1,119 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.DoubleLSTMTimeSeriesClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 144, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.DoubleLSTMTimeSeriesClassifier": { + "epochs": 35, + "verbose": false + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + 
"X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json b/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json new file mode 100644 index 0000000..b0550ee --- /dev/null +++ b/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json @@ -0,0 +1,69 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.normalize_entity", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.normalize_entity#1": { + "base_entity_id": "readings", + "new_entity_id": "turbines", + "index": "turbine_id", + "make_time_index": false + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": true, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/unstacked_dfs_xgb_classifier.json b/greenguard/pipelines/unstacked/unstacked_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked_dfs_xgb_classifier.json rename to greenguard/pipelines/unstacked/unstacked_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/unstacked_double_lstm_timeseries_classifier.json b/greenguard/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked_double_lstm_timeseries_classifier.json rename to greenguard/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/unstacked_lstm_timeseries_classifier.json b/greenguard/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked_lstm_timeseries_classifier.json rename to 
greenguard/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/unstacked_normalize_dfs_xgb_classifier.json b/greenguard/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked_normalize_dfs_xgb_classifier.json rename to greenguard/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json diff --git a/setup.py b/setup.py index 8f7de72..5893f14 100644 --- a/setup.py +++ b/setup.py @@ -16,9 +16,9 @@ history = '' install_requires = [ - 'baytune>=0.2.3,<0.3', 'mlblocks>=0.3.4,<0.4', - 'mlprimitives>=0.2.3,<0.3', + 'mlprimitives>=0.2.4,<0.3', + 'baytune>=0.2.3,<0.3', 'numpy>=1.15.4,<1.17', 'pymongo>=3.7.2,<4', 'scikit-learn>=0.20.1,<0.21', @@ -76,7 +76,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], @@ -98,7 +97,7 @@ long_description_content_type='text/markdown', name='greenguard', packages=find_packages(include=['greenguard', 'greenguard.*']), - python_requires='>=3.5', + python_requires='>=3.6', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, From 462b9e1ac989ae66e541a3cf0f15a15d8211061c Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 3 Feb 2020 12:57:47 -0500 Subject: [PATCH 005/171] Fix input format and add demo functions --- greenguard/__init__.py | 1 - greenguard/demo.py | 36 +++++++++++++++++-- greenguard/demo.py.new | 66 ++++++++++++++++++++++++++++++++++ greenguard/loaders/__init__.py | 1 - greenguard/loaders/csv.py | 2 +- greenguard/pipeline.py | 32 +++++++++-------- tests/test_pipeline.py | 48 ++++++++++++++----------- 7 files changed, 145 insertions(+), 41 deletions(-) create mode 100644 greenguard/demo.py.new diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 1eab417..c530d4e 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -18,5 +18,4 @@ __all__ = ( 'GreenGuardPipeline', 'get_pipelines', - 'load_demo' ) diff --git a/greenguard/demo.py b/greenguard/demo.py index 369422b..789a50d 100644 --- a/greenguard/demo.py +++ b/greenguard/demo.py @@ -27,13 +27,43 @@ def _load_or_download(filename, dates): return data -def load_demo(): +def load_demo(load_readings=True): """Load the demo included in the GreenGuard project. + The first time that this function is executed, the data will be downloaded and cached inside the `greenguard/demo` folder. Subsequent calls will load the cached data instead of downloading it again. + + Returns: + tuple[pandas.DataFrame]: + target_times and readings tables """ target_times = _load_or_download('target_times', 'cutoff_time') - readings = _load_or_download('readings', 'timestamp') + if load_readings: + readings = _load_or_download('readings', 'timestamp') + return target_times, readings + + return target_times + + +def generate_raw_readings(output_path='demo'): + """Generate raw readings based on the demo data. + + Args: + path (str): + Path where the readings will be generated. 
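+
+        The generated layout (illustrative) is one folder per turbine with
+        one CSV file per month:
+
+            demo/
+                T001/
+                    2013-01-.csv
+                    2013-02-.csv
+                    ...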
+ """ + target_times, readings = load_demo() + + for turbine_id in target_times.turbine_id.unique(): + turbine_path = os.path.join(output_path, turbine_id) + os.makedirs(turbine_path, exist_ok=True) + data = readings[readings.turbine_id == turbine_id] + for month in range(1, 13): + month_data = data[data.timestamp.dt.month == month].copy() + month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %M:%H:%S') + month_path = os.path.join(turbine_path, '2013-{:02d}-.csv'.format(month)) + LOGGER.info('Generating file %s', month_path) + month_data.to_csv(month_path, index=False) - return target_times, readings + return target_times diff --git a/greenguard/demo.py.new b/greenguard/demo.py.new new file mode 100644 index 0000000..62a9eb1 --- /dev/null +++ b/greenguard/demo.py.new @@ -0,0 +1,66 @@ +import os +import random +from datetime import datetime, timedelta + +import pandas as pd + + +def get_turbine_df(start, end, interval, signals): + data = list() + current = start + delta = timedelta(seconds=interval) + while current < end: + for signal in signals: + data.append({ + 'timestamp': current.strftime('%m/%d/%y %H:%M:%S'), + 'signal_id': signal, + 'value': random.random() + }) + + current = current + delta + + return pd.DataFrame(data)[['timestamp', 'signal_id', 'value']] + + +def generate_turbine_files(data_path, turbine_name, signals, interval): + turbine_path = os.path.join(data_path, turbine_name) + os.makedirs(turbine_path, exist_ok=True) + + for year in range(2000, 2011): + for month in range(1, 13): + start = datetime(year, month, 1) + end = datetime(year + (1 if month == 12 else 0), (month % 12) + 1, 1) + tdf = get_turbine_df(start, end, interval, signals) + + csv_path = os.path.join(turbine_path, '{}-{:02d}-.csv'.format(year, month)) + tdf.to_csv(csv_path) + +def _prefixed_range(prefix, size): + arr = pd.Series(np.arange(size) + 1).astype(str) + arr = arr.str.zfill(arr.str.len().max()) + + return prefix + arr + + +def make_demo(path='.', signals=1, turbines=1, interval=600): + signals = _prefixed_range('S', signals) + turbines = _prefixed_range('S', turbines) + readings_path = os.path.join(path, readings) + + for turbine in turbines: + generate_turbine_files(readings_path, turbine, signals, interval) + + + target_times = pd.DataFrame([ + {'turbine_id': 'T001', 'cutoff_time': datetime(2005, 1, 1), 'target': False}, + {'turbine_id': 'T001', 'cutoff_time': datetime(2007, 1, 2), 'target': True}, + {'turbine_id': 'T001', 'cutoff_time': datetime(2009, 1, 2), 'target': False}, + {'turbine_id': 'T002', 'cutoff_time': datetime(2005, 1, 1), 'target': True}, + {'turbine_id': 'T002', 'cutoff_time': datetime(2007, 1, 2), 'target': False}, + {'turbine_id': 'T002', 'cutoff_time': datetime(2009, 1, 2), 'target': True}, + {'turbine_id': 'T003', 'cutoff_time': datetime(2005, 1, 1), 'target': False}, + {'turbine_id': 'T003', 'cutoff_time': datetime(2007, 1, 2), 'target': True}, + {'turbine_id': 'T003', 'cutoff_time': datetime(2009, 1, 2), 'target': False}, + ]) + + target_times.to_csv('target_times.csv', index=False) diff --git a/greenguard/loaders/__init__.py b/greenguard/loaders/__init__.py index 169c687..0113f15 100644 --- a/greenguard/loaders/__init__.py +++ b/greenguard/loaders/__init__.py @@ -1,6 +1,5 @@ from greenguard.loaders.csv import CSVLoader - __all__ = ( 'CSVLoader', ) diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py index 1d4bf9c..a2db438 100644 --- a/greenguard/loaders/csv.py +++ b/greenguard/loaders/csv.py @@ -36,7 +36,7 @@ class CSVLoader: Only used 
diff --git a/greenguard/loaders/__init__.py b/greenguard/loaders/__init__.py
index 169c687..0113f15 100644
--- a/greenguard/loaders/__init__.py
+++ b/greenguard/loaders/__init__.py
@@ -1,6 +1,5 @@
 from greenguard.loaders.csv import CSVLoader
 
-
 __all__ = (
     'CSVLoader',
 )
diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py
index 1d4bf9c..a2db438 100644
--- a/greenguard/loaders/csv.py
+++ b/greenguard/loaders/csv.py
@@ -36,7 +36,7 @@ class CSVLoader:
             Only used
             when resampling. Defaults to ``False``.
     """
 
-    DEFAULT_DATETIME_FMT = '%Y-%m-%dT%M:%H:%S'
+    DEFAULT_DATETIME_FMT = '%m/%d/%y %H:%M:%S'
     DEFAULT_FILENAME_FMT = '%Y-%m-.csv'
 
     def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=False,
diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index 509f766..9783052 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -405,15 +405,13 @@ def _get_tuner(self):
 
         return tuner
 
-    def tune(self, X=None, y=None, readings=None, iterations=10):
+    def tune(self, target_times=None, readings=None, iterations=10):
         """Tune this pipeline for the indicated number of iterations.
 
         Args:
-            X (pandas.DataFrame):
-                ``target_times`` data, without the ``target`` column.
-                Only needed if the splits have not been previously computed.
-            y (pandas.Series or numpy.ndarray):
-                ``target`` vector corresponding to the passed ``target_times``.
+            target_times (pandas.DataFrame):
+                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
+                and ``target`` columns.
                 Only needed if the splits have not been previously computed.
             readings (pandas.DataFrame):
                 ``readings`` table. Only needed if the splits have not been
@@ -423,6 +421,8 @@
         """
         if not self._tuner:
             LOGGER.info('Scoring the default pipeline')
+            X = target_times[['turbine_id', 'cutoff_time']]
+            y = target_times['target']
             self.cv_score = self.cross_validate(X, y, readings)
 
             LOGGER.info('Default Pipeline score: %s', self.cv_score)
@@ -451,27 +451,28 @@
             LOGGER.exception("Caught an exception scoring pipeline %s with params:\n%s",
                              i + 1, failed)
 
-    def fit(self, X, y, readings):
+    def fit(self, target_times, readings):
         """Fit this pipeline to the given data.
 
         Args:
-            X (pandas.DataFrame):
-                ``target_times`` data, without the ``target`` column.
-            y (pandas.Series or numpy.ndarray):
-                ``target`` vector corresponding to the passed ``target_times``.
+            target_times (pandas.DataFrame):
+                ``target_times`` table, containing the ``turbine_id``, ``cutoff_time``
+                and ``target`` columns.
             readings (pandas.DataFrame):
                 ``readings`` table.
         """
+        X = target_times[['turbine_id', 'cutoff_time']]
+        y = target_times['target']
         self._pipeline.fit(X, y, readings=readings)
         self.fitted = True
 
-    def predict(self, X, readings):
+    def predict(self, target_times, readings):
         """Make predictions using this pipeline.
 
         Args:
-            X (pandas.DataFrame):
-                ``target_times`` data, containing the ``turbine_id`` and
-                the ``cutoff_time`` column.
+            target_times (pandas.DataFrame):
+                ``target_times`` table, containing the ``turbine_id`` and
+                ``cutoff_time`` columns.
             readings (pandas.DataFrame):
                 ``readings`` table.
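The net effect of these signature changes is that callers now pass the whole `target_times` table and the pipeline extracts `X` and `y` internally. A minimal sketch of the new call pattern, using the demo data and one of the pipeline names bundled in this patch series:

```python
from greenguard.demo import load_demo
from greenguard.pipeline import GreenGuardPipeline

target_times, readings = load_demo()

pipeline = GreenGuardPipeline('resample_600s_unstack_normalize_dfs_1d_xgb_classifier', 'f1')

# tune, fit and predict now all take the full target_times table
pipeline.tune(target_times, readings, iterations=5)
pipeline.fit(target_times, readings)
predictions = pipeline.predict(target_times, readings)
```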
@@ -482,6 +483,7 @@ def predict(self, X, readings): if not self.fitted: raise NotFittedError() + X = target_times[['turbine_id', 'cutoff_time']] return self._pipeline.predict(X, readings=readings) def save(self, path): diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 80a9167..541ad6f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -5,39 +5,47 @@ from unittest import TestCase from unittest.mock import patch -from mlblocks.discovery import find_pipelines, load_pipeline +import pandas as pd from greenguard.pipeline import GreenGuardPipeline class TestGreenGuardPipeline(TestCase): - """Tests for `TimeSeriesClassifier`.""" - PIPELINE_NAME = find_pipelines()[0] + def _get_data(self): + target_times = pd.DataFrame({ + 'turbine_id': ['T001'], + 'cutoff_time': [pd.Timestamp('2010-01-01')], + 'target': [1] + }) + readings = pd.DataFrame({ + 'turbine_id': ['T001'], + 'timestamp': [pd.Timestamp('2010-01-01')], + 'signal_id': ['S1'], + 'value': [0.1] + }) + return target_times, readings @patch('greenguard.pipeline.MLPipeline') - def test_fit(self, pipeline_class_mock): - """fit prepare the pipeline to make predictions based on the given data.""" + @patch('greenguard.pipeline.load_pipeline') + def test_fit(self, load_pipeline_mock, mlpipeline_mock): + load_pipeline_mock.return_value = dict() + # Run - instance = GreenGuardPipeline(self.PIPELINE_NAME, 'accuracy') - instance.fit('an_X', 'a_y', 'readings') + instance = GreenGuardPipeline('a_pipeline', 'accuracy') + target_times, readings = self._get_data() + instance.fit(target_times, readings) # Asserts - pipeline_mock = pipeline_class_mock.return_value - pipeline_class_mock.assert_called_once_with(load_pipeline(self.PIPELINE_NAME)) - assert instance._pipeline == pipeline_mock - - pipeline_mock.fit.assert_called_once_with('an_X', 'a_y', readings='readings') - assert instance.fitted @patch('greenguard.pipeline.MLPipeline') - def test_predict(self, pipeline_mock): - """predict produces results using the pipeline.""" + @patch('greenguard.pipeline.load_pipeline') + def test_predict(self, load_pipeline_mock, mlpipeline_mock): + load_pipeline_mock.return_value = dict() + # Run - instance = GreenGuardPipeline(self.PIPELINE_NAME, 'accuracy') + instance = GreenGuardPipeline('a_pipeline', 'accuracy') instance.fitted = True - instance.predict('an_X', 'readings') - - # Asserts - pipeline_mock.return_value.predict.assert_called_once_with('an_X', readings='readings') + target_times, readings = self._get_data() + instance.predict(target_times, readings) From 35df088a761b6312709cefcbd1a82e5d38c4a88d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 3 Feb 2020 12:58:40 -0500 Subject: [PATCH 006/171] Update readme and examples --- .dockerignore | 1 + README.md | 488 ++++---------- notebooks/CSVLoader Demo.ipynb | 683 ++++++++++++++++++++ notebooks/GreenGuard usage example.ipynb | 779 +++++++++++------------ 4 files changed, 1180 insertions(+), 771 deletions(-) create mode 100644 .dockerignore create mode 100644 notebooks/CSVLoader Demo.ipynb diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..d8e7acb --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +notebooks-private/ diff --git a/README.md b/README.md index 169386f..cc0a5e0 100644 --- a/README.md +++ b/README.md @@ -19,19 +19,19 @@ AutoML for Renewable Energy Industries. 
# GreenGuard -- Free software: MIT license +- License: [MIT](https://github.com/D3-AI/GreenGuard/blob/master/LICENSE) - Documentation: https://D3-AI.github.io/GreenGuard - Homepage: https://github.com/D3-AI/GreenGuard # Overview The GreenGuard project is a collection of end-to-end solutions for machine learning problems -commonly -found in monitoring wind energy production systems. Most tasks utilize sensor data +commonly found in monitoring wind energy production systems. Most tasks utilize sensor data emanating from monitoring systems. We utilize the foundational innovations developed for automation of machine Learning at Data to AI Lab at MIT. The salient aspects of this customized project are: + * A set of ready to use, well tested pipelines for different machine learning tasks. These are vetted through testing across multiple publicly available datasets for the same task. * An easy interface to specify the task, pipeline, and generate results and summarize them. @@ -41,16 +41,58 @@ The salient aspects of this customized project are: * A robust continuous integration and testing infrastructure. * A ``learning database`` recording all past outcomes --> tasks, pipelines, outcomes. +# Requirements + +**GreenGuard** has been developed and runs on Python 3.6 and 3.7. + +Also, although it is not strictly required, the usage of a [virtualenv]( +https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering +with other software installed in the system where you are trying to run **GreenGuard**. + +# Install + +**GreenGuard** can be installed locally using [pip](https://pip.pypa.io/en/stable/) with +the following command: + +```bash +pip install greenguard +``` + +This will pull and install the latest stable release from [PyPi](https://pypi.org/). + +If you want to install from source or contribute to the project please read the +[Contributing Guide](https://d3-ai.github.io/GreenGuard/contributing.html#get-started). + # Data Format -In order to be able to use the **GreenGuard Pipelines** to make predictions over you -time Series data, you will need to following tables, formatted as CSV files: +The input expected by the **GreenGuard** system consists of the following two elements, +which need to be passed as `pandas.DataFrame` objects: + +## Target Times + +A table containing the specification of the problem that we are solving, which has three +columns: + +* `turbine_id`: Unique identifier of the turbine which this label corresponds to. +* `cutoff_time`: Time associated with this target +* `target`: The value that we want to predict. This can either be a numerical value or a + categorical label. This column can also be skipped when preparing data that will be used + only to make predictions and not to fit any pipeline. + +| | turbine_id | cutoff_time | target | +|----|--------------|---------------------|----------| +| 0 | T1 | 2001-01-02 00:00:00 | 0 | +| 1 | T1 | 2001-01-03 00:00:00 | 1 | +| 2 | T2 | 2001-01-04 00:00:00 | 0 | + +## Readings + +A table containing the signal data from the different sensors, with the following columns: -* A **Readings** table that contains: * `turbine_id`: Unique identifier of the turbine which this reading comes from. * `signal_id`: Unique identifier of the signal which this reading comes from. - * `timestamp`: Time where the reading took place, as an ISO formatted datetime. - * `value`: Numeric value of this reading. + * `timestamp (datetime)`: Time where the reading took place, as a datetime. 
+ * `value (float)`: Numeric value of this reading. | | turbine_id | signal_id | timestamp | value | |----|--------------|-------------|---------------------|---------| @@ -67,213 +109,80 @@ time Series data, you will need to following tables, formatted as CSV files: | 10 | T1 | S2 | 2001-01-03 00:00:00 | 11 | | 11 | T1 | S2 | 2001-01-03 12:00:00 | 12 | -* A **Target times** table that contains: - * `turbine_id`: Unique identifier of the turbine which this label corresponds to. - * `cutoff_time`: Time associated with this target - * `target`: The value that we want to predict. This can either be a numerical value or a - categorical label. This column can also be skipped when preparing data that will be used - only to make predictions and not to fit any pipeline. - -| | turbine_id | cutoff_time | target | -|----|--------------|---------------------|----------| -| 0 | T1 | 2001-01-02 00:00:00 | 0 | -| 1 | T1 | 2001-01-03 00:00:00 | 1 | -| 2 | T1 | 2001-01-04 00:00:00 | 0 | - -Additionally, if available, two more tables can be passed alongside the previous ones in order -to provide additional information about the turbines and signals. - -* A **Turbines** table that contains a `turbine_id` and additional properties about each turbine - -| | turbine_id | latitude | longitude | height | manufacturer | -|----|--------------|------------|-------------|----------|----------------| -| 0 | T1 | 49.8729 | -6.44571 | 23.435 | M1 | -| 1 | T2 | 49.8729 | -6.4457 | 24.522 | M1 | -| 2 | T3 | 49.8729 | -6.44565 | 23.732 | M2 | - -* A **Signals** table that contains a `signal_id` and additional properties about each signal - -| | signal_id | sensor_type | sensor_brand | sensitivity | -|----|-------------|---------------|----------------|---------------| -| 0 | S1 | t1 | b1 | 200 | -| 1 | S2 | t2 | b2 | 500 | - -## Demo Dataset - -For development and demonstration purposes, we include a dataset with data from several telemetry -signals associated with one wind energy production turbine. - -This data, which has been already formatted as expected by the GreenGuard Pipelines, can be -browsed and downloaded directly from the -[d3-ai-greenguard AWS S3 Bucket](https://d3-ai-greenguard.s3.amazonaws.com/index.html). - -This dataset is adapted from the one used in the project by Cohen, Elliot J., -"Wind Analysis." Joint Initiative of the ECOWAS Centre for Renewable Energy and Energy Efficiency -(ECREEE), The United Nations Industrial Development Organization (UNIDO) and the Sustainable -Engineering Lab (SEL). Columbia University, 22 Aug. 2014. -[Available online here](https://github.com/Ecohen4/ECREEE) - -The complete list of manipulations performed on the original dataset to convert it into the -demo one that we are using here is exhaustively shown and explained in the -[Green Guard Demo Data notebook](notebooks/Green%20Guard%20Demo%20Data.ipynb). +## CSV Format -# Concepts - -Before diving into the software usage, we briefly explain some concepts and terminology. - -## Primitive - -We call the smallest computational blocks used in a Machine Learning process -**primitives**, which: - -* Can be either classes or functions. -* Have some initialization arguments, which MLBlocks calls `init_params`. -* Have some tunable hyperparameters, which have types and a list or range of valid values. - -## Template - -Primitives can be combined to form what we call **Templates**, which: - -* Have a list of primitives. -* Have some initialization arguments, which correspond to the initialization arguments - of their primitives. 
-* Have some tunable hyperparameters, which correspond to the tunable hyperparameters
-  of their primitives.
-
-## Pipeline
-
-Templates can be used to build **Pipelines** by taking and fixing a set of valid
-hyperparameters for a Template. Hence, Pipelines:
-
-* Have a list of primitives, which corresponds to the list of primitives of their template.
-* Have some initialization arguments, which correspond to the initialization arguments
-  of their template.
-* Have some hyperparameter values, which fall within the ranges of valid tunable
-  hyperparameters of their template.
-
-A pipeline can be fitted and evaluated using the MLPipeline API in MLBlocks.
-
-## Tuning
-
-We call tuning the process of, given a dataset and a template, find the pipeline derived from the
-given template that gets the best possible score on the given dataset.
-
-This process usually involves fitting and evaluating multiple pipelines with different hyperparameter
-values on the same data while using optimization algorithms to deduce which hyperparameters are more
-likely to get the best results in the next iterations.
-
-We call each one of these tries a **tuning iteration**.
-
-# Current tasks and pipelines
-
-In our current phase, we are addressing two tasks - time series classification and time series
-regression. To provide solutions for these two tasks we have two components.
-
-## GreenGuardPipeline
-
-This class is the one in charge of learning from the data and making predictions by building
-[MLBlocks](https://hdi-project.github.io/MLBlocks) pipelines and later on tuning them using
-[BTB](https://hdi-project.github.io/BTB/)
-
-## GreenGuardLoader
-
-A class responsible for loading the time series data from CSV files, and return it in the
-format ready to be used by the **GreenGuardPipeline**.
-
-# Install
-
-## Requirements
-
-**GreenGuard** has been developed and runs on Python 3.5, 3.6 and 3.7.
-
-Also, although it is not strictly required, the usage of a [virtualenv](https://virtualenv.pypa.io/en/latest/)
-is highly recommended in order to avoid interfering with other software installed in the system
-where you are trying to run **GreenGuard**.
-
-## Installation
-
-The simplest and recommended way to install **GreenGuard** is using pip:
-
-```bash
-pip install greenguard
-```
-
-For development, you can also clone the repository and install it from sources
-
-```bash
-git clone git@github.com:D3-AI/GreenGuard.git
-cd GreenGuard
-make install-develop
-```
+Apart from the in-memory data format explained above, which is limited by the memory
+allocation capabilities of the system where it is run, **GreenGuard** is also prepared to
+load and work with data stored as a collection of CSV files, drastically increasing the amount
+of data which it can work with. Further details about this format can be found in the
+[project documentation site](https://D3-AI.github.io/GreenGuard/).
 
 # Quickstart
 
-In this example we will load some demo data using the **GreenGuardLoader** and fetch it to the
-**GreenGuardPipeline** for it to find the best possible pipeline, fit it using the given data
-and then make predictions from it.
+In this example we will load some demo data and classify it using a **GreenGuard Pipeline**.
 
-## 1. Load and explore the data
+## 1. Load and split the demo data
 
 The first step is to load the demo data.
-For this, we will import and call the `greenguard.loader.load_demo` function without any arguments:
+For this, we will import and call the `greenguard.demo.load_demo` function without any arguments:
 
 ```python
-from greenguard.loader import load_demo
+from greenguard.demo import load_demo
 
-X, y, readings = load_demo()
+target_times, readings = load_demo()
 ```
 
 The returned objects are:
 
-`X`: A `pandas.DataFrame` with the `target_times` table data without the `target` column.
+* ``target_times``: A ``pandas.DataFrame`` with the ``target_times`` table data:
 
-```
-   turbine_id  timestamp
-0          T1 2013-01-01
-1          T1 2013-01-02
-2          T1 2013-01-03
-3          T1 2013-01-04
-4          T1 2013-01-05
-```
+  ```
+    turbine_id cutoff_time  target
+  0       T001  2013-01-12       0
+  1       T001  2013-01-13       0
+  2       T001  2013-01-14       0
+  3       T001  2013-01-15       1
+  4       T001  2013-01-16       0
+  ```
 
-`y`: A `pandas.Series` with the `target` column from the `target_times` table.
+* ``readings``: A ``pandas.DataFrame`` containing the time series data in the format explained above.
 
-```
-0    0.0
-1    0.0
-2    0.0
-3    0.0
-4    0.0
-Name: target, dtype: float64
-```
+  ```
+    turbine_id signal_id  timestamp  value
+  0       T001       S01 2013-01-10  323.0
+  1       T001       S02 2013-01-10  320.0
+  2       T001       S03 2013-01-10  284.0
+  3       T001       S04 2013-01-10  348.0
+  4       T001       S05 2013-01-10  273.0
+  ```
 
-`readings`: A `pandas.DataFrame` containing the time series data in the format explained above.
+Once we have loaded the `target_times`, and before proceeding to train any Machine Learning
+Pipeline, we will split them into two partitions for training and testing.
 
-```
-   turbine_id signal_id  timestamp  value
-0          T1        S1 2013-01-01  817.0
-1          T1        S2 2013-01-01  805.0
-2          T1        S3 2013-01-01  786.0
-3          T1        S4 2013-01-01  809.0
-4          T1        S5 2013-01-01  755.0
-```
+In this case, we will split them using the [train_test_split function from scikit-learn](
+https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html),
+but it can be done with any other suitable tool.
 
-## 2. Split the data
+```python
+from sklearn.model_selection import train_test_split
 
-If we want to split the data in train and test subsets, we can do so by splitting the
-`X` and `y` variables with any suitable tool.
+train, test = train_test_split(target_times, test_size=0.25, random_state=0)
+```
 
-In this case, we will do it using the [train_test_split function from scikit-learn](
-https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html).
+Notice how we are only splitting the `target_times` data and not the `readings`.
+This is because the pipelines will later on take care of selecting the parts of the
+`readings` table needed for the training based on the information found inside
+the `train` and `test` inputs.
 
-```python
-from sklearn.model_selection import train_test_split
+Additionally, if we want to calculate a goodness-of-fit score later on, we can separate the
+testing target values from the `test` table by popping them from it:
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
+```python
+test_targets = test.pop('target')
 ```
 
-## 3. Finding a Pipeline
+## 2. Exploring the available Pipelines
 
 Once we have the data ready, we need to find a suitable pipeline.
 
@@ -286,207 +195,68 @@ from greenguard import get_pipelines
 pipelines = get_pipelines()
 ```
 
-The returned `pipeline` variable will be `dict` containing the names of all the pipelines
-available and their paths:
-
-```
-'greenguard_classification'
-'greenguard_regression'
-```
-
-## 3. 
Finding the best Pipeline - -Once we have loaded the data, we create a **GreenGuardPipeline** instance by passing: - -* `template (string)`: the name of a template or the path to a template json file. -* `metric (string or function)`: The name of the metric to use or a metric function to use. -* `cost (bool)`: Whether the metric is a cost function to be minimized or a score to be maximized. - -Optionally, we can also pass defails about the cross validation configuration: - -* `stratify` -* `cv_splits` -* `shuffle` -* `random_state` - -In this case, we will be loading the `greenguard_classification` pipeline, using -the `accuracy` metric, and using only 2 cross validation splits: - -```python -from greenguard.pipeline import GreenGuardPipeline - -pipeline = GreenGuardPipeline( - template='greenguard_classification', - metric='f1_macro', - cv_splits=5 -) -``` - -Once we have created the pipeline, we can call its `tune` method to find the best possible -hyperparameters for our data, passing the `X`, `y`, and `readings` variables returned by the loader, -as well as an indication of the number of tuning iterations that we want to perform. - -```python -pipeline.tune(X_train, y_train, readings, iterations=10) -``` - -After the tuning process has finished, the hyperparameters have been already set in the classifier. - -We can see the found hyperparameters by calling the `get_hyperparameters` method, - -```python -pipeline.get_hyperparameters() -``` - -which will return a dictionary with the best hyperparameters found so far: - -``` -{ - "pandas.DataFrame.resample#1": { - "rule": "1D", - "time_index": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean" - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - ... -``` - -as well as the obtained cross validation score by looking at the `score` attribute of the -`pipeline` object: +The returned `pipeline` variable will be `list` containing the names of all the pipelines +available in the GreenGuard system: -```python -pipeline.score # -> 0.6447509660798626 ``` - -**NOTE**: If the score is not good enough, we can call the `tune` method again as many times -as needed and the pipeline will continue its tuning process every time based on the previous -results! - -## 4. Fitting the pipeline - -Once we are satisfied with the obtained cross validation score, we can proceed to call -the `fit` method passing again the same data elements. - -This will fit the pipeline with all the training data available using the best hyperparameters -found during the tuning process: - -```python -pipeline.fit(X_train, y_train, readings) +['resample_600s_normalize_dfs_1d_xgb_classifier', + 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier', + 'resample_600s_unstack_double_144_lstm_timeseries_classifier', + 'resample_3600s_unstack_24_lstm_timeseries_classifier', + 'resample_3600s_unstack_double_24_lstm_timeseries_classifier', + 'resample_600s_unstack_dfs_1d_xgb_classifier', + 'resample_600s_unstack_144_lstm_timeseries_classifier'] ``` -## 5. Use the fitted pipeline - -After fitting the pipeline, we are ready to make predictions on new data: +For the rest of this tutorial, we will select and use the pipeline +`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template. ```python -predictions = pipeline.predict(X_test, readings) +pipeline_name = 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier' ``` -And evaluate its prediction performance: +## 3. 
Fitting the Pipeline -```python -from sklearn.metrics import accuracy_score - -accuracy_score(y_test, predictions) # -> 0.6413043478260869 -``` - -## 6. Save and load the pipeline - -Since the tuning and fitting process takes time to execute and requires a lot of data, you -will probably want to save a fitted instance and load it later to analyze new signals -instead of fitting pipelines over and over again. +Once we have loaded the data and selected the pipeline that we will use, we have to +fit it. -This can be done by using the `save` and `load` methods from the `GreenGuardPipeline`. - -In order to save an instance, call its `save` method passing it the path and filename -where the model should be saved. +For this, we will create an instance of a `GreenGuardPipeline` object passing the name +of the pipeline that we want to use: ```python -path = 'my_pipeline.pkl' - -pipeline.save(path) -``` - -Once the pipeline is saved, it can be loaded back as a new `GreenGuardPipeline` by using the -`GreenGuardPipeline.load` method: +from greenguard.pipeline import GreenGuardPipeline -```python -new_pipeline = GreenGuardPipeline.load(path) +pipeline = GreenGuardPipeline(pipeline_name) ``` -Once loaded, it can be directly used to make predictions on new data. +And then we can directly fit it to our data by calling its `fit` method and passing in the +training `target_times` and the complete `readings` table: ```python -new_pipeline.predict(X_test, readings) +pipeline.fit(train, readings) ``` +## 4. Make predictions -# Use your own Dataset - -Once you are familiar with the **GreenGuardPipeline** usage, you will probably want to run it -on your own dataset. - -Here are the necessary steps: - -## 1. Prepare the data - -Firt of all, you will need to prepare your data as 4 CSV files like the ones described in the -[data format](#data-format) section above. - -## 2. Create a GreenGuardLoader - -Once you have the CSV files ready, you will need to import the `greenguard.loader.GreenGuardLoader` -class and create an instance passing: - -* `path - str`: The path to the folder where the 4 CSV files are -* `target_times - str, gptional`: The name of the target table. Defaults to `target_times`. -* `target_column - str, optional`: The name of the target column. Defaults to `target`. -* `readings - str, optional`: The name of the readings table. Defaults to `readings`. -* `turbines - str, optional`: The name of the turbines table. Defaults to `None`. -* `signals - str, optional`: The name of the signals table. Defaults to `None`. -* `gzip - bool, optional`: Set to True if the CSV files are gzipped. Defaults to False. - -For example, here we will be loading a custom dataset which has been sorted in gzip format -inside the `my_dataset` folder, and for which the target table has a different name: +After fitting the pipeline, we are ready to make predictions on new data by calling the +`pipeline.predict` method passing the testing `target_times` and, again, the complete +`readings` table. ```python -from greenguard.loader import GreenGuardLoader - -loader = GreenGuardLoader(path='my_dataset', target='labels', gzip=True) +predictions = pipeline.predict(test, readings) ``` -## 3. Call the loader.load method. +## 5. Evaluate the goodness-of-fit -Once the `loader` instance has been created, we can call its `load` method: +Finally, after making predictions we can evaluate how good the prediction was +using any suitable metric. 
```python -X, y, tables = loader.load() -``` +from sklearn.metrics import f1_score -Optionally, if the dataset contains only data to make predictions and the `target` column -does not exist, we can pass it the argument `False` to skip it: - -```python -X, readings = loader.load(target=False) +f1_score(test_targets, predictions) ``` - -# Docker Usage - -**GreenGuard** comes configured and ready to be distributed and run as a docker image which starts -a jupyter notebook already configured to use greenguard, with all the required dependencies already -installed. - -For more details about how to run GreenGuard over docker, please check the [DOCKER.md](DOCKER.md) -documentation. - ## What's next? For more details about **GreenGuard** and all its possibilities and features, please check the diff --git a/notebooks/CSVLoader Demo.ipynb b/notebooks/CSVLoader Demo.ipynb new file mode 100644 index 0000000..66dab0b --- /dev/null +++ b/notebooks/CSVLoader Demo.ipynb @@ -0,0 +1,683 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import logging;\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "logging.getLogger().setLevel(level=logging.INFO)\n", + "\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-03 12:49:43,764 - INFO - demo - Generating file raw_demo/T001/2013-01-.csv\n", + "2020-02-03 12:49:44,277 - INFO - demo - Generating file raw_demo/T001/2013-02-.csv\n", + "2020-02-03 12:49:44,810 - INFO - demo - Generating file raw_demo/T001/2013-03-.csv\n", + "2020-02-03 12:49:45,345 - INFO - demo - Generating file raw_demo/T001/2013-04-.csv\n", + "2020-02-03 12:49:45,885 - INFO - demo - Generating file raw_demo/T001/2013-05-.csv\n", + "2020-02-03 12:49:46,417 - INFO - demo - Generating file raw_demo/T001/2013-06-.csv\n", + "2020-02-03 12:49:46,954 - INFO - demo - Generating file raw_demo/T001/2013-07-.csv\n", + "2020-02-03 12:49:47,492 - INFO - demo - Generating file raw_demo/T001/2013-08-.csv\n", + "2020-02-03 12:49:48,017 - INFO - demo - Generating file raw_demo/T001/2013-09-.csv\n", + "2020-02-03 12:49:48,543 - INFO - demo - Generating file raw_demo/T001/2013-10-.csv\n", + "2020-02-03 12:49:49,094 - INFO - demo - Generating file raw_demo/T001/2013-11-.csv\n", + "2020-02-03 12:49:49,606 - INFO - demo - Generating file raw_demo/T001/2013-12-.csv\n" + ] + } + ], + "source": [ + "from greenguard.demo import generate_raw_readings\n", + "\n", + "target_times = generate_raw_readings('raw_demo')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353, 3)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.3002832861189802" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.target.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "turbine_id object\n", + "cutoff_time datetime64[ns]\n", + "target int64\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from greenguard.loaders import CSVLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-03 12:50:11,263 - INFO - csv - Loaded 1306052 readings from turbine T001\n", + "2020-02-03 12:50:11,275 - INFO - csv - Loaded 1306052 turbine readings\n", + "2020-02-03 12:50:11,500 - INFO - targets - Dropped 2 invalid targets\n" + ] + } + ], + "source": [ + "import numpy as np\n", + "\n", + "csv_loader = CSVLoader('raw_demo')\n", + "target_times, readings = csv_loader.load(target_times, '1d')" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1306052, 4)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-11209.0
1T001S022013-01-11193.0
2T001S032013-01-11177.0
3T001S042013-01-11188.0
4T001S052013-01-11150.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-11 209.0\n", + "1 T001 S02 2013-01-11 193.0\n", + "2 T001 S03 2013-01-11 177.0\n", + "3 T001 S04 2013-01-11 188.0\n", + "4 T001 S05 2013-01-11 150.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "turbine_id object\n", + "signal_id object\n", + "timestamp datetime64[ns]\n", + "value float64\n", + "dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(351, 3)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-03 12:50:31,423 - INFO - csv - Loaded 1306052 readings from turbine T001\n", + "2020-02-03 12:50:31,427 - INFO - csv - Resampling: 4h - mean\n", + "2020-02-03 12:50:31,689 - INFO - csv - Loaded 2119 turbine readings\n", + "2020-02-03 12:50:31,843 - INFO - targets - Dropped 14 invalid targets\n" + ] + } + ], + "source": [ + "csv_loader = CSVLoader('raw_demo', rule='4h', aggregation='mean', unstack=True)\n", + "target_times, readings = csv_loader.load(target_times, '15d')" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(2119, 28)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00253.041667268.250000268.041667297.166667234.666667261.916667206.7916673.198335e+06...9.0791673.134510e+0642.41666744.95833344.83333349.62500039.20833343.83333334.625293.166667
1T0012013-01-10 04:00:00572.083333555.291667538.666667592.291667557.166667534.000000544.2500003.199514e+06...10.8375003.142505e+0662.08333362.50000063.62500063.54166761.33333362.54166754.000421.208333
2T0012013-01-10 08:00:00688.791667696.791667706.625000750.791667714.250000683.333333658.1666673.201449e+06...12.7541673.155809e+0692.20833394.95833394.66666797.33333394.12500093.58333386.375638.291667
3T0012013-01-10 12:00:00396.333333418.500000415.791667438.541667382.250000364.666667320.3333333.203319e+06...10.9166673.168640e+0655.75000060.08333358.58333361.29166752.79166752.79166744.000376.125000
4T0012013-01-10 16:00:00390.458333408.875000409.500000458.000000415.583333363.000000364.4583333.204504e+06...10.4125003.176672e+0649.95833353.87500054.45833356.75000052.70833346.70833347.625354.750000
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", + "0 T001 2013-01-10 00:00:00 253.041667 268.250000 268.041667 \n", + "1 T001 2013-01-10 04:00:00 572.083333 555.291667 538.666667 \n", + "2 T001 2013-01-10 08:00:00 688.791667 696.791667 706.625000 \n", + "3 T001 2013-01-10 12:00:00 396.333333 418.500000 415.791667 \n", + "4 T001 2013-01-10 16:00:00 390.458333 408.875000 409.500000 \n", + "\n", + " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", + "0 297.166667 234.666667 261.916667 206.791667 3.198335e+06 ... \n", + "1 592.291667 557.166667 534.000000 544.250000 3.199514e+06 ... \n", + "2 750.791667 714.250000 683.333333 658.166667 3.201449e+06 ... \n", + "3 438.541667 382.250000 364.666667 320.333333 3.203319e+06 ... \n", + "4 458.000000 415.583333 363.000000 364.458333 3.204504e+06 ... \n", + "\n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 9.079167 3.134510e+06 42.416667 44.958333 44.833333 49.625000 \n", + "1 10.837500 3.142505e+06 62.083333 62.500000 63.625000 63.541667 \n", + "2 12.754167 3.155809e+06 92.208333 94.958333 94.666667 97.333333 \n", + "3 10.916667 3.168640e+06 55.750000 60.083333 58.583333 61.291667 \n", + "4 10.412500 3.176672e+06 49.958333 53.875000 54.458333 56.750000 \n", + "\n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 39.208333 43.833333 34.625 293.166667 \n", + "1 61.333333 62.541667 54.000 421.208333 \n", + "2 94.125000 93.583333 86.375 638.291667 \n", + "3 52.791667 52.791667 44.000 376.125000 \n", + "4 52.708333 46.708333 47.625 354.750000 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(337, 3)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.shape" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/GreenGuard usage example.ipynb b/notebooks/GreenGuard usage example.ipynb index a764885..e912d19 100644 --- a/notebooks/GreenGuard usage example.ipynb +++ b/notebooks/GreenGuard usage example.ipynb @@ -37,8 +37,8 @@ "\n", "The first step is to load the data that we are going to use.\n", "\n", - "In order to use the demo data included in GreenGuard, the `greenguard.loader.load_demo`\n", - "can be used." + "In order to use the demo data included in GreenGuard, the `greenguard.load_demo`\n", + "function can be used." ] }, { @@ -47,9 +47,17 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard import load_demo\n", + "from greenguard.demo import load_demo\n", "\n", - "X, y, readings = load_demo()" + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will download some demo data from our S3 demo Bucket and load it as\n", + "the necessary `target_times` and `readings` tables." 
] }, { @@ -80,45 +88,51 @@ " \n", " turbine_id\n", " cutoff_time\n", + " target\n", " \n", " \n", " \n", " \n", " 0\n", - " GRID1\n", - " 2013-01-01\n", + " T001\n", + " 2013-01-12\n", + " 0\n", " \n", " \n", " 1\n", - " GRID1\n", - " 2013-01-02\n", + " T001\n", + " 2013-01-13\n", + " 0\n", " \n", " \n", " 2\n", - " GRID1\n", - " 2013-01-03\n", + " T001\n", + " 2013-01-14\n", + " 0\n", " \n", " \n", " 3\n", - " GRID1\n", - " 2013-01-04\n", + " T001\n", + " 2013-01-15\n", + " 1\n", " \n", " \n", " 4\n", - " GRID1\n", - " 2013-01-05\n", + " T001\n", + " 2013-01-16\n", + " 0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id cutoff_time\n", - "0 GRID1 2013-01-01\n", - "1 GRID1 2013-01-02\n", - "2 GRID1 2013-01-03\n", - "3 GRID1 2013-01-04\n", - "4 GRID1 2013-01-05" + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" ] }, "execution_count": 3, @@ -127,7 +141,7 @@ } ], "source": [ - "X.head()" + "target_times.head()" ] }, { @@ -138,12 +152,7 @@ { "data": { "text/plain": [ - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - "Name: target, dtype: int64" + "(353, 3)" ] }, "execution_count": 4, @@ -152,7 +161,7 @@ } ], "source": [ - "y.head()" + "target_times.shape" ] }, { @@ -162,78 +171,11 @@ "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0GRID1WTG01_possible_power_avg2013-01-01817.0
1GRID1WTG01_total_active_power2013-01-013109970.0
2GRID1WTG02_possible_power_avg2013-01-01805.0
3GRID1WTG02_total_active_power2013-01-01609852.0
4GRID1WTG03_possible_power_avg2013-01-01786.0
\n", - "
" - ], "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 GRID1 WTG01_possible_power_avg 2013-01-01 817.0\n", - "1 GRID1 WTG01_total_active_power 2013-01-01 3109970.0\n", - "2 GRID1 WTG02_possible_power_avg 2013-01-01 805.0\n", - "3 GRID1 WTG02_total_active_power 2013-01-01 609852.0\n", - "4 GRID1 WTG03_possible_power_avg 2013-01-01 786.0" + "turbine_id object\n", + "cutoff_time datetime64[ns]\n", + "target int64\n", + "dtype: object" ] }, "execution_count": 5, @@ -242,58 +184,13 @@ } ], "source": [ - "readings.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alternatively, if you want to load your own dataset, the `GreenGuardLoader` class can be used.\n", - "\n", - "For example, in order to load the data from the folder where we just downloaded the demo data\n", - "we can use this commands:" + "target_times.dtypes" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [ - "from greenguard.loader import GreenGuardLoader\n", - "\n", - "loader = GreenGuardLoader('../greenguard/demo', gzip=True)\n", - "\n", - "X, y, tables = loader.load()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For further details about the GreenGuardLoder options please check the corresponding\n", - "[API Reference page in the docs](https://d3-ai.github.io/GreenGuard/api/greenguard.loader.html#greenguard.loader.GreenGuardLoader)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The output of either of the previous commands is:\n", - "\n", - "* `X`: A pandas.DataFrame with the contents of the\n", - " target table.\n", - "* `y`: A pandas.Series with the contents of\n", - " the target column.\n", - "* `tables`: A dictionary containing the readings, turbines and\n", - " signals tables as pandas.DataFrames." 
- ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, "outputs": [ { "data": { @@ -317,259 +214,283 @@ " \n", " \n", " turbine_id\n", - " cutoff_time\n", + " signal_id\n", + " timestamp\n", + " value\n", " \n", " \n", " \n", " \n", " 0\n", - " GRID1\n", - " 2013-01-01\n", + " T001\n", + " S01\n", + " 2013-01-10\n", + " 323.0\n", " \n", " \n", " 1\n", - " GRID1\n", - " 2013-01-02\n", + " T001\n", + " S02\n", + " 2013-01-10\n", + " 320.0\n", " \n", " \n", " 2\n", - " GRID1\n", - " 2013-01-03\n", + " T001\n", + " S03\n", + " 2013-01-10\n", + " 284.0\n", " \n", " \n", " 3\n", - " GRID1\n", - " 2013-01-04\n", + " T001\n", + " S04\n", + " 2013-01-10\n", + " 348.0\n", " \n", " \n", " 4\n", - " GRID1\n", - " 2013-01-05\n", + " T001\n", + " S05\n", + " 2013-01-10\n", + " 273.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id cutoff_time\n", - "0 GRID1 2013-01-01\n", - "1 GRID1 2013-01-02\n", - "2 GRID1 2013-01-03\n", - "3 GRID1 2013-01-04\n", - "4 GRID1 2013-01-05" + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "X.head()" + "readings.head()" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 0\n", - "1 0\n", - "2 0\n", - "3 0\n", - "4 0\n", - "Name: target, dtype: int64" + "(1313540, 4)" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "y.head()" + "readings.shape" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "dict_keys(['readings'])" + "turbine_id object\n", + "signal_id object\n", + "timestamp datetime64[ns]\n", + "value float64\n", + "dtype: object" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tables.keys()" + "readings.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load your own Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, if you want to load your own dataset, all you have to do is load the\n", + "`target_times` and `readings` tables as `pandas.DataFrame` objects.\n", + "\n", + "Make sure to parse the corresponding datetime fields!" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "#import pandas as pd\n", + "\n", + "#target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n", + "#readings = pd.read_csv('path/to/your/readings.csv', parse_dates=['timestamp'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Split the data\n", + "\n", + "Once we have loaded the `target_times` and before proceeding to training any Machine Learning\n", + "Pipeline, we will have split them in 2 partitions for training and testing.\n", + "\n", + "In this case, we will split them using the [train_test_split function from scikit-learn](\n", + "https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html),/n", + "but it can be done with any other suitable tool." 
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "train, test = train_test_split(target_times, test_size=0.25, random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Finding a Template\n", + "\n", + "The next step will be to select a template from the ones available in\n", + "GreenGuard.\n", + "\n", + "For this, we can use the `greenguard.get_pipelines` function, which will\n", + "return us the list of all the available MLBlocks pipelines found in the\n", + "GreenGuard system." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0GRID1WTG01_possible_power_avg2013-01-01817.0
1GRID1WTG01_total_active_power2013-01-013109970.0
2GRID1WTG02_possible_power_avg2013-01-01805.0
3GRID1WTG02_total_active_power2013-01-01609852.0
4GRID1WTG03_possible_power_avg2013-01-01786.0
\n", - "
" - ], "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 GRID1 WTG01_possible_power_avg 2013-01-01 817.0\n", - "1 GRID1 WTG01_total_active_power 2013-01-01 3109970.0\n", - "2 GRID1 WTG02_possible_power_avg 2013-01-01 805.0\n", - "3 GRID1 WTG02_total_active_power 2013-01-01 609852.0\n", - "4 GRID1 WTG03_possible_power_avg 2013-01-01 786.0" + "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_double_144_lstm_timeseries_classifier',\n", + " 'resample_3600s_unstack_24_lstm_timeseries_classifier',\n", + " 'resample_3600s_unstack_double_24_lstm_timeseries_classifier',\n", + " 'resample_600s_unstack_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_144_lstm_timeseries_classifier']" ] }, - "execution_count": 10, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "tables['readings'].head()" + "from greenguard import get_pipelines\n", + "\n", + "get_pipelines()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Split the data\n", - "\n", - "If we want to split the data in train and test subsets, we can do so by splitting\n", - "the X and y variables with any suitable tool.\n", - "\n", - "In this case, we will do it using the `train_test_split` function from scikit-learn." + "Optionally, we can pass a string to select the pipelines that contain it:" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 12, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_dfs_1d_xgb_classifier']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "from sklearn.model_selection import train_test_split\n", - "\n", - "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)" + "get_pipelines('dfs')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Additionally, we can pass the keyword `path=True` to obtain a dictionary containing\n", + "also the path to the pipelines instead of only the list of names." 
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'timeseries_classification': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/timeseries_classification.json',\n", - " 'resample_dfs_xgb_classification': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_dfs_xgb_classification.json',\n", - " 'resample_normalize_dfs_xgb_classification': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_normalize_dfs_xgb_classification.json',\n", - " 'greenguard_regression': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/greenguard_regression.json',\n", - " 'greenguard_classification': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/greenguard_classification.json'}" + "{'resample_600s_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json'}" ] }, - "execution_count": 7, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from greenguard import get_pipelines\n", - "\n", - "get_pipelines()" + "get_pipelines('dfs', path=True)" ] }, { - "cell_type": "code", - "execution_count": 8, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "from greenguard import GreenGuardPipeline\n", - "\n", - "pipeline = GreenGuardPipeline('resample_dfs_xgb_classification', 'f1')" + "For the rest of this tutorial, we will select and use the pipeline\n", + "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ - "pipeline.fit(X_train, y_train, readings)" + "template = 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'" ] }, { @@ -589,21 +510,18 @@ "* `stratify`\n", "* `cv_splits`\n", "* `shuffle`\n", - "* `random_state`\n", - "\n", - "In this case, we will be loading the `greenguard_classification` pipeline, using\n", - "the `accuracy` metric, and using only 2 cross validation splits:" + "* `random_state`" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "from greenguard.pipeline import GreenGuardPipeline\n", "\n", - "pipeline = GreenGuardPipeline(template='greenguard_classification', metric='accuracy', cv_splits=2)" + "pipeline = GreenGuardPipeline(template, metric='f1', cv_splits=3)" ] }, { @@ -611,29 +529,59 @@ "metadata": {}, "source": [ "Once we have created the pipeline, we can call its `tune` method to find the best possible\n", - "hyperparameters for our data, passing the `X`, `y`, and `tables` variables returned by the loader,\n", + "hyperparameters for our data, passing the `target_times` and `readings` variables,\n", "as well as an indication of the number of tuning iterations that we want to perform." 
] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2019-06-18 11:43:31,518 - INFO - pipeline - Scoring the default pipeline\n", - "2019-06-18 11:45:46,250 - INFO - pipeline - Default Pipeline score: 0.6447509660798626\n", - "2019-06-18 11:45:46,252 - INFO - pipeline - Scoring pipeline 1\n", - "2019-06-18 11:45:46,253 - INFO - gp - Using Uniform sampler as user specified r_minimum threshold is not met to start the GP based learning\n", - "2019-06-18 11:48:23,348 - INFO - pipeline - Pipeline 1 score: 0.6813278231000429\n" + "2020-02-03 12:51:46,145 - INFO - pipeline - Scoring the default pipeline\n", + "2020-02-03 12:51:46,147 - INFO - pipeline - Running static steps before cross validation\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 165 features\n", + "Elapsed: 00:32 | Progress: 100%|██████████\n", + "Elapsed: 00:16 | Progress: 100%|██████████\n", + "Built 165 features\n", + "Elapsed: 00:33 | Progress: 100%|██████████\n", + "Elapsed: 00:15 | Progress: 100%|██████████\n", + "Built 165 features\n", + "Elapsed: 00:31 | Progress: 100%|██████████\n", + "Elapsed: 00:15 | Progress: 100%|██████████\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-03 12:54:14,195 - INFO - pipeline - Default Pipeline score: 0.605187908496732\n", + "2020-02-03 12:54:14,196 - INFO - pipeline - Scoring pipeline 1\n", + "2020-02-03 12:54:14,199 - INFO - gp - Using Uniform sampler as user specified r_minimum threshold is not met to start the GP based learning\n", + "2020-02-03 12:54:14,380 - INFO - pipeline - Pipeline 1 score: 0.5976760567286199\n", + "2020-02-03 12:54:14,385 - INFO - pipeline - Scoring pipeline 2\n", + "2020-02-03 12:54:14,888 - INFO - pipeline - Pipeline 2 score: 0.5965798320999443\n", + "2020-02-03 12:54:14,890 - INFO - pipeline - Scoring pipeline 3\n", + "2020-02-03 12:54:15,313 - INFO - pipeline - Pipeline 3 score: 0.6431783902372138\n", + "2020-02-03 12:54:15,316 - INFO - pipeline - Scoring pipeline 4\n", + "2020-02-03 12:54:15,729 - INFO - pipeline - Pipeline 4 score: 0.5642664541017163\n", + "2020-02-03 12:54:15,731 - INFO - pipeline - Scoring pipeline 5\n", + "2020-02-03 12:54:15,883 - INFO - pipeline - Pipeline 5 score: 0.5859328579916815\n" ] } ], "source": [ - "pipeline.tune(X_train, y_train, tables, iterations=1)" + "pipeline.tune(target_times, readings, iterations=5)" ] }, { @@ -648,162 +596,145 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'pandas.DataFrame.resample#1': {},\n", - " 'pandas.DataFrame.unstack#1': {},\n", - " 'featuretools.EntitySet.entity_from_dataframe#1': {},\n", - " 'featuretools.EntitySet.entity_from_dataframe#2': {},\n", - " 'featuretools.EntitySet.entity_from_dataframe#3': {},\n", - " 'featuretools.EntitySet.add_relationship#1': {},\n", - " 'featuretools.dfs#1': {'max_depth': {'type': 'int',\n", - " 'default': 1,\n", - " 'range': [1, 3],\n", - " 'values': None},\n", - " 'remove_low_information': {'type': 'bool', 'default': True}},\n", - " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': {'type': 'int',\n", - " 'default': 0,\n", - " 'range': [0, 100]}},\n", - " 'sklearn.impute.SimpleImputer#1': {'strategy': {'type': 'str',\n", - " 'default': 'mean',\n", - " 'values': ['mean', 'median', 'most_frequent', 'constant']}},\n", - " 
'sklearn.preprocessing.StandardScaler#1': {'with_mean': {'type': 'bool',\n", - " 'default': True},\n", - " 'with_std': {'type': 'bool', 'default': True}},\n", - " 'xgboost.XGBClassifier#1': {'n_estimators': {'type': 'int',\n", - " 'default': 100,\n", - " 'range': [10, 1000]},\n", - " 'max_depth': {'type': 'int', 'default': 3, 'range': [3, 10]},\n", - " 'learning_rate': {'type': 'float', 'default': 0.1, 'range': [0, 1]},\n", - " 'gamma': {'type': 'float', 'default': 0, 'range': [0, 1]},\n", - " 'min_child_weight': {'type': 'int', 'default': 1, 'range': [1, 10]}}}" + "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 28},\n", + " 'xgboost.XGBClassifier#1': {'n_estimators': 549,\n", + " 'max_depth': 3,\n", + " 'learning_rate': 0.09499856413762053,\n", + " 'gamma': 0.48809516357182936,\n", + " 'min_child_weight': 7}}" ] }, - "execution_count": 14, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pipeline._pipeline.get_tunable_hyperparameters()" + "pipeline.get_hyperparameters()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also see the obtained cross validation score by looking at the `cv_score` attribute of the\n", + "`pipeline` object:" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'pandas.DataFrame.resample#1': {'rule': '1D',\n", - " 'time_index': 'timestamp',\n", - " 'groupby': ['turbine_id', 'signal_id'],\n", - " 'aggregation': 'mean'},\n", - " 'pandas.DataFrame.unstack#1': {'level': 'signal_id', 'reset_index': True},\n", - " 'featuretools.EntitySet.entity_from_dataframe#1': {'entityset_id': 'entityset',\n", - " 'entity_id': 'readings',\n", - " 'index': 'index',\n", - " 'variable_types': None,\n", - " 'make_index': True,\n", - " 'time_index': 'timestamp',\n", - " 'secondary_time_index': None,\n", - " 'already_sorted': False},\n", - " 'featuretools.EntitySet.entity_from_dataframe#2': {'entityset_id': 'entityset',\n", - " 'entity_id': 'turbines',\n", - " 'index': 'turbine_id',\n", - " 'variable_types': None,\n", - " 'make_index': False,\n", - " 'time_index': None,\n", - " 'secondary_time_index': None,\n", - " 'already_sorted': False},\n", - " 'featuretools.EntitySet.entity_from_dataframe#3': {'entityset_id': 'entityset',\n", - " 'entity_id': 'signals',\n", - " 'index': 'signal_id',\n", - " 'variable_types': None,\n", - " 'make_index': False,\n", - " 'time_index': None,\n", - " 'secondary_time_index': None,\n", - " 'already_sorted': False},\n", - " 'featuretools.EntitySet.add_relationship#1': {'parent': 'turbines',\n", - " 'parent_column': 'turbine_id',\n", - " 'child': 'readings',\n", - " 'child_column': 'turbine_id'},\n", - " 'featuretools.dfs#1': {'target_entity': 'turbines',\n", - " 'index': 'turbine_id',\n", - " 'time_index': 'timestamp',\n", - " 'agg_primitives': None,\n", - " 'trans_primitives': None,\n", - " 'copy': False,\n", - " 'encode': False,\n", - " 'max_depth': 3,\n", - " 'remove_low_information': True},\n", - " 'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'copy': True,\n", - " 'features': 'auto',\n", - " 'max_labels': 23},\n", - " 'sklearn.impute.SimpleImputer#1': {'missing_values': nan,\n", - " 'fill_value': None,\n", - " 'verbose': False,\n", - " 'copy': True,\n", - " 'strategy': 'constant'},\n", - " 'sklearn.preprocessing.StandardScaler#1': {'with_mean': True,\n", - " 'with_std': False},\n", - " 'xgboost.XGBClassifier#1': {'n_jobs': -1,\n", - " 
'n_estimators': 353,\n", - " 'max_depth': 4,\n", - " 'learning_rate': 0.6150792206840879,\n", - " 'gamma': 0.46831924909241274,\n", - " 'min_child_weight': 3}}" + "0.6431783902372138" ] }, - "execution_count": 12, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pipeline.get_hyperparameters()" + "pipeline.cv_score" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also see the obtained cross validation score by looking at the `score` attribute of the\n", - "`pipeline` object:" + "**NOTE**: If the score is not good enough, we can call the `tune` method again as many times\n", + "as needed and the pipeline will continue its tuning process every time based on the previous\n", + "results!" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-03 12:54:15,971 - INFO - pipeline - Scoring pipeline 1\n", + "2020-02-03 12:54:16,421 - INFO - pipeline - Pipeline 1 score: 0.6220467704338674\n", + "2020-02-03 12:54:16,423 - INFO - pipeline - Scoring pipeline 2\n", + "2020-02-03 12:54:16,795 - INFO - pipeline - Pipeline 2 score: 0.5867369345630215\n", + "2020-02-03 12:54:16,797 - INFO - pipeline - Scoring pipeline 3\n", + "2020-02-03 12:54:17,227 - INFO - pipeline - Pipeline 3 score: 0.6161616161616162\n", + "2020-02-03 12:54:17,229 - INFO - pipeline - Scoring pipeline 4\n", + "2020-02-03 12:54:17,725 - INFO - pipeline - Pipeline 4 score: 0.6037324896256047\n", + "2020-02-03 12:54:17,727 - INFO - pipeline - Scoring pipeline 5\n", + "2020-02-03 12:54:18,287 - INFO - pipeline - Pipeline 5 score: 0.6169717350045217\n", + "2020-02-03 12:54:18,288 - INFO - pipeline - Scoring pipeline 6\n", + "2020-02-03 12:54:18,744 - INFO - pipeline - Pipeline 6 score: 0.639102564102564\n", + "2020-02-03 12:54:18,746 - INFO - pipeline - Scoring pipeline 7\n", + "2020-02-03 12:54:19,171 - INFO - pipeline - Pipeline 7 score: 0.6724889262202695\n", + "2020-02-03 12:54:19,174 - INFO - pipeline - Scoring pipeline 8\n", + "2020-02-03 12:54:19,627 - INFO - pipeline - Pipeline 8 score: 0.628250663400694\n", + "2020-02-03 12:54:19,629 - INFO - pipeline - Scoring pipeline 9\n", + "2020-02-03 12:54:20,250 - INFO - pipeline - Pipeline 9 score: 0.656191724941725\n", + "2020-02-03 12:54:20,253 - INFO - pipeline - Scoring pipeline 10\n", + "2020-02-03 12:54:20,799 - INFO - pipeline - Pipeline 10 score: 0.639014073371284\n" + ] + } + ], + "source": [ + "pipeline.tune(target_times, readings, iterations=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.659349506225848" + "0.6724889262202695" ] }, - "execution_count": 13, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pipeline.score" + "pipeline.cv_score" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 21, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 35},\n", + " 'xgboost.XGBClassifier#1': {'n_estimators': 542,\n", + " 'max_depth': 9,\n", + " 'learning_rate': 0.8024814826871371,\n", + " 'gamma': 0.8891378840299992,\n", + " 'min_child_weight': 10}}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "**NOTE**: If the score is not good enough, we can call the `tune` method again as many 
times\n", - "as needed and the pipeline will continue its tuning process every time based on the previous\n", - "results!" + "pipeline.get_hyperparameters()" ] }, { @@ -821,11 +752,20 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 165 features\n", + "Elapsed: 00:35 | Progress: 100%|██████████\n" + ] + } + ], "source": [ - "pipeline.fit(X_train, y_train, tables)" + "pipeline.fit(train, readings)" ] }, { @@ -839,11 +779,19 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elapsed: 00:11 | Progress: 100%|██████████\n" + ] + } + ], "source": [ - "predictions = pipeline.predict(X_test, tables)" + "predictions = pipeline.predict(test, readings)" ] }, { @@ -855,24 +803,24 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6413043478260869" + "0.7058823529411765" ] }, - "execution_count": 18, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "from sklearn.metrics import accuracy_score\n", + "from sklearn.metrics import f1_score\n", "\n", - "accuracy_score(y_test, predictions)" + "f1_score(test['target'], predictions)" ] }, { @@ -893,7 +841,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -912,7 +860,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -928,22 +876,29 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 27, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elapsed: 00:11 | Progress: 100%|██████████\n" + ] + }, { "data": { "text/plain": [ - "array([1., 0., 0., 0., 0.])" + "array([0, 0, 0, 1, 0])" ] }, - "execution_count": 21, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "predictions = new_pipeline.predict(X_test, tables)\n", + "predictions = new_pipeline.predict(test, readings)\n", "predictions[0:5]" ] } @@ -964,7 +919,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.9" } }, "nbformat": 4, From a36a02446f1ae44fe5ef1cdf0d512b698f4d9e02 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 7 Feb 2020 11:09:17 -0500 Subject: [PATCH 007/171] Update notebooks and demo loading --- greenguard/demo.py | 2 +- greenguard/demo.py.new | 66 -------- greenguard/pipeline.py | 11 +- notebooks/CSVLoader Demo.ipynb | 183 +++++++++++++++++++++-- notebooks/GreenGuard usage example.ipynb | 55 ++++--- 5 files changed, 216 insertions(+), 101 deletions(-) delete mode 100644 greenguard/demo.py.new diff --git a/greenguard/demo.py b/greenguard/demo.py index 789a50d..bae6a64 100644 --- a/greenguard/demo.py +++ b/greenguard/demo.py @@ -64,6 +64,6 @@ def generate_raw_readings(output_path='demo'): month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %M:%H:%S') month_path = os.path.join(turbine_path, '2013-{:02d}-.csv'.format(month)) LOGGER.info('Generating file %s', month_path) - month_data.to_csv(month_path, index=False) + month_data[['signal_id', 'timestamp', 'value']].to_csv(month_path, index=False) return target_times diff --git 
a/greenguard/demo.py.new b/greenguard/demo.py.new deleted file mode 100644 index 62a9eb1..0000000 --- a/greenguard/demo.py.new +++ /dev/null @@ -1,66 +0,0 @@ -import os -import random -from datetime import datetime, timedelta - -import pandas as pd - - -def get_turbine_df(start, end, interval, signals): - data = list() - current = start - delta = timedelta(seconds=interval) - while current < end: - for signal in signals: - data.append({ - 'timestamp': current.strftime('%m/%d/%y %H:%M:%S'), - 'signal_id': signal, - 'value': random.random() - }) - - current = current + delta - - return pd.DataFrame(data)[['timestamp', 'signal_id', 'value']] - - -def generate_turbine_files(data_path, turbine_name, signals, interval): - turbine_path = os.path.join(data_path, turbine_name) - os.makedirs(turbine_path, exist_ok=True) - - for year in range(2000, 2011): - for month in range(1, 13): - start = datetime(year, month, 1) - end = datetime(year + (1 if month == 12 else 0), (month % 12) + 1, 1) - tdf = get_turbine_df(start, end, interval, signals) - - csv_path = os.path.join(turbine_path, '{}-{:02d}-.csv'.format(year, month)) - tdf.to_csv(csv_path) - -def _prefixed_range(prefix, size): - arr = pd.Series(np.arange(size) + 1).astype(str) - arr = arr.str.zfill(arr.str.len().max()) - - return prefix + arr - - -def make_demo(path='.', signals=1, turbines=1, interval=600): - signals = _prefixed_range('S', signals) - turbines = _prefixed_range('S', turbines) - readings_path = os.path.join(path, readings) - - for turbine in turbines: - generate_turbine_files(readings_path, turbine, signals, interval) - - - target_times = pd.DataFrame([ - {'turbine_id': 'T001', 'cutoff_time': datetime(2005, 1, 1), 'target': False}, - {'turbine_id': 'T001', 'cutoff_time': datetime(2007, 1, 2), 'target': True}, - {'turbine_id': 'T001', 'cutoff_time': datetime(2009, 1, 2), 'target': False}, - {'turbine_id': 'T002', 'cutoff_time': datetime(2005, 1, 1), 'target': True}, - {'turbine_id': 'T002', 'cutoff_time': datetime(2007, 1, 2), 'target': False}, - {'turbine_id': 'T002', 'cutoff_time': datetime(2009, 1, 2), 'target': True}, - {'turbine_id': 'T003', 'cutoff_time': datetime(2005, 1, 1), 'target': False}, - {'turbine_id': 'T003', 'cutoff_time': datetime(2007, 1, 2), 'target': True}, - {'turbine_id': 'T003', 'cutoff_time': datetime(2009, 1, 2), 'target': False}, - ]) - - target_times.to_csv('target_times.csv', index=False) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 9783052..dd70e8a 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -22,7 +22,7 @@ PIPELINES_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'pipelines')) -def get_pipelines(pattern='', path=False): +def get_pipelines(pattern='', path=False, unstacked=False): """Get the list of available pipelines. Optionally filter the names using a patter or obtain @@ -34,6 +34,9 @@ def get_pipelines(pattern='', path=False): path (bool): Whether to return a dictionary containing the pipeline paths instead of only a list with the names. + unstacked (bool): + Whether to load the pipelines that expect the readings + to be already unstacked by signal_id. Defaults to ``False``. Return: list or dict: @@ -42,7 +45,11 @@ def get_pipelines(pattern='', path=False): names as keys and their absolute paths as values. 
""" pipelines = dict() - for filename in os.listdir(PIPELINES_DIR): + pipelines_dir = PIPELINES_DIR + if unstacked: + pipelines_dir = os.path.join(pipelines_dir, 'unstacked') + + for filename in os.listdir(pipelines_dir): if filename.endswith('.json') and pattern in filename: name = os.path.basename(filename)[:-len('.json')] pipeline_path = os.path.join(PIPELINES_DIR, filename) diff --git a/notebooks/CSVLoader Demo.ipynb b/notebooks/CSVLoader Demo.ipynb index 66dab0b..4710596 100644 --- a/notebooks/CSVLoader Demo.ipynb +++ b/notebooks/CSVLoader Demo.ipynb @@ -1,8 +1,39 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CSVLoader Demo\n", + "\n", + "This notebook shows how to use the CSVLoader class to load readings from a folder\n", + "containing readings in the raw format.\n", + "\n", + "Details about the raw readings format can be found in the documentation site.\n", + "\n", + "In this notebook we will:\n", + "\n", + "- Generate a folder with readings in the raw format based on the demo data\n", + "- Load the redings needed for our target times\n", + "- Explore different options from the CSVLoader\n", + "- Load a pipeline and use it on the loaded data\n", + "- Load the readings in the unstacked format\n", + "- Load an unstacked pipeline and use it on the loaded data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Setup the logging\n", + "\n", + "This step sets up logging in our environment to increase our visibility over\n", + "the steps that GreenGuard performs." + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -15,9 +46,19 @@ "warnings.simplefilter(\"ignore\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Generate Raw Readings\n", + "\n", + "The first step will be to execute the `generate_raw_readings` function, which will create a\n", + "folder in the indicated path and populate it with the raw version of the demo readings." 
+ ] + }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": { "scrolled": true }, @@ -26,25 +67,137 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-03 12:49:43,764 - INFO - demo - Generating file raw_demo/T001/2013-01-.csv\n", - "2020-02-03 12:49:44,277 - INFO - demo - Generating file raw_demo/T001/2013-02-.csv\n", - "2020-02-03 12:49:44,810 - INFO - demo - Generating file raw_demo/T001/2013-03-.csv\n", - "2020-02-03 12:49:45,345 - INFO - demo - Generating file raw_demo/T001/2013-04-.csv\n", - "2020-02-03 12:49:45,885 - INFO - demo - Generating file raw_demo/T001/2013-05-.csv\n", - "2020-02-03 12:49:46,417 - INFO - demo - Generating file raw_demo/T001/2013-06-.csv\n", - "2020-02-03 12:49:46,954 - INFO - demo - Generating file raw_demo/T001/2013-07-.csv\n", - "2020-02-03 12:49:47,492 - INFO - demo - Generating file raw_demo/T001/2013-08-.csv\n", - "2020-02-03 12:49:48,017 - INFO - demo - Generating file raw_demo/T001/2013-09-.csv\n", - "2020-02-03 12:49:48,543 - INFO - demo - Generating file raw_demo/T001/2013-10-.csv\n", - "2020-02-03 12:49:49,094 - INFO - demo - Generating file raw_demo/T001/2013-11-.csv\n", - "2020-02-03 12:49:49,606 - INFO - demo - Generating file raw_demo/T001/2013-12-.csv\n" + "2020-02-03 14:26:17,008 - INFO - demo - Generating file raw_readings/T001/2013-01-.csv\n", + "2020-02-03 14:26:17,518 - INFO - demo - Generating file raw_readings/T001/2013-02-.csv\n", + "2020-02-03 14:26:18,045 - INFO - demo - Generating file raw_readings/T001/2013-03-.csv\n", + "2020-02-03 14:26:18,580 - INFO - demo - Generating file raw_readings/T001/2013-04-.csv\n", + "2020-02-03 14:26:19,118 - INFO - demo - Generating file raw_readings/T001/2013-05-.csv\n", + "2020-02-03 14:26:19,668 - INFO - demo - Generating file raw_readings/T001/2013-06-.csv\n", + "2020-02-03 14:26:20,219 - INFO - demo - Generating file raw_readings/T001/2013-07-.csv\n", + "2020-02-03 14:26:20,753 - INFO - demo - Generating file raw_readings/T001/2013-08-.csv\n", + "2020-02-03 14:26:21,304 - INFO - demo - Generating file raw_readings/T001/2013-09-.csv\n", + "2020-02-03 14:26:21,852 - INFO - demo - Generating file raw_readings/T001/2013-10-.csv\n", + "2020-02-03 14:26:22,388 - INFO - demo - Generating file raw_readings/T001/2013-11-.csv\n", + "2020-02-03 14:26:22,931 - INFO - demo - Generating file raw_readings/T001/2013-12-.csv\n" ] } ], "source": [ "from greenguard.demo import generate_raw_readings\n", "\n", - "target_times = generate_raw_readings('raw_demo')" + "readings_path = 'raw_readings'\n", + "\n", + "target_times = generate_raw_readings(readings_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function will generate a set of reading files in the raw format.\n", + "\n", + "We will load one of them to explore it:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "readings_sample = pd.read_csv('raw_readings/T001/2013-01-.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 01/10/13 00:00:00 323.0\n", + "1 T001 S02 01/10/13 00:00:00 320.0\n", + "2 T001 S03 01/10/13 00:00:00 284.0\n", + "3 T001 S04 01/10/13 00:00:00 348.0\n", + "4 T001 S05 01/10/13 00:00:00 273.0" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings_sample.head()" ] }, { diff --git a/notebooks/GreenGuard usage example.ipynb b/notebooks/GreenGuard usage example.ipynb index e912d19..c3179c0 100644 --- a/notebooks/GreenGuard usage example.ipynb +++ b/notebooks/GreenGuard usage example.ipynb @@ -11,7 +11,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook shows how to use GreenGuard to fit a pipeline and later on use it to make predictions on new data and evaluate the pipeline performance." + "This notebook shows how to use GreenGuard to:\n", + "\n", + "- Load some demo data\n", + "- Find available pipelines and load one as a template\n", + "- Tune the template arguments to generate the optimal pipeline\n", + "- Fit the pipeline to our data\n", + "- Make predictions using the pipeline\n", + "- Evaluate the goodness-of-fit" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Setup the logging\n", + "\n", + "This step sets up logging in our environment to increase our visibility over\n", + "the steps that GreenGuard performs." ] }, { @@ -37,8 +54,7 @@ "\n", "The first step is to load the data that we are going to use.\n", "\n", - "In order to use the demo data included in GreenGuard, the `greenguard.load_demo`\n", - "function can be used." + "In order to use the demo data included in GreenGuard, the `greenguard.demo.load_demo` function can be used." ] }, { @@ -57,7 +73,9 @@ "metadata": {}, "source": [ "This will download some demo data from our S3 demo Bucket and load it as\n", - "the necessary `target_times` and `readings` tables." + "the necessary `target_times` and `readings` tables.\n", + "\n", + "The exact format of these tables is described in the GreenGuard README and docs:" ] }, { @@ -335,19 +353,14 @@ "Alternatively, if you want to load your own dataset, all you have to do is load the\n", "`target_times` and `readings` tables as `pandas.DataFrame` objects.\n", "\n", - "Make sure to parse the corresponding datetime fields!" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "#import pandas as pd\n", + "Make sure to parse the corresponding datetime fields!\n", + "\n", + "```python\n", + "import pandas as pd\n", "\n", - "#target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n", - "#readings = pd.read_csv('path/to/your/readings.csv', parse_dates=['timestamp'])" + "target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n", + "readings = pd.read_csv('path/to/your/readings.csv', parse_dates=['timestamp'])\n", + "```" ] }, { @@ -481,7 +494,15 @@ "metadata": {}, "source": [ "For the rest of this tutorial, we will select and use the pipeline\n", - "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template." 
+ "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template.\n", + "\n", + "This templates contains the following steps:\n", + "\n", + "- Resample the data using a 10 minute average aggregation\n", + "- Unstack the data by signal, so each signal is in a different column\n", + "- Normalize the Turbine IDs into a new table to assist DFS aggregations\n", + "- Use DFS on the readings based on the target_times cutoff times using a 1d window size\n", + "- Apply an XGBoost Classifier" ] }, { From 1b2cd20d083092e9c0b62f3fefb3f9272d2102a0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 7 Feb 2020 11:12:03 -0500 Subject: [PATCH 008/171] Remove unused import --- greenguard/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index c530d4e..1d98a94 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -8,7 +8,6 @@ import os -from greenguard.demo import load_demo from greenguard.pipeline import GreenGuardPipeline, get_pipelines _BASE_PATH = os.path.abspath(os.path.dirname(__file__)) From daa5276ee13c8eeb6650612e82aa818abdbfbc4a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 7 Feb 2020 11:19:29 -0500 Subject: [PATCH 009/171] re-enable py35 --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5893f14..ab0d545 100644 --- a/setup.py +++ b/setup.py @@ -76,6 +76,7 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], @@ -97,7 +98,7 @@ long_description_content_type='text/markdown', name='greenguard', packages=find_packages(include=['greenguard', 'greenguard.*']), - python_requires='>=3.6', + python_requires='>=3.5', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, From e9b4b56e168c60602e534b236fb1c14939265fda Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 10 Feb 2020 19:56:01 -0500 Subject: [PATCH 010/171] Add tutorials and documentation --- .gitignore | 1 + README.md | 19 +- docs/advanced_usage/concepts.md | 56 + docs/advanced_usage/csv.md | 54 + docs/advanced_usage/docker.md | 107 ++ docs/index.rst | 8 + greenguard/loaders/csv.py | 2 +- greenguard/pipeline.py | 34 +- ...e.ipynb => 1. GreenGuard Quickstart.ipynb} | 7 +- notebooks/2. Extract Readings.ipynb | 1214 +++++++++++++++++ notebooks/CSVLoader Demo.ipynb | 836 ------------ 11 files changed, 1484 insertions(+), 854 deletions(-) create mode 100644 docs/advanced_usage/concepts.md create mode 100644 docs/advanced_usage/csv.md create mode 100644 docs/advanced_usage/docker.md rename notebooks/{GreenGuard usage example.ipynb => 1. GreenGuard Quickstart.ipynb} (99%) create mode 100644 notebooks/2. 
Extract Readings.ipynb delete mode 100644 notebooks/CSVLoader Demo.ipynb diff --git a/.gitignore b/.gitignore index 1184ed0..fc59bb2 100644 --- a/.gitignore +++ b/.gitignore @@ -106,6 +106,7 @@ ENV/ .*.swp greenguard/demo/ +notebooks/ notebooks-private/ scripts/ dask-worker-space/ diff --git a/README.md b/README.md index cc0a5e0..361b3bb 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ If you want to install from source or contribute to the project please read the # Data Format -The input expected by the **GreenGuard** system consists of the following two elements, +The minimum input expected by the **GreenGuard** system consists of the following two elements, which need to be passed as `pandas.DataFrame` objects: ## Target Times @@ -109,13 +109,24 @@ A table containing the signal data from the different sensors, with the followin | 10 | T1 | S2 | 2001-01-03 00:00:00 | 11 | | 11 | T1 | S2 | 2001-01-03 12:00:00 | 12 | +## Turbines + +Optionally, a third table can be added containing metadata about the turbines. +The only requirement for this table is to have a `turbine_id` field, and it can have +an arbitraty number of additional fields. + +| | turbine_id | manufacturer | ... | ... | ... | +|----|--------------|----------------|-------|-------|-------| +| 0 | T1 | Siemens | ... | ... | ... | +| 1 | T2 | Siemens | ... | ... | ... | + ## CSV Format A part from the in-memory data format explained above, which is limited by the memory allocation capabilities of the system where it is run, **GreenGuard** is also prepared to load and work with data stored as a collection of CSV files, drastically increasing the amount of data which it can work with. Further details about this format can be found in the -[project documentation site](https://D3-AI.github.io/GreenGuard/). +[project documentation site](https://d3-ai.github.io/GreenGuard/advanced_usage/csv.html). # Quickstart @@ -260,4 +271,6 @@ f1_score(test_targets, predictions) ## What's next? For more details about **GreenGuard** and all its possibilities and features, please check the -[project documentation site](https://D3-AI.github.io/GreenGuard/)! +[project documentation site](https://D3-AI.github.io/GreenGuard/) +Also do not forget to have a look at the [notebook tutorials]( +https://github.com/D3-AI/GreenGuard/tree/master/notebooks)! diff --git a/docs/advanced_usage/concepts.md b/docs/advanced_usage/concepts.md new file mode 100644 index 0000000..302d34a --- /dev/null +++ b/docs/advanced_usage/concepts.md @@ -0,0 +1,56 @@ +# Concepts + +Here we briefly explain some of the concepts and terminology used within the GreenGuard +project and documentation. + +## Primitive + +We call the smallest computational blocks used in a Machine Learning process +**primitives**, which: + +* Can be either classes or functions. +* Have some initialization arguments, which MLBlocks calls `init_params`. +* Have some tunable hyperparameters, which have types and a list or range of valid values. + +## Template + +Primitives can be combined to form what we call **Templates**, which: + +* Have a list of primitives. +* Have some initialization arguments, which correspond to the initialization arguments + of their primitives. +* Have some tunable hyperparameters, which correspond to the tunable hyperparameters + of their primitives. + +## Pipeline + +Templates can be used to build **Pipelines** by taking and fixing a set of valid +hyperparameters for a Template. 
Hence, Pipelines: + +* Have a list of primitives, which corresponds to the list of primitives of their template. +* Have some initialization arguments, which correspond to the initialization arguments + of their template. +* Have some hyperparameter values, which fall within the ranges of valid tunable + hyperparameters of their template. + +A pipeline can be fitted and evaluated directly using [MLBlocks]( +https://hdi-project.github.io/MLBlocks), or using the **GreenGuardPipeline**. + +## Tuning + +We call tuning the process of, given a dataset and a template, finding the pipeline derived from +the template that gets the best possible score on the dataset. + +This process usually involves fitting and evaluating multiple pipelines with different +hyperparameter configurations on the same data while using optimization algorithms to deduce +which hyperparameters are more likely to get the best results in the next iterations. + +We call each one of these evaluations a **tuning iteration**. + +## GreenGuardPipeline + +This class is the one in charge of loading the **MLBlocks Pipelines** configured in the +system and use them to learn from the data and make predictions. + +This class is also responsible for tuning the pipeline hyperparameters using [BTB]( +https://hdi-project.github.io/BTB/) diff --git a/docs/advanced_usage/csv.md b/docs/advanced_usage/csv.md new file mode 100644 index 0000000..c020832 --- /dev/null +++ b/docs/advanced_usage/csv.md @@ -0,0 +1,54 @@ +# CSV Format + +As explained in a previous section, the input expected by the **GreenGuard** system consists of +two tables which need to be passed as `pandas.DataFrame` objects: + +* The `target_times` table, which containing the specification of the problem that we are solving + in the form of training examples with a `turbine_id`, a `cutoff_time` and a `target` value. +* The `readings` table, which contains the signal readings from the different sensors, with + `turbine_id`, `signal_id`, `timestamp` and `value` fields. + +However, in most scenarios the size of the available will far exceed the memory limitations +of the system on which **GreenGuard** is being run, so loading all the data in a single +`pandas.DataFrame` will not be possible. + +In order to solve this situation, **GreenGuard** provides a [CSVLoader]( +https://d3-ai.github.io/GreenGuard/api/greenguard.loaders.csv.html#greenguard.loaders.csv.CSVLoader) +class which can be used to load data from what we call the **Raw Data Format**. + +## Raw Data Format + +The **Raw Data Format** consists on a collection of CSV files stored in a single folder with the +following structure: + +* All the data from all the turbines is inside a single folder, which here we will call `readings`. +* Inside the `readings` folder, one folder exists for each turbine, named exactly like the turbine: + * `readings/T001` + * `readings/T002` + * ... +* Inside each turbine folder one CSV file exists for each month, named `%Y-%m-.csv`. + * `readings/T001/2010-01-.csv` + * `readings/T001/2010-02-.csv` + * `readings/T001/2010-03-.csv` + * ... +* Each CSV file contains three columns: + * `signal_id`: name or id of the signal. + * ``timestamp``: timestamp of the reading formatted as ``%m/%d/%y %H:%M:%S``. + * `value`: value of the reading. 
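+
+A folder that follows this convention can be loaded with the `CSVLoader` class
+mentioned above. Here is a minimal sketch of the usage (assuming that the folder
+is called `readings` and that a `target_times` table has already been loaded as
+a `pandas.DataFrame`):
+
+```python
+from greenguard.loaders import CSVLoader
+
+# Point the loader at the folder that contains one folder per turbine
+csv_loader = CSVLoader('readings')
+
+# Keep only the readings inside a 1 day window before each cutoff time
+target_times, readings = csv_loader.load(target_times, '1d')
+```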
+ +This is an example of what a CSV contents look like: + +| | signal_id | timestamp | value | +|----|-------------|-------------------|---------| +| 0 | S1 | 01/01/01 00:00:00 | 1 | +| 1 | S1 | 01/01/01 12:00:00 | 2 | +| 2 | S1 | 01/02/01 00:00:00 | 3 | +| 3 | S1 | 01/02/01 12:00:00 | 4 | +| 4 | S1 | 01/03/01 00:00:00 | 5 | +| 5 | S1 | 01/03/01 12:00:00 | 6 | +| 6 | S2 | 01/01/01 00:00:00 | 7 | +| 7 | S2 | 01/01/01 12:00:00 | 8 | +| 8 | S2 | 01/02/01 00:00:00 | 9 | +| 9 | S2 | 01/02/01 12:00:00 | 10 | +| 10 | S2 | 01/03/01 00:00:00 | 11 | +| 11 | S2 | 01/03/01 12:00:00 | 12 | diff --git a/docs/advanced_usage/docker.md b/docs/advanced_usage/docker.md new file mode 100644 index 0000000..e5603df --- /dev/null +++ b/docs/advanced_usage/docker.md @@ -0,0 +1,107 @@ +# Docker Usage + +**GreenGuard** comes configured and ready to be distributed and run as a docker image which starts +a jupyter notebook already configured to use greenguard, with all the required dependencies already +installed. + +## Requirements + +The only requirement in order to run the GreenGuard Docker image is to have Docker installed and +that the user has enough permissions to run it. + +Installation instructions for any possible system compatible can be found [here](https://docs.docker.com/install/) + +Additionally, the system that builds the GreenGuard Docker image will also need to have a working +internet connection that allows downloading the base image and the additional python depenedencies. + +## Building the GreenGuard Docker Image + +After having cloned the **GreenGuard** repository, all you have to do in order to build the GreenGuard Docker +Image is running this command: + +```bash +make docker-jupyter-build +``` + +After a few minutes, the new image, called `greenguard-jupyter`, will have been built into the system +and will be ready to be used or distributed. + +## Distributing the GreenGuard Docker Image + +Once the `greenguard-jupyter` image is built, it can be distributed in several ways. + +### Distributing using a Docker registry + +The simplest way to distribute the recently created image is [using a registry](https://docs.docker.com/registry/). + +In order to do so, we will need to have write access to a public or private registry (remember to +[login](https://docs.docker.com/engine/reference/commandline/login/)!) and execute these commands: + +```bash +docker tag greenguard-jupyter:latest your-registry-name:some-tag +docker push your-registry-name:some-tag +``` + +Afterwards, in the receiving machine: + +```bash +docker pull your-registry-name:some-tag +docker tag your-registry-name:some-tag greenguard-jupyter:latest +``` + +### Distributing as a file + +If the distribution of the image has to be done offline for any reason, it can be achieved +using the following command. + +In the system that already has the image: + +```bash +docker save --output greenguard-jupyter.tar greenguard-jupyter +``` + +Then copy over the file `greenguard-jupyter.tar` to the new system and there, run: + +```bash +docker load --input greenguard-jupyter.tar +``` + +After these commands, the `greenguard-jupyter` image should be available and ready to be used in the +new system. + + +## Running the greenguard-jupyter image + +Once the `greenguard-jupyter` image has been built, pulled or loaded, it is ready to be run. 
+ +This can be done in two ways: + +### Running greenguard-jupyter with the code + +If the GreenGuard source code is available in the system, running the image is as simple as running +this command from within the root of the project: + +```bash +make docker-jupyter-run +``` + +This will start a jupyter notebook using the docker image, which you can access by pointing your +browser at http://127.0.0.1:8888 + +In this case, the local version of the project will also mounted within the Docker container, +which means that any changes that you do in your local code will immediately be available +within your notebooks, and that any notebook that you create within jupyter will also show +up in your `notebooks` folder! + +### Running greenguard-jupyter without the greenguard code + +If the GreenGuard source code is not available in the system and only the Docker Image is, you can +still run the image by using this command: + +```bash +docker run -ti -p8888:8888 greenguard-jupyter +``` + +In this case, the code changes and the notebooks that you create within jupyter will stay +inside the container and you will only be able to access and download them through the +jupyter interface. diff --git a/docs/index.rst b/docs/index.rst index ab088e6..a654f0e 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,6 +6,14 @@ Overview +.. toctree:: + :caption: Advanced Usage + :hidden: + + advanced_usage/concepts + advanced_usage/csv + advanced_usage/docker + .. toctree:: :caption: Resources :hidden: diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py index a2db438..5fec885 100644 --- a/greenguard/loaders/csv.py +++ b/greenguard/loaders/csv.py @@ -103,7 +103,7 @@ def __consolidate(self, readings, turbine_id): signals = readings[readings['value'].str.isnumeric()].signal_id.unique() raise ValueError('Signals contain non-numerical values: {}'.format(signals)) - readings['turbine_id'] = turbine_id + readings.insert(0, 'turbine_id', turbine_id) LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index dd70e8a..7437a4a 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -268,16 +268,18 @@ def _is_better(self, score): return score > self.cv_score - def _generate_splits(self, X, y, readings): + def _generate_splits(self, X, y, readings, turbines=None): if self._preprocessing: pipeline = MLPipeline(self.template) LOGGER.debug('Running %s preprocessing steps', self._preprocessing) - context = pipeline.fit(X=X, y=y, readings=readings, output_=self._preprocessing - 1) + context = pipeline.fit(X=X, y=y, readings=readings, + turbines=turbines, output_=self._preprocessing - 1) del context['X'] del context['y'] else: context = { - 'readings': readings + 'readings': readings, + 'turbines': turbines, } splits = list() @@ -296,7 +298,7 @@ def _generate_splits(self, X, y, readings): return splits - def cross_validate(self, X=None, y=None, readings=None, params=None): + def cross_validate(self, X=None, y=None, readings=None, turbines=None, params=None): """Compute cross validation score using the given data. If the splits have not been previously computed, compute them now. @@ -317,6 +319,9 @@ def cross_validate(self, X=None, y=None, readings=None, params=None): readings (pandas.DataFrame): ``readings`` table. Only needed if the splits have not been previously computed. + turbines (pandas.DataFrame): + ``turbines`` table. Only needed if the splits have not been + previously computed. 
params (dict): hyperparameter values to use. @@ -328,7 +333,7 @@ def cross_validate(self, X=None, y=None, readings=None, params=None): if self._splits is None: LOGGER.info('Running static steps before cross validation') - self._splits = self._generate_splits(X, y, readings) + self._splits = self._generate_splits(X, y, readings, turbines) scores = [] for fold, pipeline, fit, predict, y_test in self._splits: @@ -412,7 +417,7 @@ def _get_tuner(self): return tuner - def tune(self, target_times=None, readings=None, iterations=10): + def tune(self, target_times=None, readings=None, turbines=None, iterations=10): """Tune this pipeline for the indicated number of iterations. Args: @@ -423,6 +428,9 @@ def tune(self, target_times=None, readings=None, iterations=10): readings (pandas.DataFrame): ``readings`` table. Only needed if the splits have not been previously computed. + turbines (pandas.DataFrame): + ``turbines`` table. Only needed if the splits have not been + previously computed. iterations (int): Number of iterations to perform. """ @@ -430,7 +438,7 @@ def tune(self, target_times=None, readings=None, iterations=10): LOGGER.info('Scoring the default pipeline') X = target_times[['turbine_id', 'cutoff_time']] y = target_times['target'] - self.cv_score = self.cross_validate(X, y, readings) + self.cv_score = self.cross_validate(X, y, readings, turbines) LOGGER.info('Default Pipeline score: %s', self.cv_score) @@ -458,7 +466,7 @@ def tune(self, target_times=None, readings=None, iterations=10): LOGGER.exception("Caught an exception scoring pipeline %s with params:\n%s", i + 1, failed) - def fit(self, target_times, readings): + def fit(self, target_times, readings, turbines=None): """Fit this pipeline to the given data. Args: @@ -467,13 +475,15 @@ def fit(self, target_times, readings): and ``target`` columns. readings (pandas.DataFrame): ``readings`` table. + turbines (pandas.DataFrame): + ``turbines`` table. """ X = target_times[['turbine_id', 'cutoff_time']] y = target_times['target'] - self._pipeline.fit(X, y, readings=readings) + self._pipeline.fit(X, y, readings=readings, turbines=turbines) self.fitted = True - def predict(self, target_times, readings): + def predict(self, target_times, readings, turbines=None): """Make predictions using this pipeline. Args: @@ -482,6 +492,8 @@ def predict(self, target_times, readings): and ``target`` columns. readings (pandas.DataFrame): ``readings`` table. + turbines (pandas.DataFrame): + ``turbines`` table. Returns: numpy.ndarray: @@ -491,7 +503,7 @@ def predict(self, target_times, readings): raise NotFittedError() X = target_times[['turbine_id', 'cutoff_time']] - return self._pipeline.predict(X, readings=readings) + return self._pipeline.predict(X, readings=readings, turbines=turbines) def save(self, path): """Serialize and save this pipeline using cloudpickle. diff --git a/notebooks/GreenGuard usage example.ipynb b/notebooks/1. GreenGuard Quickstart.ipynb similarity index 99% rename from notebooks/GreenGuard usage example.ipynb rename to notebooks/1. GreenGuard Quickstart.ipynb index c3179c0..a32b494 100644 --- a/notebooks/GreenGuard usage example.ipynb +++ b/notebooks/1. 
GreenGuard Quickstart.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# GreenGuard usage example" + "# GreenGuard Quickstart" ] }, { @@ -13,7 +13,7 @@ "source": [ "This notebook shows how to use GreenGuard to:\n", "\n", - "- Load some demo data\n", + "- Load demo data\n", "- Find available pipelines and load one as a template\n", "- Tune the template arguments to generate the optimal pipeline\n", "- Fit the pipeline to our data\n", @@ -72,7 +72,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This will download some demo data from our S3 demo Bucket and load it as\n", + "This will download some demo data from [GreenGuard S3 demo Bucket](\n", + "https://d3-ai-greenguard.s3.amazonaws.com/index.html)%20and%20load%20it%20as/n", "the necessary `target_times` and `readings` tables.\n", "\n", "The exact format of these tables is described in the GreenGuard README and docs:" diff --git a/notebooks/2. Extract Readings.ipynb b/notebooks/2. Extract Readings.ipynb new file mode 100644 index 0000000..14b4cab --- /dev/null +++ b/notebooks/2. Extract Readings.ipynb @@ -0,0 +1,1214 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Extract Readings\n", + "\n", + "This notebook shows how to use the CSVLoader class to load readings from a folder\n", + "containing readings in the raw format.\n", + "\n", + "Details about the raw readings format can be found in the documentation site.\n", + "\n", + "In this notebook we will:\n", + "\n", + "- Generate a folder with readings in the raw format based on the demo data\n", + "- Load the redings needed for our target times\n", + "- Explore different options from the CSVLoader\n", + "- Load a pipeline and use it on the loaded data\n", + "- Load the readings in the unstacked format\n", + "- Load an unstacked pipeline and use it on the loaded data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Setup the logging\n", + "\n", + "This step sets up logging in our environment to increase our visibility over\n", + "the steps that GreenGuard performs." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging;\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "logging.getLogger().setLevel(level=logging.INFO)\n", + "\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Generate Raw Readings\n", + "\n", + "The first step will be to execute the `generate_raw_readings` function, which will create a\n", + "folder in the indicated path and populate it with the raw version of the demo readings.\n", + "\n", + "**NOTE**: if you want to use your own dataset you can skip this step and go directly to step 2." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-10 18:41:33,310 - INFO - demo - Generating file readings/T001/2013-01-.csv\n", + "2020-02-10 18:41:34,048 - INFO - demo - Generating file readings/T001/2013-02-.csv\n", + "2020-02-10 18:41:34,845 - INFO - demo - Generating file readings/T001/2013-03-.csv\n", + "2020-02-10 18:41:35,670 - INFO - demo - Generating file readings/T001/2013-04-.csv\n", + "2020-02-10 18:41:36,476 - INFO - demo - Generating file readings/T001/2013-05-.csv\n", + "2020-02-10 18:41:37,259 - INFO - demo - Generating file readings/T001/2013-06-.csv\n", + "2020-02-10 18:41:38,194 - INFO - demo - Generating file readings/T001/2013-07-.csv\n", + "2020-02-10 18:41:39,031 - INFO - demo - Generating file readings/T001/2013-08-.csv\n", + "2020-02-10 18:41:39,891 - INFO - demo - Generating file readings/T001/2013-09-.csv\n", + "2020-02-10 18:41:40,689 - INFO - demo - Generating file readings/T001/2013-10-.csv\n", + "2020-02-10 18:41:41,478 - INFO - demo - Generating file readings/T001/2013-11-.csv\n", + "2020-02-10 18:41:42,249 - INFO - demo - Generating file readings/T001/2013-12-.csv\n" + ] + } + ], + "source": [ + "from greenguard.demo import generate_raw_readings\n", + "\n", + "target_times = generate_raw_readings('readings')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This function will generate a set of reading files in the raw format.\n", + "\n", + "We will load one of them to explore it:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Readings Format" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "readings_sample = pd.read_csv('readings/T001/2013-01-.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
"
+      ],
+      "text/plain": [
+       "  signal_id          timestamp  value\n",
+       "0       S01  01/10/13 00:00:00  323.0\n",
+       "1       S02  01/10/13 00:00:00  320.0\n",
+       "2       S03  01/10/13 00:00:00  284.0\n",
+       "3       S04  01/10/13 00:00:00  348.0\n",
+       "4       S05  01/10/13 00:00:00  273.0"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "readings_sample.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we can clearly see the format in which the data is stored:\n",
+    "\n",
+    "* All the data from all the turbines is inside a single folder.\n",
+    "* Inside this folder, one folder exists for each turbine, named exactly like the turbine:\n",
+    "  * `readings/T001`\n",
+    "  * `readings/T002`\n",
+    "  * ...\n",
+    "* Inside each turbine folder one CSV file exists for each month, named `%Y-%m-.csv`.\n",
+    "  * `readings/T001/2010-01-.csv`\n",
+    "  * `readings/T001/2010-02-.csv`\n",
+    "  * `readings/T001/2010-03-.csv`\n",
+    "  * ...\n",
+    "* Each CSV file contains three columns:\n",
+    "  * `signal_id`: name or id of the signal.\n",
+    "  * `timestamp`: timestamp of the reading formatted as `%m/%d/%y %H:%M:%S`.\n",
+    "  * `value`: value of the reading."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Target Times"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The previous function also returned a `target_times` variable,\n",
+    "which is a `pandas.DataFrame` with the three expected columns:\n",
+    "\n",
+    "* `turbine_id`\n",
+    "* `cutoff_time`\n",
+    "* `target`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(353, 3)"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "target_times.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "
"
+      ],
+      "text/plain": [
+       "  turbine_id cutoff_time  target\n",
+       "0       T001  2013-01-12       0\n",
+       "1       T001  2013-01-13       0\n",
+       "2       T001  2013-01-14       0\n",
+       "3       T001  2013-01-15       1\n",
+       "4       T001  2013-01-16       0"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "target_times.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.3002832861189802"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "target_times.target.mean()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "turbine_id             object\n",
+       "cutoff_time    datetime64[ns]\n",
+       "target                  int64\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "target_times.dtypes"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. CSVLoader\n",
+    "\n",
+    "The readings in raw format can be arbitrarily big, which might make it impossible to load\n",
+    "them into memory all at once.\n",
+    "\n",
+    "In order to load them in an efficient way that allows us to solve Machine Learning problems\n",
+    "using them, GreenGuard provides the `greenguard.loaders.CSVLoader` class.\n",
+    "\n",
+    "This class is prepared to, given a target times table, explore a collection of raw readings\n",
+    "and extract only the information needed to solve the corresponding problem.\n",
+    "\n",
+    "The first step in order to use it is to create an instance, passing it the path\n",
+    "to where the reading files are stored.\n",
+    "\n",
+    "**NOTE**: If you want to use your own dataset instead of the demo version,\n",
+    "all you have to do is make the `readings_path` variable point at the\n",
+    "folder where you have your CSV files stored and load your `target_times` table:\n",
+    "\n",
+    "Make sure to parse the `cutoff_time` column!\n",
+    "\n",
+    "```python\n",
+    "readings_path = 'path/to/your/data'\n",
+    "target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from greenguard.loaders import CSVLoader\n",
+    "\n",
+    "readings_path = 'readings'\n",
+    "\n",
+    "csv_loader = CSVLoader(readings_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Once we have created our instance, we can load the readings needed for our target times\n",
+    "by calling the `load` method with two arguments:\n",
+    "\n",
+    "* `target_times (pandas.DataFrame)`: the `target_times` table.\n",
+    "* `window_size (str)`: the size of the training window, as a timedelta specification\n",
+    "  (amount + time unit). This indicates the minimum amount of data that we need to\n",
+    "  load for each training example in the `target_times` table.\n",
+    "\n",
+    "For example, let's load the readings needed for all our `target_times`, using a\n",
+    "`window_size` of one day."
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-10 19:03:18,638 - INFO - csv - Loaded 1298564 readings from turbine T001\n", + "2020-02-10 19:03:18,763 - INFO - csv - Loaded 1298564 turbine readings\n", + "2020-02-10 19:03:19,115 - INFO - targets - Dropped 2 invalid targets\n" + ] + } + ], + "source": [ + "target_times, readings = csv_loader.load(target_times, '1d')" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1298564, 4)" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-12294.0
1T001S022013-01-12310.0
2T001S032013-01-12306.0
3T001S042013-01-12303.0
4T001S052013-01-12265.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-12 294.0\n", + "1 T001 S02 2013-01-12 310.0\n", + "2 T001 S03 2013-01-12 306.0\n", + "3 T001 S04 2013-01-12 303.0\n", + "4 T001 S05 2013-01-12 265.0" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "turbine_id object\n", + "signal_id object\n", + "timestamp datetime64[ns]\n", + "value float64\n", + "dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see how the readings have been loaded with the expected format, including\n", + "the four expected columns:\n", + "\n", + "* `turbine_id`: Unique identifier of the turbine which this reading comes from.\n", + "* `signal_id`: Unique identifier of the signal which this reading comes from.\n", + "* `timestamp (datetime)`: Time where the reading took place, as a datetime.\n", + "* `value (float)`: Numeric value of this reading." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also see how there is a message that indicates that there are 2 invalid targets\n", + "that have been dropped. This is because within our readings there was not enough\n", + "data to cover the entire trainin window for them, so they cannot be included in the\n", + "final problem specification." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(351, 3)" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's see what happens if we increase the `window_size` to, for example, 30 days." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-10 19:08:21,859 - INFO - csv - Loaded 1302308 readings from turbine T001\n", + "2020-02-10 19:08:21,955 - INFO - csv - Loaded 1302308 turbine readings\n", + "2020-02-10 19:08:22,298 - INFO - targets - Dropped 28 invalid targets\n" + ] + } + ], + "source": [ + "target_times, readings = csv_loader.load(target_times, '30d')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that now more targets needed to be dropped, because there was enough data\n", + "for them." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(321, 3)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "On the other side, we can see how now the size of the loaded readings table\n", + "is a bit bigger, as more data had to be included to properly cover all the\n", + "training windows." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1302308, 4)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Preprocessing the data\n", + "\n", + "In some cases, if the amount of targets is big enough, fitting high frequency data\n", + "into memory will still be a challenge.\n", + "\n", + "For this cases, the `CSVLoader` class also supports passing a resampling rule and\n", + "an aggregation function specification, so the data can go through a sampling\n", + "frequency reduction aggregation while it is loaded, reducing the amount of spaces\n", + "that it occupies in memory once loaded.\n", + "\n", + "In order to use the resampling feature, we will need to create a new instance\n", + "of the `CSVLoader` passing the following new arguments:\n", + "\n", + "* `rule (str)`: Time-delta specification (amount+unit) of the new sampling frequency.\n", + "* `aggregation (str or function)`: Aggregation to apply when resampling." + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "csv_loader = CSVLoader(readings_path, rule='4h', aggregation='mean')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then call the `load` method normally." + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-10 19:31:50,932 - INFO - csv - Loaded 1235535 readings from turbine T001\n", + "2020-02-10 19:31:50,938 - INFO - csv - Resampling: 4h - mean\n", + "2020-02-10 19:31:51,459 - INFO - csv - Loaded 52130 turbine readings\n", + "2020-02-10 19:31:51,689 - INFO - targets - Dropped 2 invalid targets\n" + ] + } + ], + "source": [ + "target_times, readings = csv_loader.load(target_times, '14d')" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(52130, 4)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-27 00:00:00791.333333
1T001S012013-01-27 04:00:00746.750000
2T001S012013-01-27 08:00:00808.750000
3T001S012013-01-27 12:00:00760.125000
4T001S012013-01-27 16:00:00720.833333
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-27 00:00:00 791.333333\n", + "1 T001 S01 2013-01-27 04:00:00 746.750000\n", + "2 T001 S01 2013-01-27 08:00:00 808.750000\n", + "3 T001 S01 2013-01-27 12:00:00 760.125000\n", + "4 T001 S01 2013-01-27 16:00:00 720.833333" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(319, 3)" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Unstacking\n", + "\n", + "Some of the pipelines included in **GreenGuard** expect a slightly different input format,\n", + "where the data has been unstacked by `signal_id`, putting the values of each signal in a\n", + "different column instead of having all of them in a single column.\n", + "\n", + "In such cases, the `CSVLoader` can also take care of the unstacking step.\n", + "\n", + "For this, all you need to do is add `unstack=True` argument when creating the instance\n", + "and then use the `load` method as usual." + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-02-10 19:36:03,403 - INFO - csv - Loaded 1228047 readings from turbine T001\n", + "2020-02-10 19:36:03,411 - INFO - csv - Resampling: 4h - mean\n", + "2020-02-10 19:36:03,881 - INFO - csv - Loaded 1993 turbine readings\n", + "2020-02-10 19:36:04,165 - INFO - targets - Dropped 2 invalid targets\n" + ] + } + ], + "source": [ + "csv_loader = CSVLoader(readings_path, rule='4h', aggregation='mean', unstack=True)\n", + "target_times, readings = csv_loader.load(target_times, '14d')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1993, 28)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-28 00:00:00715.750000709.333333710.208333796.666667771.750000732.916667766.1666673.361627e+06...13.4875004.272212e+0649.04166749.04166749.04166749.04166749.04166749.04166749.041667336.000000
1T0012013-01-28 04:00:00779.416667777.500000779.666667824.125000800.083333765.291667791.9583333.362652e+06...14.6958334.279238e+0643.87500043.87500043.87500043.87500043.91666743.87500043.916667301.083333
2T0012013-01-28 08:00:00732.583333757.375000738.125000794.583333765.291667736.541667766.9166673.364190e+06...14.1000004.289814e+0681.66666782.37500082.41666782.87500082.54166783.25000081.416667564.041667
3T0012013-01-28 12:00:00743.833333779.083333775.833333804.208333771.458333736.166667761.0000003.366258e+06...13.6916674.304198e+0688.25000090.83333390.87500091.50000090.16666790.87500088.916667616.833333
4T0012013-01-28 16:00:00640.416667678.000000675.958333709.166667675.833333670.666667682.1666673.368310e+06...12.4541674.318658e+0680.45833383.54166785.33333385.91666783.50000086.37500083.333333574.958333
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", + "0 T001 2013-01-28 00:00:00 715.750000 709.333333 710.208333 \n", + "1 T001 2013-01-28 04:00:00 779.416667 777.500000 779.666667 \n", + "2 T001 2013-01-28 08:00:00 732.583333 757.375000 738.125000 \n", + "3 T001 2013-01-28 12:00:00 743.833333 779.083333 775.833333 \n", + "4 T001 2013-01-28 16:00:00 640.416667 678.000000 675.958333 \n", + "\n", + " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", + "0 796.666667 771.750000 732.916667 766.166667 3.361627e+06 ... \n", + "1 824.125000 800.083333 765.291667 791.958333 3.362652e+06 ... \n", + "2 794.583333 765.291667 736.541667 766.916667 3.364190e+06 ... \n", + "3 804.208333 771.458333 736.166667 761.000000 3.366258e+06 ... \n", + "4 709.166667 675.833333 670.666667 682.166667 3.368310e+06 ... \n", + "\n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 13.487500 4.272212e+06 49.041667 49.041667 49.041667 49.041667 \n", + "1 14.695833 4.279238e+06 43.875000 43.875000 43.875000 43.875000 \n", + "2 14.100000 4.289814e+06 81.666667 82.375000 82.416667 82.875000 \n", + "3 13.691667 4.304198e+06 88.250000 90.833333 90.875000 91.500000 \n", + "4 12.454167 4.318658e+06 80.458333 83.541667 85.333333 85.916667 \n", + "\n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 49.041667 49.041667 49.041667 336.000000 \n", + "1 43.916667 43.875000 43.916667 301.083333 \n", + "2 82.541667 83.250000 81.416667 564.041667 \n", + "3 90.166667 90.875000 88.916667 616.833333 \n", + "4 83.500000 86.375000 83.333333 574.958333 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/CSVLoader Demo.ipynb b/notebooks/CSVLoader Demo.ipynb deleted file mode 100644 index 4710596..0000000 --- a/notebooks/CSVLoader Demo.ipynb +++ /dev/null @@ -1,836 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# CSVLoader Demo\n", - "\n", - "This notebook shows how to use the CSVLoader class to load readings from a folder\n", - "containing readings in the raw format.\n", - "\n", - "Details about the raw readings format can be found in the documentation site.\n", - "\n", - "In this notebook we will:\n", - "\n", - "- Generate a folder with readings in the raw format based on the demo data\n", - "- Load the redings needed for our target times\n", - "- Explore different options from the CSVLoader\n", - "- Load a pipeline and use it on the loaded data\n", - "- Load the readings in the unstacked format\n", - "- Load an unstacked pipeline and use it on the loaded data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 0. Setup the logging\n", - "\n", - "This step sets up logging in our environment to increase our visibility over\n", - "the steps that GreenGuard performs." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import logging;\n", - "\n", - "logging.basicConfig(level=logging.INFO)\n", - "logging.getLogger().setLevel(level=logging.INFO)\n", - "\n", - "import warnings\n", - "warnings.simplefilter(\"ignore\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 1. Generate Raw Readings\n", - "\n", - "The first step will be to execute the `generate_raw_readings` function, which will create a\n", - "folder in the indicated path and populate it with the raw version of the demo readings." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2020-02-03 14:26:17,008 - INFO - demo - Generating file raw_readings/T001/2013-01-.csv\n", - "2020-02-03 14:26:17,518 - INFO - demo - Generating file raw_readings/T001/2013-02-.csv\n", - "2020-02-03 14:26:18,045 - INFO - demo - Generating file raw_readings/T001/2013-03-.csv\n", - "2020-02-03 14:26:18,580 - INFO - demo - Generating file raw_readings/T001/2013-04-.csv\n", - "2020-02-03 14:26:19,118 - INFO - demo - Generating file raw_readings/T001/2013-05-.csv\n", - "2020-02-03 14:26:19,668 - INFO - demo - Generating file raw_readings/T001/2013-06-.csv\n", - "2020-02-03 14:26:20,219 - INFO - demo - Generating file raw_readings/T001/2013-07-.csv\n", - "2020-02-03 14:26:20,753 - INFO - demo - Generating file raw_readings/T001/2013-08-.csv\n", - "2020-02-03 14:26:21,304 - INFO - demo - Generating file raw_readings/T001/2013-09-.csv\n", - "2020-02-03 14:26:21,852 - INFO - demo - Generating file raw_readings/T001/2013-10-.csv\n", - "2020-02-03 14:26:22,388 - INFO - demo - Generating file raw_readings/T001/2013-11-.csv\n", - "2020-02-03 14:26:22,931 - INFO - demo - Generating file raw_readings/T001/2013-12-.csv\n" - ] - } - ], - "source": [ - "from greenguard.demo import generate_raw_readings\n", - "\n", - "readings_path = 'raw_readings'\n", - "\n", - "target_times = generate_raw_readings(readings_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function will generate a set of reading files in the raw format.\n", - "\n", - "We will load one of them to explore it:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "\n", - "readings_sample = pd.read_csv('raw_readings/T001/2013-01-.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S0101/10/13 00:00:00323.0
1T001S0201/10/13 00:00:00320.0
2T001S0301/10/13 00:00:00284.0
3T001S0401/10/13 00:00:00348.0
4T001S0501/10/13 00:00:00273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 01/10/13 00:00:00 323.0\n", - "1 T001 S02 01/10/13 00:00:00 320.0\n", - "2 T001 S03 01/10/13 00:00:00 284.0\n", - "3 T001 S04 01/10/13 00:00:00 348.0\n", - "4 T001 S05 01/10/13 00:00:00 273.0" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings_sample.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(353, 3)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", - "
" - ], - "text/plain": [ - " turbine_id cutoff_time target\n", - "0 T001 2013-01-12 0\n", - "1 T001 2013-01-13 0\n", - "2 T001 2013-01-14 0\n", - "3 T001 2013-01-15 1\n", - "4 T001 2013-01-16 0" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.3002832861189802" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.target.mean()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "turbine_id object\n", - "cutoff_time datetime64[ns]\n", - "target int64\n", - "dtype: object" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "from greenguard.loaders import CSVLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2020-02-03 12:50:11,263 - INFO - csv - Loaded 1306052 readings from turbine T001\n", - "2020-02-03 12:50:11,275 - INFO - csv - Loaded 1306052 turbine readings\n", - "2020-02-03 12:50:11,500 - INFO - targets - Dropped 2 invalid targets\n" - ] - } - ], - "source": [ - "import numpy as np\n", - "\n", - "csv_loader = CSVLoader('raw_demo')\n", - "target_times, readings = csv_loader.load(target_times, '1d')" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(1306052, 4)" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-11209.0
1T001S022013-01-11193.0
2T001S032013-01-11177.0
3T001S042013-01-11188.0
4T001S052013-01-11150.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-11 209.0\n", - "1 T001 S02 2013-01-11 193.0\n", - "2 T001 S03 2013-01-11 177.0\n", - "3 T001 S04 2013-01-11 188.0\n", - "4 T001 S05 2013-01-11 150.0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "turbine_id object\n", - "signal_id object\n", - "timestamp datetime64[ns]\n", - "value float64\n", - "dtype: object" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(351, 3)" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2020-02-03 12:50:31,423 - INFO - csv - Loaded 1306052 readings from turbine T001\n", - "2020-02-03 12:50:31,427 - INFO - csv - Resampling: 4h - mean\n", - "2020-02-03 12:50:31,689 - INFO - csv - Loaded 2119 turbine readings\n", - "2020-02-03 12:50:31,843 - INFO - targets - Dropped 14 invalid targets\n" - ] - } - ], - "source": [ - "csv_loader = CSVLoader('raw_demo', rule='4h', aggregation='mean', unstack=True)\n", - "target_times, readings = csv_loader.load(target_times, '15d')" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(2119, 28)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00253.041667268.250000268.041667297.166667234.666667261.916667206.7916673.198335e+06...9.0791673.134510e+0642.41666744.95833344.83333349.62500039.20833343.83333334.625293.166667
1T0012013-01-10 04:00:00572.083333555.291667538.666667592.291667557.166667534.000000544.2500003.199514e+06...10.8375003.142505e+0662.08333362.50000063.62500063.54166761.33333362.54166754.000421.208333
2T0012013-01-10 08:00:00688.791667696.791667706.625000750.791667714.250000683.333333658.1666673.201449e+06...12.7541673.155809e+0692.20833394.95833394.66666797.33333394.12500093.58333386.375638.291667
3T0012013-01-10 12:00:00396.333333418.500000415.791667438.541667382.250000364.666667320.3333333.203319e+06...10.9166673.168640e+0655.75000060.08333358.58333361.29166752.79166752.79166744.000376.125000
4T0012013-01-10 16:00:00390.458333408.875000409.500000458.000000415.583333363.000000364.4583333.204504e+06...10.4125003.176672e+0649.95833353.87500054.45833356.75000052.70833346.70833347.625354.750000
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", - "0 T001 2013-01-10 00:00:00 253.041667 268.250000 268.041667 \n", - "1 T001 2013-01-10 04:00:00 572.083333 555.291667 538.666667 \n", - "2 T001 2013-01-10 08:00:00 688.791667 696.791667 706.625000 \n", - "3 T001 2013-01-10 12:00:00 396.333333 418.500000 415.791667 \n", - "4 T001 2013-01-10 16:00:00 390.458333 408.875000 409.500000 \n", - "\n", - " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", - "0 297.166667 234.666667 261.916667 206.791667 3.198335e+06 ... \n", - "1 592.291667 557.166667 534.000000 544.250000 3.199514e+06 ... \n", - "2 750.791667 714.250000 683.333333 658.166667 3.201449e+06 ... \n", - "3 438.541667 382.250000 364.666667 320.333333 3.203319e+06 ... \n", - "4 458.000000 415.583333 363.000000 364.458333 3.204504e+06 ... \n", - "\n", - " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", - "0 9.079167 3.134510e+06 42.416667 44.958333 44.833333 49.625000 \n", - "1 10.837500 3.142505e+06 62.083333 62.500000 63.625000 63.541667 \n", - "2 12.754167 3.155809e+06 92.208333 94.958333 94.666667 97.333333 \n", - "3 10.916667 3.168640e+06 55.750000 60.083333 58.583333 61.291667 \n", - "4 10.412500 3.176672e+06 49.958333 53.875000 54.458333 56.750000 \n", - "\n", - " value_S23 value_S24 value_S25 value_S26 \n", - "0 39.208333 43.833333 34.625 293.166667 \n", - "1 61.333333 62.541667 54.000 421.208333 \n", - "2 94.125000 93.583333 86.375 638.291667 \n", - "3 52.791667 52.791667 44.000 376.125000 \n", - "4 52.708333 46.708333 47.625 354.750000 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(337, 3)" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.shape" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.9" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} From aa82ca9654da12753657d176bc6d8e5bd619a8e8 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 10 Feb 2020 22:47:23 -0500 Subject: [PATCH 011/171] Restrict sphinx version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ab0d545..ff6afe9 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ # docs 'm2r>=0.2.0', - 'Sphinx>=1.7.1', + 'Sphinx>=1.7.1,<2.4', 'sphinx_rtd_theme>=0.2.4', 'autodocsumm>=0.1.10', From 2f375a48c6b2aa70b339cb99594c18e970cbf169 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 11 Feb 2020 10:56:54 -0500 Subject: [PATCH 012/171] Prepare for v0.2.0 --- HISTORY.md | 8 ++++++++ greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 0bd0426..dc27a8f 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,13 @@ # History +## 0.2.0 - 2020-02-11 + +first stable release: + +* efficient data loading and preprocessing +* initial collection of dfs and lstm based pipelines +* optimized pipeline tuning + ## 0.1.0 * 
First release on PyPI diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 1d98a94..c60bdb4 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.1.1-dev' +__version__ = '0.2.0.dev0' import os diff --git a/setup.cfg b/setup.cfg index 02c12ea..614c66a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.1.dev0 +current_version = 0.2.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index ff6afe9..8a09fa9 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.1.1.dev0', + version='0.2.0.dev0', zip_safe=False, ) From 5de0680cbf4fc059bd4fa9fea49ac7200280edb0 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 11 Feb 2020 10:57:16 -0500 Subject: [PATCH 013/171] =?UTF-8?q?Bump=20version:=200.2.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.0.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 9 +++++---- setup.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index c60bdb4..f677be1 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.0.dev0' +__version__ = '0.2.0.dev1' import os diff --git a/setup.cfg b/setup.cfg index 614c66a..ba0d3ac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,16 +1,16 @@ [bumpversion] -current_version = 0.2.0.dev0 +current_version = 0.2.0.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? -serialize = +serialize = {major}.{minor}.{patch}.{release}{candidate} {major}.{minor}.{patch} [bumpversion:part:release] optional_value = release first_value = dev -values = +values = dev release @@ -34,7 +34,7 @@ ignore = # keep empty to prevent default ignores [isort] include_trailing_comment = True -line_length=99 +line_length = 99 lines_between_types = 0 multi_line_output = 4 not_skip = __init__.py @@ -45,3 +45,4 @@ test = pytest [tool:pytest] collect_ignore = ['setup.py'] + diff --git a/setup.py b/setup.py index 8a09fa9..bc4f06b 100644 --- a/setup.py +++ b/setup.py @@ -103,6 +103,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.0.dev0', + version='0.2.0.dev1', zip_safe=False, ) From 8cbf6f5101252a022abcd8bdb08b83d98513c456 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 11 Feb 2020 11:16:39 -0500 Subject: [PATCH 014/171] Make accuracy the default metric --- README.md | 20 +++++++++++--------- greenguard/pipeline.py | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 361b3bb..94de7e7 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,9 @@ AutoML for Renewable Energy Industries. [![PyPI Shield](https://img.shields.io/pypi/v/greenguard.svg)](https://pypi.python.org/pypi/greenguard) [![Travis CI Shield](https://travis-ci.org/D3-AI/GreenGuard.svg?branch=master)](https://travis-ci.org/D3-AI/GreenGuard) [![Downloads](https://pepy.tech/badge/greenguard)](https://pepy.tech/project/greenguard) + # GreenGuard @@ -138,7 +140,7 @@ The first step is to load the demo data. 
For this, we will import and call the `greenguard.demo.load_demo` function without any arguments: -```python +```python3 from greenguard.demo import load_demo target_times, readings = load_demo() @@ -175,7 +177,7 @@ In this case, we will split them using the [train_test_split function from sciki https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html), but it can be done with any other suitable tool. -```python +```python3 from sklearn.model_selection import train_test_split train, test = train_test_split(target_times, test_size=0.25, random_state=0) @@ -189,7 +191,7 @@ the `train` and `test` inputs. Additionally, if we want to calculate a goodness-of-fit score later on, we can separate the testing target values from the `test` table by popping them from it: -```python +```python3 test_targets = test.pop('target') ``` @@ -200,7 +202,7 @@ Once we have the data ready, we need to find a suitable pipeline. The list of available GreenGuard Pipelines can be obtained using the `greenguard.get_pipelines` function. -```python +```python3 from greenguard import get_pipelines pipelines = get_pipelines() @@ -222,7 +224,7 @@ available in the GreenGuard system: For the rest of this tutorial, we will select and use the pipeline `resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template. -```python +```python3 pipeline_name = 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier' ``` @@ -234,7 +236,7 @@ fit it. For this, we will create an instance of a `GreenGuardPipeline` object passing the name of the pipeline that we want to use: -```python +```python3 from greenguard.pipeline import GreenGuardPipeline pipeline = GreenGuardPipeline(pipeline_name) @@ -243,7 +245,7 @@ pipeline = GreenGuardPipeline(pipeline_name) And then we can directly fit it to our data by calling its `fit` method and passing in the training `target_times` and the complete `readings` table: -```python +```python3 pipeline.fit(train, readings) ``` @@ -253,7 +255,7 @@ After fitting the pipeline, we are ready to make predictions on new data by call `pipeline.predict` method passing the testing `target_times` and, again, the complete `readings` table. -```python +```python3 predictions = pipeline.predict(test, readings) ``` @@ -262,7 +264,7 @@ predictions = pipeline.predict(test, readings) Finally, after making predictions we can evaluate how good the prediction was using any suitable metric. 
-```python +```python3 from sklearn.metrics import f1_score f1_score(test_targets, predictions) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 7437a4a..6ac75af 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -184,7 +184,7 @@ def set_init_params(self, init_params): self._update_params(template_params, init_params) self._build_pipeline() - def __init__(self, template, metric, cost=False, init_params=None, stratify=True, + def __init__(self, template, metric='accuracy', cost=False, init_params=None, stratify=True, cv_splits=5, shuffle=True, random_state=0, preprocessing=0): self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state) From e756fc519562f4816eaa3e42a167861fbc759b15 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 11 Feb 2020 11:28:50 -0500 Subject: [PATCH 015/171] Use rundoc to run the readme snippets --- Makefile | 4 ++++ setup.py | 1 + tox.ini | 10 ++++++++-- 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 78fef0f..d4126ed 100644 --- a/Makefile +++ b/Makefile @@ -106,6 +106,10 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort test: ## run tests quickly with the default Python python -m pytest --basetemp=${ENVTMPDIR} --cov=greenguard +.PHONY: test-readme +test-readme: ## run the readme snippets + rundoc run --single-session python3 -t python3 README.md + .PHONY: test-all test-all: ## run tests on every Python version with tox tox -r diff --git a/setup.py b/setup.py index bc4f06b..611f50c 100644 --- a/setup.py +++ b/setup.py @@ -32,6 +32,7 @@ tests_require = [ 'pytest>=3.4.2', 'pytest-cov>=2.6.0', + 'rundoc>=0.4.3' ] development_requires = [ diff --git a/tox.ini b/tox.ini index f59f77b..de5cd07 100644 --- a/tox.ini +++ b/tox.ini @@ -1,11 +1,11 @@ [tox] -envlist = py35, py36, py37, lint, docs +envlist = py35, py36, py37, lint, docs, readme [travis] python = 3.7: py37 - 3.6: py36, docs, lint + 3.6: py36, docs, lint, readme 3.5: py35 @@ -30,3 +30,9 @@ skipsdist = true extras = dev commands = /usr/bin/env make docs + + +[testenv:readme] +skipsdist = true +commands = + /usr/bin/env make test-readme From ddad484f9423fe5b9e63dd93c5368b3382f7baef Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 11 Feb 2020 13:28:09 -0500 Subject: [PATCH 016/171] Move all travis to xenial --- .travis.yml | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/.travis.yml b/.travis.yml index ef8e31a..9cbca5a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,16 +1,11 @@ # Config file for automatic testing at travis-ci.org -dist: trusty +dist: xenial language: python python: + - 3.7 - 3.6 - 3.5 -matrix: - include: - - python: 3.7 - dist: xenial - sudo: required - # Command to install dependencies install: pip install -U tox-travis codecov From b0aba1a02fa25c398cfb77bc2b34c432c7251a86 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 12 Feb 2020 19:20:54 -0500 Subject: [PATCH 017/171] Add docker-compose --- .dockerignore | 1 + Dockerfile | 16 ++++++++++++++++ Makefile | 8 ++++---- docker-compose.yml | 9 +++++++++ docker/greenguard-jupyter.Dockerfile | 16 ---------------- 5 files changed, 30 insertions(+), 20 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yml delete mode 100644 docker/greenguard-jupyter.Dockerfile diff --git a/.dockerignore b/.dockerignore index d8e7acb..7ea8e51 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1 +1,2 @@ notebooks-private/ +.tox/ diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..f1f953d 
--- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.6 + +EXPOSE 8888 + +RUN adduser jupyter --uid 1000 --disabled-password --system + +RUN mkdir /greenguard +COPY setup.py /greenguard +RUN pip install -e /greenguard && pip install jupyter + +COPY greenguard /greenguard/greenguard +COPY notebooks /greenguard/notebooks + +WORKDIR /greenguard +USER jupyter +CMD /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token='' diff --git a/Makefile b/Makefile index d4126ed..7683786 100644 --- a/Makefile +++ b/Makefile @@ -218,8 +218,8 @@ docker-jupyter-clean: ## Remove the greenguard-jupyter docker image docker rmi -f greenguard-jupyter .PHONY: docker-jupyter-build -docker-jupyter-build: docker-jupyter-clean ## Build the greenguard-jupyter docker image using repo2docker - docker build -f docker/greenguard-jupyter.Dockerfile -t greenguard-jupyter . +docker-jupyter-build: ## Build the greenguard-jupyter docker image using repo2docker + docker build -t greenguard-jupyter . .PHONY: docker-jupyter-save docker-jupyter-save: docker-jupyter-build ## Build the greenguard-jupyter image and save it as greenguard-jupyter.tar @@ -231,11 +231,11 @@ docker-jupyter-load: ## Load the greenguard-jupyter image from greenguard-jupyte .PHONY: docker-jupyter-run docker-jupyter-run: ## Run the greenguard-jupyter image in editable mode - docker run --rm -v $(shell pwd):/app -ti -p8888:8888 --name greenguard-jupyter greenguard-jupyter + docker run --rm -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard-jupyter greenguard-jupyter .PHONY: docker-jupyter-start docker-jupyter-start: ## Start the greenguard-jupyter image as a daemon - docker run --rm -d -v $(shell pwd):/app -ti -p8888:8888 --name greenguard-jupyter greenguard-jupyter + docker run --rm -d -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard-jupyter greenguard-jupyter .PHONY: docker-jupyter-stop docker-jupyter-stop: ## Stop the greenguard-jupyter daemon diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..5c549c6 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,9 @@ +version: '3' +services: + greenguard: + build: + context: . 
+    ports:
+      - "8888:8888"
+    volumes:
+      - .:/greenguard
diff --git a/docker/greenguard-jupyter.Dockerfile b/docker/greenguard-jupyter.Dockerfile
deleted file mode 100644
index 947d76b..0000000
--- a/docker/greenguard-jupyter.Dockerfile
+++ /dev/null
@@ -1,16 +0,0 @@
-FROM python:3.6
-
-EXPOSE 8888
-
-RUN mkdir /app
-COPY setup.py /app
-RUN pip install -e /app && pip install jupyter
-
-COPY greenguard /app/greenguard
-COPY notebooks /app/notebooks
-
-RUN adduser jupyter --uid 1000 --disabled-password --system
-
-WORKDIR /app
-USER jupyter
-CMD /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token=''

From beacd76769f861fd49204a562acf919ec1dbd83a Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 13 Feb 2020 00:33:15 -0500
Subject: [PATCH 018/171] Update usage instructions

---
 Dockerfile         | 15 ++++++++-------
 README.md          | 29 +++++++++++++++++++++++++++--
 docker-compose.yml |  6 ++++--
 3 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index f1f953d..bd6411a 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,16 +1,17 @@
 FROM python:3.6
+ARG UID=1000
 
 EXPOSE 8888
 
-RUN adduser jupyter --uid 1000 --disabled-password --system
+RUN adduser jupyter --uid $UID --disabled-password --system
 
-RUN mkdir /greenguard
-COPY setup.py /greenguard
-RUN pip install -e /greenguard && pip install jupyter
+RUN mkdir /app
+COPY setup.py /app
+RUN pip install -e /app && pip install jupyter
 
-COPY greenguard /greenguard/greenguard
-COPY notebooks /greenguard/notebooks
+COPY greenguard /app/greenguard
+COPY notebooks /app/notebooks
 
-WORKDIR /greenguard
+WORKDIR /app
 USER jupyter
 CMD /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token=''
diff --git a/README.md b/README.md
index 94de7e7..aecf61c 100644
--- a/README.md
+++ b/README.md
@@ -43,7 +43,9 @@ The salient aspects of this customized project are:
 * A robust continuous integration and testing infrastructure.
 * A ``learning database`` recording all past outcomes --> tasks, pipelines, outcomes.
 
-# Requirements
+# Install
+
+## Requirements
 
 **GreenGuard** has been developed and runs on Python 3.6 and 3.7.
 
@@ -51,7 +53,7 @@ Also, although it is not strictly required, the usage of a [virtualenv](
 https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering
 with other software installed in the system where you are trying to run **GreenGuard**.
 
-# Install
+## Download and Install
 
 **GreenGuard** can be installed locally using [pip](https://pip.pypa.io/en/stable/) with
 the following command:
@@ -65,6 +67,29 @@ This will pull and install the latest stable release from [PyPi](https://pypi.or
 If you want to install from source or contribute to the project please read the
 [Contributing Guide](https://d3-ai.github.io/GreenGuard/contributing.html#get-started).
 
+## Docker usage
+
+Alternatively, **GreenGuard** is prepared to be run inside a docker environment using
+`docker-compose`.
+
+For this, make sure to have both [docker](https://docs.docker.com/install/) and [docker-compose](
+https://docs.docker.com/compose/install/) installed on your system and then follow these steps:
+
+1. Clone this repository and go into the `GreenGuard` folder:
+
+```bash
+git clone git@github.com:D3-AI/GreenGuard.git
+cd GreenGuard
+```
+
+2. Start a Jupyter Notebook inside a docker container.
+
+```bash
+docker-compose up --build
+```
+
+3. 
Point your browser at http://127.0.0.1:8888

 # Data Format
 
 The minimum input expected by the **GreenGuard** system consists of the following two elements,
diff --git a/docker-compose.yml b/docker-compose.yml
index 5c549c6..dfb7aed 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,9 +1,11 @@
 version: '3'
 services:
-  greenguard:
+  jupyter:
     build:
       context: .
+      args:
+        - UID=${UID:-1000}
     ports:
       - "8888:8888"
     volumes:
-      - .:/greenguard
+      - .:/app

From 5cd3fe89f71c3214bae085db1c2fb39e58b83a23 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 14 Feb 2020 09:52:48 -0500
Subject: [PATCH 019/171] Fix some typos and add save/load example

---
 notebooks/2. Extract Readings.ipynb | 389 ++++++++++++++++++----------
 1 file changed, 255 insertions(+), 134 deletions(-)

diff --git a/notebooks/2. Extract Readings.ipynb b/notebooks/2. Extract Readings.ipynb
index 14b4cab..8379817 100644
--- a/notebooks/2. Extract Readings.ipynb
+++ b/notebooks/2. Extract Readings.ipynb
@@ -6,19 +6,20 @@
    "source": [
     "# Extract Readings\n",
     "\n",
-    "This notebook shows how to use the CSVLoader class to load readings from a folder\n",
-    "containing readings in the raw format.\n",
+    "This notebook shows how to use the CSVLoader class to load the readings table from a folder\n",
+    "that contains readings in the raw CSV format.\n",
     "\n",
-    "Details about the raw readings format can be found in the documentation site.\n",
+    "The Raw CSV format is briefly explained below, but more details can be found in [the documentation site](\n",
+    "https://d3-ai.github.io/GreenGuard/advanced_usage/csv.html)\n",
     "\n",
     "In this notebook we will:\n",
     "\n",
     "- Generate a folder with readings in the raw format based on the demo data\n",
+    "- Explore the raw format\n",
     "- Load the readings needed for our target times\n",
     "- Explore different options from the CSVLoader\n",
-    "- Load a pipeline and use it on the loaded data\n",
     "- Load the readings in the unstacked format\n",
-    "- Load an unstacked pipeline and use it on the loaded data"
+    "- Store the readings and target times using pickle"
   ]
  },
@@ -69,18 +70,18 @@
    "name": "stderr",
    "output_type": "stream",
    "text": [
-     "2020-02-10 18:41:33,310 - INFO - demo - Generating file readings/T001/2013-01-.csv\n",
-     "2020-02-10 18:41:34,048 - INFO - demo - Generating file readings/T001/2013-02-.csv\n",
-     "2020-02-10 18:41:34,845 - INFO - demo - Generating file readings/T001/2013-03-.csv\n",
-     "2020-02-10 18:41:35,670 - INFO - demo - Generating file readings/T001/2013-04-.csv\n",
-     "2020-02-10 18:41:36,476 - INFO - demo - Generating file readings/T001/2013-05-.csv\n",
-     "2020-02-10 18:41:37,259 - INFO - demo - Generating file readings/T001/2013-06-.csv\n",
-     "2020-02-10 18:41:38,194 - INFO - demo - Generating file readings/T001/2013-07-.csv\n",
-     "2020-02-10 18:41:39,031 - INFO - demo - Generating file readings/T001/2013-08-.csv\n",
-     "2020-02-10 18:41:39,891 - INFO - demo - Generating file readings/T001/2013-09-.csv\n",
-     "2020-02-10 18:41:40,689 - INFO - demo - Generating file readings/T001/2013-10-.csv\n",
-     "2020-02-10 18:41:41,478 - INFO - demo - Generating file readings/T001/2013-11-.csv\n",
-     "2020-02-10 18:41:42,249 - INFO - demo - Generating file readings/T001/2013-12-.csv\n"
+     "2020-02-14 09:42:07,018 - INFO - demo - Generating file readings/T001/2013-01-.csv\n",
+     "2020-02-14 09:42:07,574 - INFO - demo - Generating file readings/T001/2013-02-.csv\n",
+     "2020-02-14 09:42:08,123 - INFO - demo - Generating file readings/T001/2013-03-.csv\n",
+     "2020-02-14 09:42:08,668 - INFO - 
demo - Generating file readings/T001/2013-04-.csv\n", + "2020-02-14 09:42:09,231 - INFO - demo - Generating file readings/T001/2013-05-.csv\n", + "2020-02-14 09:42:09,782 - INFO - demo - Generating file readings/T001/2013-06-.csv\n", + "2020-02-14 09:42:10,342 - INFO - demo - Generating file readings/T001/2013-07-.csv\n", + "2020-02-14 09:42:10,929 - INFO - demo - Generating file readings/T001/2013-08-.csv\n", + "2020-02-14 09:42:11,468 - INFO - demo - Generating file readings/T001/2013-09-.csv\n", + "2020-02-14 09:42:12,023 - INFO - demo - Generating file readings/T001/2013-10-.csv\n", + "2020-02-14 09:42:12,571 - INFO - demo - Generating file readings/T001/2013-11-.csv\n", + "2020-02-14 09:42:13,127 - INFO - demo - Generating file readings/T001/2013-12-.csv\n" ] } ], @@ -94,16 +95,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This function will generate a set of reading files in the raw format.\n", + "### Readings Format\n", "\n", - "We will load one of them to explore it:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Readings Format" + "Here we will load one of the generated CSV files to briefly explore its contents." ] }, { @@ -205,10 +199,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Here we can cleary see the format in which the data is stored:\n", + "We can cleary see the format in which the data is stored:\n", "\n", "* All the data from all the turbines is inside a single folder.\n", - "* Inside this folder, one folder exists for each turbine, named exactly like the turbine:\n", + "* Inside this folder, another folder exists for each turbine, named exactly like the turbine:\n", " * `readings/T001`\n", " * `readings/T002`\n", " * ...\n", @@ -235,16 +229,16 @@ "metadata": {}, "source": [ "The previous function will have also returned us a `target_times` variable,\n", - "which is a `pandas.DataFrame` with the three expected columns:\n", + "which is a `pandas.DataFrame` containing the training examples, with the three expected columns:\n", "\n", - "* `turbine_id`\n", - "* `cutoff_time`\n", - "* `target`" + "* `turbine_id`: Id of the turbine associated with each training example\n", + "* `cutoff_time`: Time at which the prediction is being made\n", + "* `target`: Value that needs to be predicted" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -253,7 +247,7 @@ "(353, 3)" ] }, - "execution_count": 7, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -264,7 +258,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -337,7 +331,7 @@ "4 T001 2013-01-16 0" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -348,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -357,7 +351,7 @@ "0.3002832861189802" ] }, - "execution_count": 9, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -368,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -380,7 +374,7 @@ "dtype: object" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -398,20 +392,20 @@ "The readings in raw format can arbitrarily big, which might make it impossible to load\n", "them into memory all at once.\n", "\n", - "In order to load them in an efficient way that 
allows us to solve Machine Learning problems\n",
-    "using them, GreenGuard provides the `greenguard.loaders.CSVLoader` class.\n",
+    "In order to load them in an efficient way so that we can use them to solve Machine Learning\n",
+    "problems, GreenGuard provides the `greenguard.loaders.CSVLoader` class.\n",
     "\n",
     "This class is prepared to, given a target times table, explore a collection of raw readings\n",
-    "and extract only the information needed to solve the corresponding problem.\n",
+    "and extract only the information needed to solve that particular problem.\n",
     "\n",
-    "The first step in order to use it, is to create an instance passing it the path\n",
+    "The first step in order to use it is to create an instance passing it the path\n",
     "to where the reading files are stored.\n",
     "\n",
     "**NOTE**: If you want to use your own dataset instead of the demo version,\n",
     "all you have to do is make the `readings_path` variable point at the\n",
     "folder where you have your CSV files stored and load your `target_times` table:\n",
     "\n",
-    "Make sure to parse the `cutoff_time` column!\n",
+    "Make sure to parse the `cutoff_time` column as a datetime!\n",
     "\n",
     "```python\n",
     "readings_path = 'path/to/your/data'\n",
     "target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n",
     "```"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from greenguard.loaders import CSVLoader\n",
    "\n",
    "readings_path = 'readings'\n",
    "\n",
    "csv_loader = CSVLoader(readings_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Once we have created our instance, we can load the readings needed for our target times\n",
-    "calling the `load` method with two arguments:\n",
+    "Once we have created our instance, we can load the readings needed for our target times by\n",
+    "calling the `load` method with the following two arguments:\n",
     "\n",
     "* `target_times (pandas.DataFrame)`: the `target_times` table.\n",
     "* `window_size (str)`: the size of the training window, as a timedelta specification\n",
     "    (amount + time unit). This indicates the minimum amount of data that we need to\n",
-    "    load for each training from the `target_times` table.\n",
+    "    load for each training example from the `target_times` table.\n",
     "    \n",
     "For example, let's load the readings needed for all our `target_times`, using a\n",
-    "`window_size` of one day."
+    "`window_size` of **one day**.\n",
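+    "\n",
+    "Note that any time-delta string that pandas can parse should be accepted as the\n",
+    "`window_size`. As a sketch of equivalent specifications (assuming standard pandas\n",
+    "parsing):\n",
+    "\n",
+    "```python\n",
+    "import pandas as pd\n",
+    "\n",
+    "# '1d', '24h' and '1440min' all denote the same window length\n",
+    "pd.to_timedelta('1d') == pd.to_timedelta('24h') == pd.to_timedelta('1440min')  # True\n",
+    "```"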
] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 10, "metadata": { "scrolled": false }, @@ -459,9 +453,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-10 19:03:18,638 - INFO - csv - Loaded 1298564 readings from turbine T001\n", - "2020-02-10 19:03:18,763 - INFO - csv - Loaded 1298564 turbine readings\n", - "2020-02-10 19:03:19,115 - INFO - targets - Dropped 2 invalid targets\n" + "2020-02-14 09:42:33,976 - INFO - csv - Loaded 1306052 readings from turbine T001\n", + "2020-02-14 09:42:34,006 - INFO - csv - Loaded 1306052 turbine readings\n", + "2020-02-14 09:42:34,268 - INFO - targets - Dropped 2 invalid targets\n" ] } ], @@ -471,16 +465,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(1298564, 4)" + "(1306052, 4)" ] }, - "execution_count": 18, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -491,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -526,36 +520,36 @@ " 0\n", " T001\n", " S01\n", - " 2013-01-12\n", - " 294.0\n", + " 2013-01-11\n", + " 209.0\n", " \n", " \n", " 1\n", " T001\n", " S02\n", - " 2013-01-12\n", - " 310.0\n", + " 2013-01-11\n", + " 193.0\n", " \n", " \n", " 2\n", " T001\n", " S03\n", - " 2013-01-12\n", - " 306.0\n", + " 2013-01-11\n", + " 177.0\n", " \n", " \n", " 3\n", " T001\n", " S04\n", - " 2013-01-12\n", - " 303.0\n", + " 2013-01-11\n", + " 188.0\n", " \n", " \n", " 4\n", " T001\n", " S05\n", - " 2013-01-12\n", - " 265.0\n", + " 2013-01-11\n", + " 150.0\n", " \n", " \n", "\n", @@ -563,14 +557,14 @@ ], "text/plain": [ " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-12 294.0\n", - "1 T001 S02 2013-01-12 310.0\n", - "2 T001 S03 2013-01-12 306.0\n", - "3 T001 S04 2013-01-12 303.0\n", - "4 T001 S05 2013-01-12 265.0" + "0 T001 S01 2013-01-11 209.0\n", + "1 T001 S02 2013-01-11 193.0\n", + "2 T001 S03 2013-01-11 177.0\n", + "3 T001 S04 2013-01-11 188.0\n", + "4 T001 S05 2013-01-11 150.0" ] }, - "execution_count": 19, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -581,7 +575,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -594,7 +588,7 @@ "dtype: object" ] }, - "execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -612,23 +606,23 @@ "\n", "* `turbine_id`: Unique identifier of the turbine which this reading comes from.\n", "* `signal_id`: Unique identifier of the signal which this reading comes from.\n", - "* `timestamp (datetime)`: Time where the reading took place, as a datetime.\n", - "* `value (float)`: Numeric value of this reading." + "* `timestamp (datetime)`: Time at which the reading took place, as a datetime.\n", + "* `value (float)`: Numerical value of this reading." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also see how there is a message that indicates that there are 2 invalid targets\n", - "that have been dropped. This is because within our readings there was not enough\n", - "data to cover the entire trainin window for them, so they cannot be included in the\n", - "final problem specification." + "We can also see in the logged output above that there is a message that indicates that there\n", + "are 2 invalid targets that have been dropped. 
This is because within our readings there was not\n",
+    "enough data to cover the entire training window for them, and they have been discarded to ensure\n",
+    "that there is no missing data in our problem data."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(351, 3)"
      ]
     },
-     "execution_count": 11,
+     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "target_times.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Let's see what happens if we increase the `window_size` to, for example, 30 days."
+    "Let's see what happens if we increase the `window_size` to, for example, **30 days**."
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 15,
  "metadata": {},
  "outputs": [
   {
    "name": "stderr",
    "output_type": "stream",
    "text": [
-     "2020-02-10 19:08:21,859 - INFO - csv - Loaded 1302308 readings from turbine T001\n",
-     "2020-02-10 19:08:21,955 - INFO - csv - Loaded 1302308 turbine readings\n",
-     "2020-02-10 19:08:22,298 - INFO - targets - Dropped 28 invalid targets\n"
+     "2020-02-14 09:42:54,273 - INFO - csv - Loaded 1306052 readings from turbine T001\n",
+     "2020-02-14 09:42:54,309 - INFO - csv - Loaded 1306052 turbine readings\n",
+     "2020-02-14 09:42:54,535 - INFO - targets - Dropped 29 invalid targets\n"
    ]
   }
  ],
  "source": [
   "target_times, readings = csv_loader.load(target_times, '30d')"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "We can see that now more targets needed to be dropped, because there was enough data\n",
-    "for them."
+    "We can see that now more targets were dropped, because there was not enough data for them."
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 16,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
-       "(321, 3)"
+       "(322, 3)"
     ]
    },
-     "execution_count": 26,
+     "execution_count": 16,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "target_times.shape"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "On the other side, we can see how now the size of the loaded readings table\n",
-    "is a bit bigger, as more data had to be included to properly cover all the\n",
-    "training windows."
+    "On the other hand, we can see how now the size of the loaded readings table has increased,\n",
+    "as more data had to be included to properly cover all the training windows."
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 17,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
-       "(1302308, 4)"
+       "(1306052, 4)"
     ]
    },
-     "execution_count": 27,
+     "execution_count": 17,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "readings.shape"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## 3. Preprocessing the data\n",
   "\n",
-    "In some cases, if the amount of targets is big enough, fitting high frequency data\n",
+    "In some cases, if the amount of targets is big enough, loading high frequency data\n",
   "into memory will still be a challenge.\n",
   "\n",
-    "For this cases, the `CSVLoader` class also supports passing a resampling rule and\n",
-    "an aggregation function specification, so the data can go through a sampling\n",
-    "frequency reduction aggregation while it is loaded, reducing the amount of spaces\n",
-    "that it occupies in memory once loaded.\n",
+    "For these cases, the `CSVLoader` class also supports passing a **resampling rule** and\n",
+    "an **aggregation function** specification. In this case, the data will go through a\n",
+    "**sampling frequency reduction aggregation** while it is loaded, reducing the amount\n",
+    "of memory needed to load it.\n",
    "\n",
-    "In order to use the resampling feature, we will need to create a new instance\n",
-    "of the `CSVLoader` passing the following new arguments:\n",
+    "In order to use the resampling feature, we will need to create a new instance of the\n",
+    "`CSVLoader` passing the following new arguments:\n",
    "\n",
    "* `rule (str)`: Time-delta specification (amount+unit) of the new sampling frequency.\n",
-    "* `aggregation (str or function)`: Aggregation to apply when resampling."
+    "* `aggregation (str or function)`: Aggregation function to apply when resampling.\n",
+    "\n",
+    "For example, let's create a `CSVLoader` instance that will reduce the sampling frequency\n",
+    "to **4 hours**, computing the **mean** of all the readings within each interval."
   ]
  },
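+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Under the hood this is roughly a per-signal pandas resample. A simplified sketch of\n",
+    "the aggregation that the loader applies while reading (see `greenguard/loaders/csv.py`):\n",
+    "\n",
+    "```python\n",
+    "# Illustration only: group by signal and aggregate each 4 hour interval\n",
+    "agg = readings.groupby('signal_id').resample('4h', on='timestamp').agg('mean')\n",
+    "```"
+   ]
+  },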
  {
   "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
@@ -768,17 +763,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2020-02-10 19:31:50,932 - INFO - csv - Loaded 1235535 readings from turbine T001\n",
-      "2020-02-10 19:31:50,938 - INFO - csv - Resampling: 4h - mean\n",
-      "2020-02-10 19:31:51,459 - INFO - csv - Loaded 52130 turbine readings\n",
-      "2020-02-10 19:31:51,689 - INFO - targets - Dropped 2 invalid targets\n"
+      "2020-02-14 09:43:13,166 - INFO - csv - Loaded 1239279 readings from turbine T001\n",
+      "2020-02-14 09:43:13,168 - INFO - csv - Resampling: 4h - mean\n",
+      "2020-02-14 09:43:13,443 - INFO - csv - Loaded 52286 turbine readings\n",
+      "2020-02-14 09:43:13,586 - INFO - targets - Dropped 2 invalid targets\n"
     ]
    }
   ],
   "source": [
    "target_times, readings = csv_loader.load(target_times, '14d')"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can see now how the size of the readings table has been drastically reduced."
+   ]
+  },
  {
   "cell_type": "code",
-   "execution_count": 31,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "(52130, 4)"
+       "(52286, 4)"
      ]
     },
-     "execution_count": 31,
+     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "readings.shape"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 32,
+   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     " 0\n",
     " T001\n",
     " S01\n",
     " 2013-01-27 00:00:00\n",
     " 791.333333\n",
     " \n",
     " \n",
     " 1\n",
     " T001\n",
     " S01\n",
     " 2013-01-27 04:00:00\n",
     " 746.750000\n",
     " \n",
     " \n",
     " 2\n",
     " T001\n",
     " S01\n",
     " 2013-01-27 08:00:00\n",
     " 808.750000\n",
     " \n",
     " \n",
     " 3\n",
     " T001\n",
     " S01\n",
     " 2013-01-27 12:00:00\n",
     " 760.125000\n",
     " \n",
     " \n",
     " 4\n",
     " T001\n",
     " S01\n",
     " 2013-01-27 16:00:00\n",
     " 720.833333\n",
     " \n",
     " \n",
     "\n",
     ""
    ],
    "text/plain": [
     " turbine_id signal_id timestamp value\n",
     "0 T001 S01 2013-01-27 00:00:00 791.333333\n",
     "1 T001 S01 2013-01-27 04:00:00 746.750000\n",
     "2 T001 S01 2013-01-27 08:00:00 808.750000\n",
     "3 T001 S01 2013-01-27 12:00:00 760.125000\n",
     "4 T001 S01 2013-01-27 16:00:00 720.833333"
    ]
   },
-     "execution_count": 32,
+     "execution_count": 21,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "readings.head()"
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 33,
+   "execution_count": 22,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
-       "(319, 3)"
+       "(320, 3)"
     ]
    },
-     "execution_count": 33,
+     "execution_count": 22,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "target_times.shape"
  ]
 },
 {
  "cell_type": "markdown",
  "metadata": {},
  "source": [
   "## 4. 
Unstacking\n",
   "\n",
-    "Some of the pipelines included in **GreenGuard** expect a slightly different input format,\n",
+    "Some of the pipelines included in **GreenGuard** expect a slightly different input format\n",
   "where the data has been unstacked by `signal_id`, putting the values of each signal in a\n",
-    "different column instead of having all of them in a single column.\n",
+    "different column instead of having all of them in a single one.\n",
   "\n",
   "In such cases, the `CSVLoader` can also take care of the unstacking step.\n",
   "\n",
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 34,
+   "execution_count": 23,
  "metadata": {},
  "outputs": [
   {
    "name": "stderr",
    "output_type": "stream",
    "text": [
-     "2020-02-10 19:36:03,403 - INFO - csv - Loaded 1228047 readings from turbine T001\n",
-     "2020-02-10 19:36:03,411 - INFO - csv - Resampling: 4h - mean\n",
-     "2020-02-10 19:36:03,881 - INFO - csv - Loaded 1993 turbine readings\n",
-     "2020-02-10 19:36:04,165 - INFO - targets - Dropped 2 invalid targets\n"
+     "2020-02-14 09:43:33,528 - INFO - csv - Loaded 1231791 readings from turbine T001\n",
+     "2020-02-14 09:43:33,530 - INFO - csv - Resampling: 4h - mean\n",
+     "2020-02-14 09:43:33,831 - INFO - csv - Loaded 1999 turbine readings\n",
+     "2020-02-14 09:43:33,970 - INFO - targets - Dropped 2 invalid targets\n"
    ]
   }
  ],
  "source": [
   "csv_loader = CSVLoader(readings_path, rule='4h', aggregation='mean', unstack=True)\n",
   "target_times, readings = csv_loader.load(target_times, '14d')"
  ]
 },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The result is a table which has a much smaller number of rows, but one column for each signal."
+   ]
+  },
 {
  "cell_type": "code",
-   "execution_count": 35,
+   "execution_count": 24,
  "metadata": {},
  "outputs": [
   {
    "data": {
     "text/plain": [
-       "(1993, 28)"
+       "(1999, 28)"
     ]
    },
-     "execution_count": 35,
+     "execution_count": 24,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
  "source": [
   "readings.shape"
  ]
 },
 {
  "cell_type": "code",
-   "execution_count": 36,
+   "execution_count": 25,
  "metadata": {},
  "outputs": [
   {
@@ -1180,7 +1189,7 @@
       "[5 rows x 28 columns]"
      ]
     },
-     "execution_count": 36,
+     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "readings.head()"
   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Saving the readings\n",
+    "\n",
+    "In some cases we may not want to use the generated `readings` and `target_times` tables\n",
+    "right away, but rather store them for later use.\n",
+    "\n",
+    "### Using CSV\n",
+    "\n",
+    "This can be done using pandas and plain `CSV` format:\n",
+    "\n",
+    "**NOTE**: Notice the `index=False` argument. Otherwise, an extra index column will be added\n",
+    "to the CSV which would force us to modify the loading steps afterwards."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_times.to_csv('my_problem_target_times.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "readings.to_csv('my_problem_readings.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "After this, we can easily reload the data back using pandas again.\n",
+    "\n",
+    "**NOTE**: Notice how the datetime columns need to be passed so they can be parsed!\n",
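+    "\n",
+    "Without `parse_dates`, the timestamps would come back as plain strings. A quick sketch\n",
+    "of the difference (standard pandas behaviour, nothing GreenGuard specific):\n",
+    "\n",
+    "```python\n",
+    "pd.read_csv('my_problem_readings.csv').dtypes                             # timestamp: object\n",
+    "pd.read_csv('my_problem_readings.csv', parse_dates=['timestamp']).dtypes  # timestamp: datetime64[ns]\n",
+    "```"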
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "my_target_times = pd.read_csv('my_problem_target_times.csv', parse_dates=['cutoff_time'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "my_readings = pd.read_csv('my_problem_readings.csv', parse_dates=['timestamp'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "However, this approach has two inconveniences:\n",
+    "* Saving and loading the data is slow\n",
+    "* The datetimes need to be explicitly parsed"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Using Pickle\n",
+    "\n",
+    "To solve the previously mentioned inconveniences we can use `pickle` instead of `CSV` format\n",
+    "to store our data.\n",
+    "\n",
+    "In order to do this we will put the two tables in a `tuple` and store them using `pickle.dump`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pickle\n",
+    "\n",
+    "with open('my_problem.plk', 'wb') as pickle_file:\n",
+    "    pickle.dump((target_times, readings), pickle_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And then load it back all at once using `pickle.load`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('my_problem.plk', 'rb') as pickle_file:\n",
+    "    my_target_times, my_readings = pickle.load(pickle_file)"
+   ]
+  }
 ],
 "metadata": {
From 88c2bc52da5df3aef81e57bc7dc92ca718ffd136 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 14 Feb 2020 09:53:59 -0500
Subject: [PATCH 020/171] Prepare release notes for v0.2.0

---
 HISTORY.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/HISTORY.md b/HISTORY.md
index dc27a8f..90dec27 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,12 +1,13 @@
 # History

-## 0.2.0 - 2020-02-11
+## 0.2.0 - 2020-02-14

-first stable release:
+First stable release:

 * efficient data loading and preprocessing
 * initial collection of dfs and lstm based pipelines
 * optimized pipeline tuning
+* documentation and tutorials

 ## 0.1.0
From f3ce70b2c7108576821c305d6ea39e78cc0262b7 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 14 Feb 2020 09:54:04 -0500
Subject: [PATCH 021/171] =?UTF-8?q?Bump=20version:=200.2.0.dev1=20?=
 =?UTF-8?q?=E2=86=92=200.2.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 greenguard/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/greenguard/__init__.py b/greenguard/__init__.py
index f677be1..0c5a354 100644
--- a/greenguard/__init__.py
+++ b/greenguard/__init__.py
@@ -4,7 +4,7 @@

 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.0.dev1'
+__version__ = '0.2.0'

 import os

diff --git a/setup.cfg b/setup.cfg
index ba0d3ac..e64b81c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.0.dev1
+current_version = 0.2.0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? 
diff --git a/setup.py b/setup.py
index 611f50c..b200d20 100644
--- a/setup.py
+++ b/setup.py
@@ -104,6 +104,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.0.dev1',
+    version='0.2.0',
     zip_safe=False,
 )
From bb13a9353766803daca9f0a40eb19acd94a695a4 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 14 Feb 2020 09:54:19 -0500
Subject: [PATCH 022/171] =?UTF-8?q?Bump=20version:=200.2.0=20=E2=86=92=200?=
 =?UTF-8?q?.2.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 greenguard/__init__.py | 2 +-
 setup.cfg | 2 +-
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/greenguard/__init__.py b/greenguard/__init__.py
index 0c5a354..35fcad0 100644
--- a/greenguard/__init__.py
+++ b/greenguard/__init__.py
@@ -4,7 +4,7 @@

 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.0'
+__version__ = '0.2.1.dev0'

 import os

diff --git a/setup.cfg b/setup.cfg
index e64b81c..3596a0b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.0
+current_version = 0.2.1.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?

diff --git a/setup.py b/setup.py
index b200d20..96b2019 100644
--- a/setup.py
+++ b/setup.py
@@ -104,6 +104,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.0',
+    version='0.2.1.dev0',
     zip_safe=False,
 )
From 18e3c59493dbf5daae54b129925d1f952fd35102 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Wed, 19 Feb 2020 13:44:28 -0500
Subject: [PATCH 023/171] Fix datetime format

---
 greenguard/demo.py | 2 +-
 greenguard/loaders/csv.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/greenguard/demo.py b/greenguard/demo.py
index bae6a64..e15f71d 100644
--- a/greenguard/demo.py
+++ b/greenguard/demo.py
@@ -61,7 +61,7 @@ def generate_raw_readings(output_path='demo'):
         data = readings[readings.turbine_id == turbine_id]
         for month in range(1, 13):
             month_data = data[data.timestamp.dt.month == month].copy()
-            month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %M:%H:%S')
+            month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %H:%M:%S')
             month_path = os.path.join(turbine_path, '2013-{:02d}-.csv'.format(month))
             LOGGER.info('Generating file %s', month_path)
             month_data[['signal_id', 'timestamp', 'value']].to_csv(month_path, index=False)
diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py
index 5fec885..b0bfae5 100644
--- a/greenguard/loaders/csv.py
+++ b/greenguard/loaders/csv.py
@@ -36,7 +36,7 @@ class CSVLoader:
             Only used when resampling. Defaults to ``False``.
     """

-    DEFAULT_DATETIME_FMT = '%m/%d/%y %M:%H:%S'
+    DEFAULT_DATETIME_FMT = '%m/%d/%y %H:%M:%S'
     DEFAULT_FILENAME_FMT = '%Y-%m-.csv'

     def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=False,
From fe662815629094e35ee2adbb9bc1043fe2d75e8f Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Thu, 20 Feb 2020 21:43:12 -0500
Subject: [PATCH 024/171] Resample once per file. 
Also improve select_valid_targets

---
 greenguard/loaders/csv.py | 30 ++++++++++++++++++------------
 greenguard/pipeline.py | 4 +++-
 greenguard/targets.py | 31 +++++++++++++++++++++++++++----
 3 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py
index 5fec885..86f7b74 100644
--- a/greenguard/loaders/csv.py
+++ b/greenguard/loaders/csv.py
@@ -4,7 +4,7 @@
 import dask
 import pandas as pd

-from greenguard.targets import select_valid_targets
+from greenguard.targets import drop_duplicates, select_valid_targets

 LOGGER = logging.getLogger(__name__)

@@ -54,6 +54,12 @@ def __filter_by_signal(self, readings, signals):
             LOGGER.debug('Filtering by signal')
             readings = readings[readings.signal_id.isin(signals)]

+        try:
+            readings['value'] = readings['value'].astype(float)
+        except ValueError:
+            signals = readings[~readings['value'].str.isnumeric()].signal_id.unique()
+            raise ValueError('Signals contain non-numerical values: {}'.format(signals))
+
         LOGGER.debug('Selected %s readings by signal', len(readings))

         return readings.copy()
@@ -97,12 +103,6 @@ def __load_readings_file(self, turbine_file, timestamps, signals):
     @dask.delayed
     def __consolidate(self, readings, turbine_id):
         readings = pd.concat(readings, ignore_index=True)
-        try:
-            readings['value'] = readings['value'].astype(float)
-        except ValueError:
-            signals = readings[readings['value'].str.isnumeric()].signal_id.unique()
-            raise ValueError('Signals contain non-numerical values: {}'.format(signals))
-
         readings.insert(0, 'turbine_id', turbine_id)

         LOGGER.info('Loaded %s readings from turbine %s', len(readings), turbine_id)
@@ -127,9 +127,12 @@ def _join_names(names):
     @dask.delayed
     def __resample(self, readings):
         LOGGER.info('Resampling: %s - %s', self._rule, self._aggregation)
-        grouped = readings.groupby(['turbine_id', 'signal_id'])
+        grouped = readings.groupby('signal_id')
         dfr = grouped.resample(rule=self._rule, on='timestamp')
         agg = dfr.agg(self._aggregation)
+
+        LOGGER.info('%s readings reduced to %s', len(readings), len(agg))
+
         if self._unstack:
             agg = agg.unstack(level='signal_id').reset_index()
             agg.columns = agg.columns.map(self._join_names)
@@ -149,14 +152,15 @@ def _load_turbine(self, turbine_id, timestamps, signals=None):
             file_readings = self.__load_readings_file(filename, timestamps, signals)
             file_readings = self.__filter_by_signal(file_readings, signals)
             file_readings = self.__filter_by_timestamp(file_readings, timestamps)
+
+            if self._rule:
+                file_readings = self.__resample(file_readings)
+
             readings.append(file_readings)

         if readings:
             readings = self.__consolidate(readings, turbine_id)

-            if self._rule:
-                readings = self.__resample(readings)
-
         return readings

     @staticmethod
@@ -199,6 +203,8 @@ def load(self, target_times, window_size, signals=None, debug=False, select_vali
             target_times = pd.read_csv(target_times)
             target_times['cutoff_time'] = pd.to_datetime(target_times['cutoff_time'])

+        target_times = drop_duplicates(target_times)
+
         if isinstance(signals, pd.DataFrame):
             signals = signals.signal_id

@@ -222,7 +228,7 @@ def load(self, target_times, window_size, signals=None, debug=False, select_vali
         LOGGER.info('Loaded %s turbine readings', len(readings))

         if select_valid:
-            target_times = select_valid_targets(target_times, readings, window_size)
+            target_times = select_valid_targets(target_times, readings, window_size, self._rule)
             return target_times, readings

         return readings
diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index 6ac75af..5ed7ec1 100644
--- 
a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -224,6 +224,8 @@ def __init__(self, template, metric='accuracy', cost=False, init_params=None, st
         if self._preprocessing and (self._preprocessing > self._static):
             raise ValueError('Preprocessing cannot be bigger than static')

+        self.iterations = 0
+
     def __repr__(self):
         return (
             "GreenGuardPipeline({})\n"
@@ -444,7 +446,7 @@ def tune(self, target_times=None, readings=None, turbines=None, iterations=10):

         self._tuner = self._get_tuner()

-        for i in range(iterations):
+        for i in range(self.iterations, self.iterations + iterations):
             LOGGER.info('Scoring pipeline %s', i + 1)

             params = self._tuner.propose(1)
diff --git a/greenguard/targets.py b/greenguard/targets.py
index 18106b7..aef280a 100644
--- a/greenguard/targets.py
+++ b/greenguard/targets.py
@@ -119,12 +119,12 @@ def apply_function(row):
         except KeyError:
             return False

-        return times['min'] < cutoff < times['max']
+        return times['min'] <= cutoff <= times['max']

     return apply_function


-def select_valid_targets(target_times, readings, window_size):
+def select_valid_targets(target_times, readings, window_size, rule=None):
     """Filter out target_times without enough data for this window_size.

     The target_times table is scanned and checked against the readings table
@@ -138,6 +138,9 @@ def select_valid_targets(target_times, readings, window_size):
         Readings table, with at least turbine_id, signal_id, and timestamp fields.
     window_size (str or pandas.TimeDelta):
         TimeDelta specification that indicates the length of the training window.
+    rule (str or pandas.TimeDelta):
+        Resampling rule specification. If given, add that to the max timestamp
+        to ensure the period is completely covered.

     Returns:
         pandas.DataFrame:
@@ -147,9 +150,29 @@
     timestamps = readings.groupby('turbine_id').timestamp.agg(['min', 'max'])
     timestamps['min'] += pd.to_timedelta(window_size)

+    if rule is not None:
+        timestamps['max'] += pd.to_timedelta(rule)
+
     valid = target_times.apply(_valid_targets(timestamps), axis=1)
-    valid_targets = target_times[valid].copy()
+    valid_targets = target_times[valid]

-    LOGGER.info('Dropped %s invalid targets', len(target_times) - len(valid_targets))
+    length = len(valid_targets)
+    LOGGER.info('Dropped %s targets without enough data. Final target_times size: %s',
+                len(target_times) - length, length)

     return valid_targets
+
+
+def drop_duplicates(target_times):
+    length = len(target_times)
+    filtered = target_times.drop_duplicates()
+    new_length = len(filtered)
+    if length != new_length:
+        LOGGER.warning('Dropped %s duplicate targets!', length - new_length)
+
+    filtered = filtered.drop_duplicates(subset=['turbine_id', 'cutoff_time'], keep=False)
+    final_length = len(filtered)
+    if new_length != final_length:
+        LOGGER.warning('Dropped %s incoherent targets!', new_length - final_length)
+
+    return filtered.copy()
From 75e248ec0bb8114739a6dbb3a0d03588f9389701 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Mon, 24 Feb 2020 18:32:25 -0500
Subject: [PATCH 025/171] Fix docker config and update notebooks to the latest
 changes

---
 Dockerfile | 5 +-
 docker-compose.yml | 1 +
 notebooks/1. GreenGuard Quickstart.ipynb | 176 +++++-----
 notebooks/2. 
Extract Readings.ipynb | 405 +++++++++++++---------- 4 files changed, 315 insertions(+), 272 deletions(-) diff --git a/Dockerfile b/Dockerfile index bd6411a..3aeebd1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -7,8 +7,11 @@ RUN adduser jupyter --uid $UID --disabled-password --system RUN mkdir /app COPY setup.py /app -RUN pip install -e /app && pip install jupyter +RUN mkdir /app/greenguard +COPY greenguard/__init__.py /app/greenguard +RUN pip install -e /app jupyter +RUN rm -r /app/greenguard COPY greenguard /app/greenguard COPY notebooks /app/notebooks diff --git a/docker-compose.yml b/docker-compose.yml index dfb7aed..a839518 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,6 +1,7 @@ version: '3' services: jupyter: + network_mode: host build: context: . args: diff --git a/notebooks/1. GreenGuard Quickstart.ipynb b/notebooks/1. GreenGuard Quickstart.ipynb index a32b494..9c0e2d7 100644 --- a/notebooks/1. GreenGuard Quickstart.ipynb +++ b/notebooks/1. GreenGuard Quickstart.ipynb @@ -380,7 +380,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -405,7 +405,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -420,7 +420,7 @@ " 'resample_600s_unstack_144_lstm_timeseries_classifier']" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -440,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -451,7 +451,7 @@ " 'resample_600s_unstack_dfs_1d_xgb_classifier']" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -470,18 +470,18 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'resample_600s_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json'}" + "{'resample_600s_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json'}" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -508,7 +508,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -537,7 +537,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -557,15 +557,15 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2020-02-03 12:51:46,145 - INFO - pipeline - Scoring the default pipeline\n", - "2020-02-03 12:51:46,147 - INFO - pipeline - Running static 
steps before cross validation\n" + "INFO:greenguard.pipeline:Scoring the default pipeline\n", + "INFO:greenguard.pipeline:Running static steps before cross validation\n" ] }, { @@ -573,32 +573,32 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:32 | Progress: 100%|██████████\n", - "Elapsed: 00:16 | Progress: 100%|██████████\n", + "Elapsed: 00:47 | Progress: 100%|██████████\n", + "Elapsed: 00:24 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:33 | Progress: 100%|██████████\n", - "Elapsed: 00:15 | Progress: 100%|██████████\n", + "Elapsed: 00:50 | Progress: 100%|██████████\n", + "Elapsed: 00:23 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:31 | Progress: 100%|██████████\n", - "Elapsed: 00:15 | Progress: 100%|██████████\n" + "Elapsed: 00:46 | Progress: 100%|██████████\n", + "Elapsed: 00:23 | Progress: 100%|██████████\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2020-02-03 12:54:14,195 - INFO - pipeline - Default Pipeline score: 0.605187908496732\n", - "2020-02-03 12:54:14,196 - INFO - pipeline - Scoring pipeline 1\n", - "2020-02-03 12:54:14,199 - INFO - gp - Using Uniform sampler as user specified r_minimum threshold is not met to start the GP based learning\n", - "2020-02-03 12:54:14,380 - INFO - pipeline - Pipeline 1 score: 0.5976760567286199\n", - "2020-02-03 12:54:14,385 - INFO - pipeline - Scoring pipeline 2\n", - "2020-02-03 12:54:14,888 - INFO - pipeline - Pipeline 2 score: 0.5965798320999443\n", - "2020-02-03 12:54:14,890 - INFO - pipeline - Scoring pipeline 3\n", - "2020-02-03 12:54:15,313 - INFO - pipeline - Pipeline 3 score: 0.6431783902372138\n", - "2020-02-03 12:54:15,316 - INFO - pipeline - Scoring pipeline 4\n", - "2020-02-03 12:54:15,729 - INFO - pipeline - Pipeline 4 score: 0.5642664541017163\n", - "2020-02-03 12:54:15,731 - INFO - pipeline - Scoring pipeline 5\n", - "2020-02-03 12:54:15,883 - INFO - pipeline - Pipeline 5 score: 0.5859328579916815\n" + "INFO:greenguard.pipeline:Default Pipeline score: 0.605187908496732\n", + "INFO:greenguard.pipeline:Scoring pipeline 1\n", + "INFO:btb:Using Uniform sampler as user specified r_minimum threshold is not met to start the GP based learning\n", + "INFO:greenguard.pipeline:Pipeline 1 score: 0.6188131761825791\n", + "INFO:greenguard.pipeline:Scoring pipeline 2\n", + "INFO:greenguard.pipeline:Pipeline 2 score: 0.6271095502877767\n", + "INFO:greenguard.pipeline:Scoring pipeline 3\n", + "INFO:greenguard.pipeline:Pipeline 3 score: 0.6305597783858653\n", + "INFO:greenguard.pipeline:Scoring pipeline 4\n", + "INFO:greenguard.pipeline:Pipeline 4 score: 0.6024864024864024\n", + "INFO:greenguard.pipeline:Scoring pipeline 5\n", + "INFO:greenguard.pipeline:Pipeline 5 score: 0.6141217155301661\n" ] } ], @@ -618,21 +618,21 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 28},\n", - " 'xgboost.XGBClassifier#1': {'n_estimators': 549,\n", - " 'max_depth': 3,\n", - " 'learning_rate': 0.09499856413762053,\n", - " 'gamma': 0.48809516357182936,\n", - " 'min_child_weight': 7}}" + "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 82},\n", + " 'xgboost.XGBClassifier#1': {'n_estimators': 785,\n", + " 'max_depth': 7,\n", + " 'learning_rate': 0.12220259756122442,\n", + " 'gamma': 0.07359343182340616,\n", + " 'min_child_weight': 9}}" ] }, - "execution_count": 17, + 
"execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -651,16 +651,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6431783902372138" + "0.6305597783858653" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -680,33 +680,33 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2020-02-03 12:54:15,971 - INFO - pipeline - Scoring pipeline 1\n", - "2020-02-03 12:54:16,421 - INFO - pipeline - Pipeline 1 score: 0.6220467704338674\n", - "2020-02-03 12:54:16,423 - INFO - pipeline - Scoring pipeline 2\n", - "2020-02-03 12:54:16,795 - INFO - pipeline - Pipeline 2 score: 0.5867369345630215\n", - "2020-02-03 12:54:16,797 - INFO - pipeline - Scoring pipeline 3\n", - "2020-02-03 12:54:17,227 - INFO - pipeline - Pipeline 3 score: 0.6161616161616162\n", - "2020-02-03 12:54:17,229 - INFO - pipeline - Scoring pipeline 4\n", - "2020-02-03 12:54:17,725 - INFO - pipeline - Pipeline 4 score: 0.6037324896256047\n", - "2020-02-03 12:54:17,727 - INFO - pipeline - Scoring pipeline 5\n", - "2020-02-03 12:54:18,287 - INFO - pipeline - Pipeline 5 score: 0.6169717350045217\n", - "2020-02-03 12:54:18,288 - INFO - pipeline - Scoring pipeline 6\n", - "2020-02-03 12:54:18,744 - INFO - pipeline - Pipeline 6 score: 0.639102564102564\n", - "2020-02-03 12:54:18,746 - INFO - pipeline - Scoring pipeline 7\n", - "2020-02-03 12:54:19,171 - INFO - pipeline - Pipeline 7 score: 0.6724889262202695\n", - "2020-02-03 12:54:19,174 - INFO - pipeline - Scoring pipeline 8\n", - "2020-02-03 12:54:19,627 - INFO - pipeline - Pipeline 8 score: 0.628250663400694\n", - "2020-02-03 12:54:19,629 - INFO - pipeline - Scoring pipeline 9\n", - "2020-02-03 12:54:20,250 - INFO - pipeline - Pipeline 9 score: 0.656191724941725\n", - "2020-02-03 12:54:20,253 - INFO - pipeline - Scoring pipeline 10\n", - "2020-02-03 12:54:20,799 - INFO - pipeline - Pipeline 10 score: 0.639014073371284\n" + "INFO:greenguard.pipeline:Scoring pipeline 1\n", + "INFO:greenguard.pipeline:Pipeline 1 score: 0.6635006784260514\n", + "INFO:greenguard.pipeline:Scoring pipeline 2\n", + "INFO:greenguard.pipeline:Pipeline 2 score: 0.6845139382452815\n", + "INFO:greenguard.pipeline:Scoring pipeline 3\n", + "INFO:greenguard.pipeline:Pipeline 3 score: 0.6424425247954658\n", + "INFO:greenguard.pipeline:Scoring pipeline 4\n", + "INFO:greenguard.pipeline:Pipeline 4 score: 0.6146558553876801\n", + "INFO:greenguard.pipeline:Scoring pipeline 5\n", + "INFO:greenguard.pipeline:Pipeline 5 score: 0.6188226349516671\n", + "INFO:greenguard.pipeline:Scoring pipeline 6\n", + "INFO:greenguard.pipeline:Pipeline 6 score: 0.6213326748609891\n", + "INFO:greenguard.pipeline:Scoring pipeline 7\n", + "INFO:greenguard.pipeline:Pipeline 7 score: 0.6431577681577682\n", + "INFO:greenguard.pipeline:Scoring pipeline 8\n", + "INFO:greenguard.pipeline:Pipeline 8 score: 0.6119918008302174\n", + "INFO:greenguard.pipeline:Scoring pipeline 9\n", + "INFO:greenguard.pipeline:Pipeline 9 score: 0.670814479638009\n", + "INFO:greenguard.pipeline:Scoring pipeline 10\n", + "INFO:greenguard.pipeline:Pipeline 10 score: 0.6781385082782808\n" ] } ], @@ -716,16 +716,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6724889262202695" + "0.6845139382452815" ] }, - 
"execution_count": 20, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -736,21 +736,21 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 35},\n", - " 'xgboost.XGBClassifier#1': {'n_estimators': 542,\n", - " 'max_depth': 9,\n", - " 'learning_rate': 0.8024814826871371,\n", - " 'gamma': 0.8891378840299992,\n", + "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 84},\n", + " 'xgboost.XGBClassifier#1': {'n_estimators': 788,\n", + " 'max_depth': 4,\n", + " 'learning_rate': 0.13866846579555614,\n", + " 'gamma': 0.652732260680545,\n", " 'min_child_weight': 10}}" ] }, - "execution_count": 21, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -774,7 +774,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -782,7 +782,7 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:35 | Progress: 100%|██████████\n" + "Elapsed: 00:52 | Progress: 100%|██████████\n" ] } ], @@ -801,14 +801,14 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 22, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:11 | Progress: 100%|██████████\n" + "Elapsed: 00:17 | Progress: 100%|██████████\n" ] } ], @@ -825,16 +825,16 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.7058823529411765" + "0.76" ] }, - "execution_count": 24, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -863,7 +863,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ @@ -882,7 +882,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -898,14 +898,14 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:11 | Progress: 100%|██████████\n" + "Elapsed: 00:17 | Progress: 100%|██████████\n" ] }, { @@ -914,7 +914,7 @@ "array([0, 0, 0, 1, 0])" ] }, - "execution_count": 27, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -941,7 +941,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.8" } }, "nbformat": 4, diff --git a/notebooks/2. Extract Readings.ipynb b/notebooks/2. Extract Readings.ipynb index 8379817..f8166a0 100644 --- a/notebooks/2. Extract Readings.ipynb +++ b/notebooks/2. 
Extract Readings.ipynb @@ -70,18 +70,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-14 09:42:07,018 - INFO - demo - Generating file readings/T001/2013-01-.csv\n", - "2020-02-14 09:42:07,574 - INFO - demo - Generating file readings/T001/2013-02-.csv\n", - "2020-02-14 09:42:08,123 - INFO - demo - Generating file readings/T001/2013-03-.csv\n", - "2020-02-14 09:42:08,668 - INFO - demo - Generating file readings/T001/2013-04-.csv\n", - "2020-02-14 09:42:09,231 - INFO - demo - Generating file readings/T001/2013-05-.csv\n", - "2020-02-14 09:42:09,782 - INFO - demo - Generating file readings/T001/2013-06-.csv\n", - "2020-02-14 09:42:10,342 - INFO - demo - Generating file readings/T001/2013-07-.csv\n", - "2020-02-14 09:42:10,929 - INFO - demo - Generating file readings/T001/2013-08-.csv\n", - "2020-02-14 09:42:11,468 - INFO - demo - Generating file readings/T001/2013-09-.csv\n", - "2020-02-14 09:42:12,023 - INFO - demo - Generating file readings/T001/2013-10-.csv\n", - "2020-02-14 09:42:12,571 - INFO - demo - Generating file readings/T001/2013-11-.csv\n", - "2020-02-14 09:42:13,127 - INFO - demo - Generating file readings/T001/2013-12-.csv\n" + "INFO:greenguard.demo:Generating file readings/T001/2013-01-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-02-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-03-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-04-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-05-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-06-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-07-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-08-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-09-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-10-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-11-.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-12-.csv\n" ] } ], @@ -453,14 +453,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-14 09:42:33,976 - INFO - csv - Loaded 1306052 readings from turbine T001\n", - "2020-02-14 09:42:34,006 - INFO - csv - Loaded 1306052 turbine readings\n", - "2020-02-14 09:42:34,268 - INFO - targets - Dropped 2 invalid targets\n" + "INFO:greenguard.loaders.csv:Loaded 1306052 readings from turbine T001\n", + "INFO:greenguard.loaders.csv:Loaded 1306052 turbine readings\n", + "INFO:greenguard.targets:Dropped 0 targets without enough data. Final target_times size: 353\n" ] } ], "source": [ - "target_times, readings = csv_loader.load(target_times, '1d')" + "new_target_times, readings = csv_loader.load(target_times, '1d')" ] }, { @@ -610,16 +610,6 @@ "* `value (float)`: Numerical value of this reading." ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also see in the logged output above that there is a message that indicates that there\n", - "are 2 invalid targets that have been dropped. This is because within our readings there was not\n", - "enough data to cover the entire trainin window for them, and they have been discarded to ensure\n", - "that there is no missing data in our problem data." 
- ] - }, { "cell_type": "code", "execution_count": 14, @@ -628,7 +618,7 @@ { "data": { "text/plain": [ - "(351, 3)" + "(353, 3)" ] }, "execution_count": 14, @@ -637,7 +627,7 @@ } ], "source": [ - "target_times.shape" + "new_target_times.shape" ] }, { @@ -656,21 +646,24 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-14 09:42:54,273 - INFO - csv - Loaded 1306052 readings from turbine T001\n", - "2020-02-14 09:42:54,309 - INFO - csv - Loaded 1306052 turbine readings\n", - "2020-02-14 09:42:54,535 - INFO - targets - Dropped 29 invalid targets\n" + "INFO:greenguard.loaders.csv:Loaded 1309796 readings from turbine T001\n", + "INFO:greenguard.loaders.csv:Loaded 1309796 turbine readings\n", + "INFO:greenguard.targets:Dropped 28 targets without enough data. Final target_times size: 325\n" ] } ], "source": [ - "target_times, readings = csv_loader.load(target_times, '30d')" + "new_target_times, readings = csv_loader.load(target_times, '30d')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that now more targets were be dropped, because there was not enough data for them." + "We can see now in the logged output above that there is a message that indicates that there\n", + "were 28 invalid targets that were dropped. This is because within our readings there was not\n", + "enough data to cover the entire training window for each traning example, so the ones that were\n", + "not covered were dropped to ensure that all the training examples are valid to work with them." ] }, { @@ -681,7 +674,7 @@ { "data": { "text/plain": [ - "(322, 3)" + "(325, 3)" ] }, "execution_count": 16, @@ -690,7 +683,7 @@ } ], "source": [ - "target_times.shape" + "new_target_times.shape" ] }, { @@ -709,7 +702,7 @@ { "data": { "text/plain": [ - "(1306052, 4)" + "(1309796, 4)" ] }, "execution_count": 17, @@ -770,15 +763,38 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-14 09:43:13,166 - INFO - csv - Loaded 1239279 readings from turbine T001\n", - "2020-02-14 09:43:13,168 - INFO - csv - Resampling: 4h - mean\n", - "2020-02-14 09:43:13,443 - INFO - csv - Loaded 52286 turbine readings\n", - "2020-02-14 09:43:13,586 - INFO - targets - Dropped 2 invalid targets\n" + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:115979 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", + "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:114400 readings 
reduced to 4836\n", + "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:Loaded 55250 readings from turbine T001\n", + "INFO:greenguard.loaders.csv:Loaded 55250 turbine readings\n", + "INFO:greenguard.targets:Dropped 12 targets without enough data. Final target_times size: 341\n" ] } ], "source": [ - "target_times, readings = csv_loader.load(target_times, '14d')" + "new_target_times, readings = csv_loader.load(target_times, '14d')" ] }, { @@ -796,7 +812,7 @@ { "data": { "text/plain": [ - "(52286, 4)" + "(55250, 4)" ] }, "execution_count": 20, @@ -845,36 +861,36 @@ " 0\n", " T001\n", " S01\n", - " 2013-01-27 00:00:00\n", - " 791.333333\n", + " 2013-01-10 00:00:00\n", + " 253.041667\n", " \n", " \n", " 1\n", " T001\n", " S01\n", - " 2013-01-27 04:00:00\n", - " 746.750000\n", + " 2013-01-10 04:00:00\n", + " 572.083333\n", " \n", " \n", " 2\n", " T001\n", " S01\n", - " 2013-01-27 08:00:00\n", - " 808.750000\n", + " 2013-01-10 08:00:00\n", + " 688.791667\n", " \n", " \n", " 3\n", " T001\n", " S01\n", - " 2013-01-27 12:00:00\n", - " 760.125000\n", + " 2013-01-10 12:00:00\n", + " 396.333333\n", " \n", " \n", " 4\n", " T001\n", " S01\n", - " 2013-01-27 16:00:00\n", - " 720.833333\n", + " 2013-01-10 16:00:00\n", + " 390.458333\n", " \n", " \n", "\n", @@ -882,11 +898,11 @@ ], "text/plain": [ " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-27 00:00:00 791.333333\n", - "1 T001 S01 2013-01-27 04:00:00 746.750000\n", - "2 T001 S01 2013-01-27 08:00:00 808.750000\n", - "3 T001 S01 2013-01-27 12:00:00 760.125000\n", - "4 T001 S01 2013-01-27 16:00:00 720.833333" + "0 T001 S01 2013-01-10 00:00:00 253.041667\n", + "1 T001 S01 2013-01-10 04:00:00 572.083333\n", + "2 T001 S01 2013-01-10 08:00:00 688.791667\n", + "3 T001 S01 2013-01-10 12:00:00 396.333333\n", + "4 T001 S01 2013-01-10 16:00:00 390.458333" ] }, "execution_count": 21, @@ -906,7 +922,7 @@ { "data": { "text/plain": [ - "(320, 3)" + "(341, 3)" ] }, "execution_count": 22, @@ -915,7 +931,7 @@ } ], "source": [ - "target_times.shape" + "new_target_times.shape" ] }, { @@ -943,16 +959,39 @@ "name": "stderr", "output_type": "stream", "text": [ - "2020-02-14 09:43:33,528 - INFO - csv - Loaded 1231791 readings from turbine T001\n", - "2020-02-14 09:43:33,530 - INFO - csv - Resampling: 4h - mean\n", - "2020-02-14 09:43:33,831 - INFO - csv - Loaded 1999 turbine readings\n", - "2020-02-14 09:43:33,970 - INFO - targets - Dropped 2 invalid targets\n" + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:115979 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + 
"INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:Loaded 2125 readings from turbine T001\n", + "INFO:greenguard.loaders.csv:Loaded 2125 turbine readings\n", + "INFO:greenguard.targets:Dropped 12 targets without enough data. Final target_times size: 341\n" ] } ], "source": [ "csv_loader = CSVLoader(readings_path, rule='4h', aggregation='mean', unstack=True)\n", - "target_times, readings = csv_loader.load(target_times, '14d')" + "new_target_times, readings = csv_loader.load(target_times, '14d')" ] }, { @@ -970,7 +1009,7 @@ { "data": { "text/plain": [ - "(1999, 28)" + "(2125, 28)" ] }, "execution_count": 24, @@ -1035,122 +1074,122 @@ " \n", " 0\n", " T001\n", - " 2013-01-28 00:00:00\n", - " 715.750000\n", - " 709.333333\n", - " 710.208333\n", - " 796.666667\n", - " 771.750000\n", - " 732.916667\n", - " 766.166667\n", - " 3.361627e+06\n", + " 2013-01-10 00:00:00\n", + " 253.041667\n", + " 268.250000\n", + " 268.041667\n", + " 297.166667\n", + " 234.666667\n", + " 261.916667\n", + " 206.791667\n", + " 3.198335e+06\n", " ...\n", - " 13.487500\n", - " 4.272212e+06\n", - " 49.041667\n", - " 49.041667\n", - " 49.041667\n", - " 49.041667\n", - " 49.041667\n", - " 49.041667\n", - " 49.041667\n", - " 336.000000\n", + " 9.079167\n", + " 3.134510e+06\n", + " 42.416667\n", + " 44.958333\n", + " 44.833333\n", + " 49.625000\n", + " 39.208333\n", + " 43.833333\n", + " 34.625\n", + " 293.166667\n", " \n", " \n", " 1\n", " T001\n", - " 2013-01-28 04:00:00\n", - " 779.416667\n", - " 777.500000\n", - " 779.666667\n", - " 824.125000\n", - " 800.083333\n", - " 765.291667\n", - " 791.958333\n", - " 3.362652e+06\n", + " 2013-01-10 04:00:00\n", + " 572.083333\n", + " 555.291667\n", + " 538.666667\n", + " 592.291667\n", + " 557.166667\n", + " 534.000000\n", + " 544.250000\n", + " 3.199514e+06\n", " ...\n", - " 14.695833\n", - " 4.279238e+06\n", - " 43.875000\n", - " 43.875000\n", - " 43.875000\n", - " 43.875000\n", - " 43.916667\n", - " 43.875000\n", - " 43.916667\n", - " 301.083333\n", + " 10.837500\n", + " 3.142505e+06\n", + " 62.083333\n", + " 62.500000\n", + " 63.625000\n", + " 63.541667\n", + " 61.333333\n", + " 62.541667\n", + " 54.000\n", + " 421.208333\n", " \n", " \n", " 2\n", " T001\n", - " 2013-01-28 08:00:00\n", - " 732.583333\n", - " 757.375000\n", - " 738.125000\n", - " 794.583333\n", - " 765.291667\n", - " 736.541667\n", - " 766.916667\n", - " 3.364190e+06\n", + " 2013-01-10 08:00:00\n", + " 688.791667\n", + " 696.791667\n", + " 706.625000\n", + " 750.791667\n", + " 714.250000\n", + " 683.333333\n", + " 658.166667\n", + " 3.201449e+06\n", " ...\n", - " 14.100000\n", - " 4.289814e+06\n", - " 81.666667\n", - " 82.375000\n", - " 82.416667\n", - " 82.875000\n", - " 82.541667\n", - " 83.250000\n", - " 81.416667\n", - " 564.041667\n", + " 12.754167\n", + " 3.155809e+06\n", + " 92.208333\n", + " 94.958333\n", + " 94.666667\n", + " 97.333333\n", + " 94.125000\n", + " 93.583333\n", + " 86.375\n", + " 638.291667\n", " \n", " \n", " 3\n", " T001\n", - " 2013-01-28 12:00:00\n", - " 743.833333\n", - " 779.083333\n", - " 
775.833333\n", - " 804.208333\n", - " 771.458333\n", - " 736.166667\n", - " 761.000000\n", - " 3.366258e+06\n", + " 2013-01-10 12:00:00\n", + " 396.333333\n", + " 418.500000\n", + " 415.791667\n", + " 438.541667\n", + " 382.250000\n", + " 364.666667\n", + " 320.333333\n", + " 3.203319e+06\n", " ...\n", - " 13.691667\n", - " 4.304198e+06\n", - " 88.250000\n", - " 90.833333\n", - " 90.875000\n", - " 91.500000\n", - " 90.166667\n", - " 90.875000\n", - " 88.916667\n", - " 616.833333\n", + " 10.916667\n", + " 3.168640e+06\n", + " 55.750000\n", + " 60.083333\n", + " 58.583333\n", + " 61.291667\n", + " 52.791667\n", + " 52.791667\n", + " 44.000\n", + " 376.125000\n", " \n", " \n", " 4\n", " T001\n", - " 2013-01-28 16:00:00\n", - " 640.416667\n", - " 678.000000\n", - " 675.958333\n", - " 709.166667\n", - " 675.833333\n", - " 670.666667\n", - " 682.166667\n", - " 3.368310e+06\n", + " 2013-01-10 16:00:00\n", + " 390.458333\n", + " 408.875000\n", + " 409.500000\n", + " 458.000000\n", + " 415.583333\n", + " 363.000000\n", + " 364.458333\n", + " 3.204504e+06\n", " ...\n", - " 12.454167\n", - " 4.318658e+06\n", - " 80.458333\n", - " 83.541667\n", - " 85.333333\n", - " 85.916667\n", - " 83.500000\n", - " 86.375000\n", - " 83.333333\n", - " 574.958333\n", + " 10.412500\n", + " 3.176672e+06\n", + " 49.958333\n", + " 53.875000\n", + " 54.458333\n", + " 56.750000\n", + " 52.708333\n", + " 46.708333\n", + " 47.625\n", + " 354.750000\n", " \n", " \n", "\n", @@ -1159,32 +1198,32 @@ ], "text/plain": [ " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", - "0 T001 2013-01-28 00:00:00 715.750000 709.333333 710.208333 \n", - "1 T001 2013-01-28 04:00:00 779.416667 777.500000 779.666667 \n", - "2 T001 2013-01-28 08:00:00 732.583333 757.375000 738.125000 \n", - "3 T001 2013-01-28 12:00:00 743.833333 779.083333 775.833333 \n", - "4 T001 2013-01-28 16:00:00 640.416667 678.000000 675.958333 \n", + "0 T001 2013-01-10 00:00:00 253.041667 268.250000 268.041667 \n", + "1 T001 2013-01-10 04:00:00 572.083333 555.291667 538.666667 \n", + "2 T001 2013-01-10 08:00:00 688.791667 696.791667 706.625000 \n", + "3 T001 2013-01-10 12:00:00 396.333333 418.500000 415.791667 \n", + "4 T001 2013-01-10 16:00:00 390.458333 408.875000 409.500000 \n", "\n", " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", - "0 796.666667 771.750000 732.916667 766.166667 3.361627e+06 ... \n", - "1 824.125000 800.083333 765.291667 791.958333 3.362652e+06 ... \n", - "2 794.583333 765.291667 736.541667 766.916667 3.364190e+06 ... \n", - "3 804.208333 771.458333 736.166667 761.000000 3.366258e+06 ... \n", - "4 709.166667 675.833333 670.666667 682.166667 3.368310e+06 ... \n", + "0 297.166667 234.666667 261.916667 206.791667 3.198335e+06 ... \n", + "1 592.291667 557.166667 534.000000 544.250000 3.199514e+06 ... \n", + "2 750.791667 714.250000 683.333333 658.166667 3.201449e+06 ... \n", + "3 438.541667 382.250000 364.666667 320.333333 3.203319e+06 ... \n", + "4 458.000000 415.583333 363.000000 364.458333 3.204504e+06 ... 
\n", "\n", " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", - "0 13.487500 4.272212e+06 49.041667 49.041667 49.041667 49.041667 \n", - "1 14.695833 4.279238e+06 43.875000 43.875000 43.875000 43.875000 \n", - "2 14.100000 4.289814e+06 81.666667 82.375000 82.416667 82.875000 \n", - "3 13.691667 4.304198e+06 88.250000 90.833333 90.875000 91.500000 \n", - "4 12.454167 4.318658e+06 80.458333 83.541667 85.333333 85.916667 \n", + "0 9.079167 3.134510e+06 42.416667 44.958333 44.833333 49.625000 \n", + "1 10.837500 3.142505e+06 62.083333 62.500000 63.625000 63.541667 \n", + "2 12.754167 3.155809e+06 92.208333 94.958333 94.666667 97.333333 \n", + "3 10.916667 3.168640e+06 55.750000 60.083333 58.583333 61.291667 \n", + "4 10.412500 3.176672e+06 49.958333 53.875000 54.458333 56.750000 \n", "\n", " value_S23 value_S24 value_S25 value_S26 \n", - "0 49.041667 49.041667 49.041667 336.000000 \n", - "1 43.916667 43.875000 43.916667 301.083333 \n", - "2 82.541667 83.250000 81.416667 564.041667 \n", - "3 90.166667 90.875000 88.916667 616.833333 \n", - "4 83.500000 86.375000 83.333333 574.958333 \n", + "0 39.208333 43.833333 34.625 293.166667 \n", + "1 61.333333 62.541667 54.000 421.208333 \n", + "2 94.125000 93.583333 86.375 638.291667 \n", + "3 52.791667 52.791667 44.000 376.125000 \n", + "4 52.708333 46.708333 47.625 354.750000 \n", "\n", "[5 rows x 28 columns]" ] @@ -1221,7 +1260,7 @@ "metadata": {}, "outputs": [], "source": [ - "target_times.to_csv('my_problem_target_times.csv', index=False)" + "new_target_times.to_csv('my_problem_target_times.csv', index=False)" ] }, { @@ -1290,7 +1329,7 @@ "import pickle\n", "\n", "with open('my_problem.plk', 'wb') as pickle_file:\n", - " pickle.dump((target_times, readings), pickle_file)" + " pickle.dump((new_target_times, readings), pickle_file)" ] }, { @@ -1327,7 +1366,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.8" } }, "nbformat": 4, From 4dbbc33a03b591e6fdc88b636dbb7c6a6300c136 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 25 Feb 2020 00:28:30 -0500 Subject: [PATCH 026/171] Fix docker config for windows and mac --- Dockerfile | 11 ++--------- docker-compose.yml | 1 - 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/Dockerfile b/Dockerfile index 3aeebd1..0917f4c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,18 +3,11 @@ FROM python:3.6 ARG UID=1000 EXPOSE 8888 -RUN adduser jupyter --uid $UID --disabled-password --system - RUN mkdir /app COPY setup.py /app -RUN mkdir /app/greenguard -COPY greenguard/__init__.py /app/greenguard -RUN pip install -e /app jupyter - -RUN rm -r /app/greenguard COPY greenguard /app/greenguard COPY notebooks /app/notebooks +RUN pip install -e /app jupyter WORKDIR /app -USER jupyter -CMD /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token='' +CMD pip install -e /app && /usr/local/bin/jupyter notebook --ip 0.0.0.0 --NotebookApp.token='' --allow-root diff --git a/docker-compose.yml b/docker-compose.yml index a839518..dfb7aed 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,7 +1,6 @@ version: '3' services: jupyter: - network_mode: host build: context: . 
args: From 10dcfebde5ce41c6d4c9f294ad4e59a8ac59f42e Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Tue, 25 Feb 2020 00:49:50 -0500 Subject: [PATCH 027/171] Update MANIFEST to include pipelines --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index 469520f..4ebe1c6 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,6 +4,8 @@ include HISTORY.md include LICENSE include README.md +recursive-include greenguard *.json + recursive-include tests * recursive-exclude * __pycache__ recursive-exclude * *.py[co] From b2b6784d4b25b028bc340ee0189a2e1d1333f2d3 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 18 May 2020 20:08:01 +0200 Subject: [PATCH 028/171] Update links --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index aecf61c..bef0983 100644 --- a/README.md +++ b/README.md @@ -13,17 +13,17 @@ AutoML for Renewable Energy Industries. [![PyPI Shield](https://img.shields.io/pypi/v/greenguard.svg)](https://pypi.python.org/pypi/greenguard) -[![Travis CI Shield](https://travis-ci.org/D3-AI/GreenGuard.svg?branch=master)](https://travis-ci.org/D3-AI/GreenGuard) +[![Travis CI Shield](https://travis-ci.org/signals-dev/GreenGuard.svg?branch=master)](https://travis-ci.org/signals-dev/GreenGuard) [![Downloads](https://pepy.tech/badge/greenguard)](https://pepy.tech/project/greenguard) # GreenGuard -- License: [MIT](https://github.com/D3-AI/GreenGuard/blob/master/LICENSE) -- Documentation: https://D3-AI.github.io/GreenGuard -- Homepage: https://github.com/D3-AI/GreenGuard +- License: [MIT](https://github.com/signals-dev/GreenGuard/blob/master/LICENSE) +- Documentation: https://signals-dev.github.io/GreenGuard +- Homepage: https://github.com/signals-dev/GreenGuard # Overview @@ -78,7 +78,7 @@ https://docs.docker.com/compose/install/) installed on your system and then foll 1. Clone this repository and go into the `GreenGuard` folder: ```bash -git clone git@github.com:D3-AI/GreenGuard.git +git clone git@github.com:signals-dev/GreenGuard.git cd GreenGuard ``` @@ -298,6 +298,6 @@ f1_score(test_targets, predictions) ## What's next? For more details about **GreenGuard** and all its possibilities and features, please check the -[project documentation site](https://D3-AI.github.io/GreenGuard/) +[project documentation site](https://signals-dev.github.io/GreenGuard/) Also do not forget to have a look at the [notebook tutorials]( -https://github.com/D3-AI/GreenGuard/tree/master/notebooks)! +https://github.com/signals-dev/GreenGuard/tree/master/notebooks)! From 47981a4efb213970b7e75602bdf936cd558e5870 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 19 May 2020 14:35:46 +0200 Subject: [PATCH 029/171] Integrate BTBSession with greenguard. 
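
A rough sketch of the tuning flow that this change enables, where `tunables`
is the dict built by GreenGuardPipeline._get_tunables and `scorer` is the
callable returned by _make_btb_scorer in the diff below (illustration only,
not part of the diff itself):

```python
from btb import BTBSession

# BTBSession both selects among the candidate templates and tunes the
# hyperparameters of each one, repeatedly calling the scorer with a
# template name and a proposed hyperparameter configuration.
session = BTBSession(tunables, scorer, maximize=True)
best_proposal = session.run(10)  # run 10 tuning iterations
# best_proposal holds the winning template name, config and score
```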
--- greenguard/pipeline.py | 186 +++++++++++++++++++---------------------- setup.py | 3 +- 2 files changed, 90 insertions(+), 99 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 5ed7ec1..a2119e6 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -1,14 +1,15 @@ # -*- coding: utf-8 -*- +import json import logging import os -from collections import defaultdict from copy import deepcopy +from hashlib import md5 import cloudpickle import numpy as np -from btb import HyperParameter -from btb.tuning import GP +from btb import BTBSession +from btb.tuning import Tunable from mlblocks import MLPipeline from mlblocks.discovery import load_pipeline from sklearn.exceptions import NotFittedError @@ -157,7 +158,8 @@ def _count_static_steps(self): return 0 def _build_pipeline(self): - self._pipeline = MLPipeline(self.template) + self._pipeline = MLPipeline(self.template_name) + if self._hyperparameters: self._pipeline.set_hyperparameters(self._hyperparameters) @@ -184,6 +186,25 @@ def set_init_params(self, init_params): self._update_params(template_params, init_params) self._build_pipeline() + @staticmethod + def _get_templates(template): + if not isinstance(template, list): + templates = [template] + else: + templates = template + + templates_dict = dict() + for template in templates: + if isinstance(template, str): + template_name = template + template = load_pipeline(template_name) + else: + template_name = md5(json.dumps(template)).digest() + + templates_dict[template_name] = template + + return templates_dict + def __init__(self, template, metric='accuracy', cost=False, init_params=None, stratify=True, cv_splits=5, shuffle=True, random_state=0, preprocessing=0): @@ -194,12 +215,12 @@ def __init__(self, template, metric='accuracy', cost=False, init_params=None, st self._metric = metric self._cost = cost + self.cv_score = np.inf if cost else -np.inf + self._splits = dict() - if isinstance(template, str): - self.template_name = template - self.template = load_pipeline(template) - else: - self.template = template + self.templates = self._get_templates(template) + self.template_name = list(self.templates.keys())[0] + self.template = self.templates[self.template_name] # Make sure to have block number in all init_params names template_params = self.template.setdefault('init_params', dict()) @@ -270,9 +291,14 @@ def _is_better(self, score): return score > self.cv_score - def _generate_splits(self, X, y, readings, turbines=None): + def _generate_splits(self, template_name, target_times, readings, turbines=None): + template = self.templates.get(template_name) + + X = target_times[['turbine_id', 'cutoff_time']] + y = target_times['target'] + if self._preprocessing: - pipeline = MLPipeline(self.template) + pipeline = MLPipeline(template) LOGGER.debug('Running %s preprocessing steps', self._preprocessing) context = pipeline.fit(X=X, y=y, readings=readings, turbines=turbines, output_=self._preprocessing - 1) @@ -290,7 +316,7 @@ def _generate_splits(self, X, y, readings, turbines=None): X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] - pipeline = MLPipeline(self.template) + pipeline = MLPipeline(template) fit = pipeline.fit(X_train, y_train, output_=self._static - 1, start_=self._preprocessing, **context) predict = pipeline.predict(X_test, output_=self._static - 1, @@ -300,7 +326,7 @@ def _generate_splits(self, X, y, readings, turbines=None): return splits - def cross_validate(self, X=None, y=None, 
readings=None, turbines=None, params=None): + def cross_validate(self, template_splits=None, params=None): """Compute cross validation score using the given data. If the splits have not been previously computed, compute them now. @@ -332,19 +358,17 @@ def cross_validate(self, X=None, y=None, readings=None, turbines=None, params=No Computed cross validation score. This score is the average of the scores obtained accross all the cross validation folds. """ - - if self._splits is None: - LOGGER.info('Running static steps before cross validation') - self._splits = self._generate_splits(X, y, readings, turbines) - scores = [] - for fold, pipeline, fit, predict, y_test in self._splits: + if template_splits is None: + template_splits = self._splits.get(self.template_name) + + for fold, pipeline, fit, predict, y_test in template_splits: LOGGER.debug('Scoring fold %s', fold) if params: pipeline.set_hyperparameters(params) else: - pipeline.set_hyperparameters(self._pipeline.get_hyperparameters()) + pipeline.set_hyperparameters(pipeline.get_hyperparameters()) pipeline.fit(start_=self._static, **fit) predictions = pipeline.predict(start_=self._static, **predict) @@ -355,71 +379,61 @@ def cross_validate(self, X=None, y=None, readings=None, turbines=None, params=No scores.append(score) cv_score = np.mean(scores) - if self.cv_score is None: - self.cv_score = cv_score - return cv_score - def _to_dicts(self, hyperparameters): - params_tree = defaultdict(dict) - for (block, hyperparameter), value in hyperparameters.items(): - if isinstance(value, np.integer): - value = int(value) + @staticmethod + def _parse_params(param_details): + param_type = param_details['type'] + param_details['type'] = 'str' if param_type == 'string' else param_type + + if param_details['type'] == 'bool': + param_details['range'] = [True, False] + else: + param_details['range'] = param_details.get('range') or param_details.get('values') - elif isinstance(value, np.floating): - value = float(value) + if 'default' not in param_details: + param_details['default'] = param_details['range'][0] - elif isinstance(value, np.ndarray): - value = value.tolist() + return param_details - elif value == 'None': - value = None + @classmethod + def _get_tunables(cls, templates): + pipelines = {name: MLPipeline(template) for name, template in templates.items()} + tunables = {} - params_tree[block][hyperparameter] = value + for pipeline_name, pipeline in pipelines.items(): + pipeline_tunables = {} + for name, param_details in pipeline.get_tunable_hyperparameters(flat=True).items(): + pipeline_tunables[name] = cls._parse_params(param_details) - return params_tree + tunables[pipeline_name] = Tunable.from_dict(pipeline_tunables) - def _to_tuples(self, params_tree, tunable_keys): - param_tuples = defaultdict(dict) - for block_name, params in params_tree.items(): - for param, value in params.items(): - key = (block_name, param) - if key in tunable_keys: - param_tuples[key] = 'None' if value is None else value + return tunables - return param_tuples + def _make_btb_scorer(self, target_times, readings, turbines): - def _get_tunables(self): - tunables = [] - tunable_keys = [] - for block_name, params in self._pipeline.get_tunable_hyperparameters().items(): - for param_name, param_details in params.items(): - key = (block_name, param_name) - param_type = param_details['type'] - param_type = 'string' if param_type == 'str' else param_type + def scorer(template_name, config): - if param_type == 'bool': - param_range = [True, False] - else: - param_range = 
param_details.get('range') or param_details.get('values') + template_splits = self._splits.get(template_name) + if not template_splits: + template_splits = self._generate_splits( + template_name, target_times, readings, turbines) - value = HyperParameter(param_type, param_range) - tunables.append((key, value)) - tunable_keys.append(key) + self._splits[template_name] = template_splits - return tunables, tunable_keys + score = self.cross_validate(template_splits, config) - def _get_tuner(self): - tunables, tunable_keys = self._get_tunables() - tuner = GP(tunables) + if self._is_better(score): + self.cv_score = score + self.template_name = template_name + self._hyperparameters = deepcopy(config) + self._build_pipeline() - # Inform the tuner about the score that the default hyperparmeters obtained - param_tuples = self._to_tuples(self._pipeline.get_hyperparameters(), tunable_keys) - tuner.add(param_tuples, self.cv_score) + return score - return tuner + return scorer - def tune(self, target_times=None, readings=None, turbines=None, iterations=10): + def tune(self, target_times, readings, turbines=None, iterations=10): """Tune this pipeline for the indicated number of iterations. Args: @@ -436,37 +450,13 @@ def tune(self, target_times=None, readings=None, turbines=None, iterations=10): iterations (int): Number of iterations to perform. """ - if not self._tuner: - LOGGER.info('Scoring the default pipeline') - X = target_times[['turbine_id', 'cutoff_time']] - y = target_times['target'] - self.cv_score = self.cross_validate(X, y, readings, turbines) - - LOGGER.info('Default Pipeline score: %s', self.cv_score) - - self._tuner = self._get_tuner() - - for i in range(self.iterations, self.iterations + iterations): - LOGGER.info('Scoring pipeline %s', i + 1) - - params = self._tuner.propose(1) - param_dicts = self._to_dicts(params) - - try: - score = self.cross_validate(params=param_dicts) - - LOGGER.info('Pipeline %s score: %s', i + 1, score) - - if self._is_better(score): - self.cv_score = score - self.set_hyperparameters(param_dicts) - - self._tuner.add(params, score) + scoring_function = self._make_btb_scorer(target_times, readings, turbines) + tunables = self._get_tunables(self.templates) + session = BTBSession(tunables, scoring_function, maximize=not self._cost) + if iterations: + session.run(iterations) - except Exception: - failed = '\n'.join('{}: {}'.format(k, v) for k, v in params.items()) - LOGGER.exception("Caught an exception scoring pipeline %s with params:\n%s", - i + 1, failed) + return session def fit(self, target_times, readings, turbines=None): """Fit this pipeline to the given data. 
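For reference, a worked sketch of the conversion that `_parse_params` and
`_get_tunables` above perform, turning the flat tunable-hyperparameter spec
of an MLPipeline into a BTB `Tunable`. The spec below is a made-up example;
the keys are `(block_name, hyperparameter)` tuples, as returned by
`get_tunable_hyperparameters(flat=True)`:

    from btb.tuning import Tunable

    spec = {
        ('xgboost.XGBClassifier#1', 'max_depth'): {
            'type': 'int', 'default': 3, 'range': [3, 10]},
        ('some.madeup.Block#1', 'normalize'): {
            'type': 'bool', 'default': True},
    }

    for name, details in spec.items():
        if details['type'] == 'bool':
            # Booleans carry no range of their own, so one is synthesized.
            details['range'] = [True, False]
        else:
            details['range'] = details.get('range') or details.get('values')
        if 'default' not in details:
            details['default'] = details['range'][0]

    tunable = Tunable.from_dict(spec)

Building one such `Tunable` per template is what lets a single BTBSession
search across several templates at once.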
diff --git a/setup.py b/setup.py index 96b2019..0389bee 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,8 @@ install_requires = [ 'mlblocks>=0.3.4,<0.4', 'mlprimitives>=0.2.4,<0.3', - 'baytune>=0.2.3,<0.3', + 'scipy>=1.0.1,<1.4.0', + 'baytune>=0.3.9,<0.4', 'numpy>=1.15.4,<1.17', 'pymongo>=3.7.2,<4', 'scikit-learn>=0.20.1,<0.21', From de21d40b2517c1a12d1ac06e0c9cdb7fb668505f Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Thu, 21 May 2020 13:10:28 +0200 Subject: [PATCH 030/171] =?UTF-8?q?Bump=20version:=200.2.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 35fcad0..df69d4a 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.1.dev0' +__version__ = '0.2.1.dev1' import os diff --git a/setup.cfg b/setup.cfg index 3596a0b..f6734b1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.1.dev0 +current_version = 0.2.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 96b2019..85d6a2c 100644 --- a/setup.py +++ b/setup.py @@ -104,6 +104,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.1.dev0', + version='0.2.1.dev1', zip_safe=False, ) From ffc27fef213853c0d431005c69c7b7a3c6c45279 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 1 Jun 2020 17:21:02 +0200 Subject: [PATCH 031/171] WIP: Support multiple templates for BTBSession. --- greenguard/pipeline.py | 200 +++++++++++++++++++++-------------------- 1 file changed, 103 insertions(+), 97 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index a2119e6..190997a 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -138,8 +138,9 @@ class GreenGuardPipeline(object): _cost = False _tuner = None _pipeline = None - _splits = None _static = None + _init_params = None + _preprocessing = None def _get_cv(self, stratify, cv_splits, shuffle, random_state): if stratify: @@ -149,22 +150,15 @@ def _get_cv(self, stratify, cv_splits, shuffle, random_state): return cv_class(n_splits=cv_splits, shuffle=shuffle, random_state=random_state) - def _count_static_steps(self): - tunable_hyperparams = self._pipeline.get_tunable_hyperparameters() - for index, block_name in enumerate(self._pipeline.blocks.keys()): + @staticmethod + def _count_static_steps(pipeline): + tunable_hyperparams = pipeline.get_tunable_hyperparameters() + for index, block_name in enumerate(pipeline.blocks.keys()): if tunable_hyperparams[block_name]: return index return 0 - def _build_pipeline(self): - self._pipeline = MLPipeline(self.template_name) - - if self._hyperparameters: - self._pipeline.set_hyperparameters(self._hyperparameters) - - self.fitted = False - @staticmethod def _update_params(old, new): for name, params in new.items(): @@ -175,17 +169,6 @@ def _update_params(old, new): for param, value in params.items(): block_params[param] = value - def set_init_params(self, init_params): - """Set new init params for the template and pipeline. - - Args: - init_params (dict): - New init_params to use. 
- """ - template_params = self.template['init_params'] - self._update_params(template_params, init_params) - self._build_pipeline() - @staticmethod def _get_templates(template): if not isinstance(template, list): @@ -205,6 +188,42 @@ def _get_templates(template): return templates_dict + def _get_init_params(self, template_name): + if self._init_params is None: + return {} + + elif template_name in self._init_params: + return self._init_params.get(template_name) + + return self._init_params + + def _get_preprocessing(self, template_name): + if isinstance(self._preprocessing, int): + return self._preprocessing + + if isinstance(self._preprocessing, dict): + return self._preprocessing.get(template_name) or 0 + + return 0 # by default + + def _build_pipeline(self, hyperparameters=None): + template_params = self.template.setdefault('init_params', dict()) + for name, params in list(template_params.items()): + if '#' not in name: + template_params[name + '#1'] = template_params.pop(name) + + init_params = self._get_init_params(self.template_name) + + if init_params: + self._update_params(template_params, init_params) + + self._pipeline = MLPipeline(self.template_name) + + if hyperparameters: + self._pipeline.set_hyperparameters(hyperparameters) + + self.fitted = False + def __init__(self, template, metric='accuracy', cost=False, init_params=None, stratify=True, cv_splits=5, shuffle=True, random_state=0, preprocessing=0): @@ -215,39 +234,33 @@ def __init__(self, template, metric='accuracy', cost=False, init_params=None, st self._metric = metric self._cost = cost + self._init_params = init_params + self._preprocessing = preprocessing + self.cv_score = np.inf if cost else -np.inf - self._splits = dict() self.templates = self._get_templates(template) self.template_name = list(self.templates.keys())[0] self.template = self.templates[self.template_name] - # Make sure to have block number in all init_params names - template_params = self.template.setdefault('init_params', dict()) - for name, params in list(template_params.items()): - if '#' not in name: - template_params[name + '#1'] = template_params.pop(name) - - self._hyperparameters = dict() - if init_params: - self.set_init_params(init_params) - else: - self._build_pipeline() - - self._static = self._count_static_steps() - self._preprocessing = preprocessing - - self.steps = self._pipeline.primitives.copy() - self.preprocessing = self.steps[:self._preprocessing] - self.static = self.steps[self._preprocessing:self._static] - self.tunable = self.steps[self._static:] + self._build_pipeline() - if self._preprocessing and (self._preprocessing > self._static): + _static = self._count_static_steps(self._pipeline) + _preprocessing = self._get_preprocessing(self.template_name) + if _preprocessing and (_preprocessing > _static): raise ValueError('Preprocessing cannot be bigger than static') self.iterations = 0 def __repr__(self): + steps = self._pipeline.primitives.copy() + preprocessing = self._get_preprocessing(self.template_name) + static = self._count_static_steps() + + preprocessing_steps = steps[:preprocessing] + static_steps = steps[preprocessing:static] + tunable_steps = steps[static:] + return ( "GreenGuardPipeline({})\n" " preprocessing:\n{}\n" @@ -255,9 +268,9 @@ def __repr__(self): " tunable:\n{}\n" ).format( self.template_name, - '\n'.join(' {}'.format(step) for step in self.preprocessing), - '\n'.join(' {}'.format(step) for step in self.static), - '\n'.join(' {}'.format(step) for step in self.tunable), + '\n'.join(' {}'.format(step) for step in 
preprocessing_steps), + '\n'.join(' {}'.format(step) for step in static_steps), + '\n'.join(' {}'.format(step) for step in tunable_steps), ) def get_hyperparameters(self): @@ -269,18 +282,6 @@ def get_hyperparameters(self): """ return deepcopy(self._hyperparameters) - def set_hyperparameters(self, hyperparameters): - """Set new hyperparameters for this pipeline instance. - - The template ``init_params`` remain unmodified. - - Args: - hyperparameters (dict): - New hyperparameters to use. - """ - self._update_params(self._hyperparameters, hyperparameters) - self._build_pipeline() - @staticmethod def _clone_pipeline(pipeline): return MLPipeline.from_dict(pipeline.to_dict()) @@ -297,11 +298,14 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) X = target_times[['turbine_id', 'cutoff_time']] y = target_times['target'] - if self._preprocessing: - pipeline = MLPipeline(template) - LOGGER.debug('Running %s preprocessing steps', self._preprocessing) + pipeline = MLPipeline(template) + preprocessing = self._get_preprocessing(template_name) + static = self._count_static_steps(pipeline) + + if preprocessing: + LOGGER.debug('Running %s preprocessing steps', preprocessing) context = pipeline.fit(X=X, y=y, readings=readings, - turbines=turbines, output_=self._preprocessing - 1) + turbines=turbines, output_=preprocessing - 1) del context['X'] del context['y'] else: @@ -311,22 +315,26 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) } splits = list() - for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)): - LOGGER.debug('Running static steps for fold %s', fold) - X_train, X_test = X.iloc[train_index], X.iloc[test_index] - y_train, y_test = y.iloc[train_index], y.iloc[test_index] + try: + for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)): + LOGGER.debug('Running static steps for fold %s', fold) + X_train, X_test = X.iloc[train_index], X.iloc[test_index] + y_train, y_test = y.iloc[train_index], y.iloc[test_index] - pipeline = MLPipeline(template) - fit = pipeline.fit(X_train, y_train, output_=self._static - 1, - start_=self._preprocessing, **context) - predict = pipeline.predict(X_test, output_=self._static - 1, - start_=self._preprocessing, **context) + pipeline = MLPipeline(template) + fit = pipeline.fit(X_train, y_train, output_=static - 1, + start_=preprocessing, **context) + predict = pipeline.predict(X_test, output_=static - 1, + start_=preprocessing, **context) - splits.append((fold, pipeline, fit, predict, y_test)) + splits.append((fold, pipeline, fit, predict, y_test, static)) + + except Exception: + LOGGER.info('Could not generate splits for %', template_name) return splits - def cross_validate(self, template_splits=None, params=None): + def cross_validate(self, template_name, template_splits, params=None): """Compute cross validation score using the given data. If the splits have not been previously computed, compute them now. @@ -359,19 +367,14 @@ def cross_validate(self, template_splits=None, params=None): of the scores obtained accross all the cross validation folds. 
""" scores = [] - if template_splits is None: - template_splits = self._splits.get(self.template_name) - for fold, pipeline, fit, predict, y_test in template_splits: + for fold, pipeline, fit, predict, y_test, static in template_splits: LOGGER.debug('Scoring fold %s', fold) - if params: - pipeline.set_hyperparameters(params) - else: - pipeline.set_hyperparameters(pipeline.get_hyperparameters()) + pipeline.set_hyperparameters(params) - pipeline.fit(start_=self._static, **fit) - predictions = pipeline.predict(start_=self._static, **predict) + pipeline.fit(start_=static, **fit) + predictions = pipeline.predict(start_=static, **predict) score = self._metric(y_test, predictions) @@ -379,6 +382,13 @@ def cross_validate(self, template_splits=None, params=None): scores.append(score) cv_score = np.mean(scores) + + if self._is_better(cv_score): + self.cv_score = cv_score + self.template_name = template_name + self._hyperparameters = deepcopy(params) + self._build_pipeline(self._hyperparameters) + return cv_score @staticmethod @@ -412,22 +422,18 @@ def _get_tunables(cls, templates): def _make_btb_scorer(self, target_times, readings, turbines): - def scorer(template_name, config): - - template_splits = self._splits.get(template_name) - if not template_splits: - template_splits = self._generate_splits( - template_name, target_times, readings, turbines) + splits = { + template_name: self._generate_splits(template_name, target_times, readings, turbines) + for template_name in list(self.templates.keys()) + } - self._splits[template_name] = template_splits - - score = self.cross_validate(template_splits, config) + def scorer(template_name, config): + template_splits = splits.get(template_name) + if template_splits: + score = self.cross_validate(template_name, template_splits, config) - if self._is_better(score): - self.cv_score = score - self.template_name = template_name - self._hyperparameters = deepcopy(config) - self._build_pipeline() + else: + return None return score From f23af78bd9567fece6ab314971205aa053e7a0ac Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Wed, 3 Jun 2020 12:46:52 +0200 Subject: [PATCH 032/171] Code review / improved aproach --- greenguard/pipeline.py | 226 ++++++++++++++++++++++------------------- 1 file changed, 122 insertions(+), 104 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 190997a..a6d8a15 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -142,6 +142,10 @@ class GreenGuardPipeline(object): _init_params = None _preprocessing = None + @staticmethod + def _clone_pipeline(pipeline): + return MLPipeline.from_dict(pipeline.to_dict()) + def _get_cv(self, stratify, cv_splits, shuffle, random_state): if stratify: cv_class = StratifiedKFold @@ -150,14 +154,30 @@ def _get_cv(self, stratify, cv_splits, shuffle, random_state): return cv_class(n_splits=cv_splits, shuffle=shuffle, random_state=random_state) - @staticmethod - def _count_static_steps(pipeline): - tunable_hyperparams = pipeline.get_tunable_hyperparameters() - for index, block_name in enumerate(pipeline.blocks.keys()): - if tunable_hyperparams[block_name]: - return index + def _get_init_params(self, template_name): + if self._init_params is None: + return {} - return 0 + elif any(name in self._init_params for name in list(self.template_names.keys())): + return self._init_params.get(template_name) + + return self._init_params + + def _set_hyperparameters(self, new_hyperparameters): + self._hyperparameters = deepcopy(new_hyperparameters) + + def 
_set_template(self, template_name): + self.template_name = deepcopy(template_name) + self.template = self.templates[self.template_name] + + def _get_preprocessing(self, template_name): + if isinstance(self._preprocessing, int): + return self._preprocessing + + if isinstance(self._preprocessing, dict): + return self._preprocessing.get(template_name) or 0 + + return 0 # by default @staticmethod def _update_params(old, new): @@ -169,6 +189,32 @@ def _update_params(old, new): for param, value in params.items(): block_params[param] = value + def _build_pipeline(self): + template_params = self.template.setdefault('init_params', dict()) + for name, params in list(template_params.items()): + if '#' not in name: + template_params[name + '#1'] = template_params.pop(name) + + init_params = self._get_init_params(self.template_name) + if init_params: + self._update_params(template_params, init_params) + + self._pipeline = MLPipeline(self.template) + + if self._hyperparameters: + self._pipeline.set_hyperparameters(self._hyperparameters) + + self.fitted = False + + @staticmethod + def _count_static_steps(pipeline): + tunable_hyperparams = pipeline.get_tunable_hyperparameters() + for index, block_name in enumerate(pipeline.blocks.keys()): + if tunable_hyperparams[block_name]: + return index + + return 0 + @staticmethod def _get_templates(template): if not isinstance(template, list): @@ -188,42 +234,6 @@ def _get_templates(template): return templates_dict - def _get_init_params(self, template_name): - if self._init_params is None: - return {} - - elif template_name in self._init_params: - return self._init_params.get(template_name) - - return self._init_params - - def _get_preprocessing(self, template_name): - if isinstance(self._preprocessing, int): - return self._preprocessing - - if isinstance(self._preprocessing, dict): - return self._preprocessing.get(template_name) or 0 - - return 0 # by default - - def _build_pipeline(self, hyperparameters=None): - template_params = self.template.setdefault('init_params', dict()) - for name, params in list(template_params.items()): - if '#' not in name: - template_params[name + '#1'] = template_params.pop(name) - - init_params = self._get_init_params(self.template_name) - - if init_params: - self._update_params(template_params, init_params) - - self._pipeline = MLPipeline(self.template_name) - - if hyperparameters: - self._pipeline.set_hyperparameters(hyperparameters) - - self.fitted = False - def __init__(self, template, metric='accuracy', cost=False, init_params=None, stratify=True, cv_splits=5, shuffle=True, random_state=0, preprocessing=0): @@ -240,13 +250,14 @@ def __init__(self, template, metric='accuracy', cost=False, init_params=None, st self.cv_score = np.inf if cost else -np.inf self.templates = self._get_templates(template) - self.template_name = list(self.templates.keys())[0] - self.template = self.templates[self.template_name] + self._set_template(list(self.templates.keys())[0]) + self._hyperparameters = dict() self._build_pipeline() _static = self._count_static_steps(self._pipeline) _preprocessing = self._get_preprocessing(self.template_name) + if _preprocessing and (_preprocessing > _static): raise ValueError('Preprocessing cannot be bigger than static') @@ -282,10 +293,6 @@ def get_hyperparameters(self): """ return deepcopy(self._hyperparameters) - @staticmethod - def _clone_pipeline(pipeline): - return MLPipeline.from_dict(pipeline.to_dict()) - def _is_better(self, score): if self._cost: return score < self.cv_score @@ -303,6 +310,10 @@ def 
_generate_splits(self, template_name, target_times, readings, turbines=None) static = self._count_static_steps(pipeline) if preprocessing: + + if preprocessing > static: + raise ValueError('Preprocessing cannot be bigger than static') + LOGGER.debug('Running %s preprocessing steps', preprocessing) context = pipeline.fit(X=X, y=y, readings=readings, turbines=turbines, output_=preprocessing - 1) @@ -315,26 +326,65 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) } splits = list() - try: - for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)): - LOGGER.debug('Running static steps for fold %s', fold) - X_train, X_test = X.iloc[train_index], X.iloc[test_index] - y_train, y_test = y.iloc[train_index], y.iloc[test_index] - - pipeline = MLPipeline(template) - fit = pipeline.fit(X_train, y_train, output_=static - 1, - start_=preprocessing, **context) - predict = pipeline.predict(X_test, output_=static - 1, - start_=preprocessing, **context) + for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)): + LOGGER.debug('Running static steps for fold %s', fold) + X_train, X_test = X.iloc[train_index], X.iloc[test_index] + y_train, y_test = y.iloc[train_index], y.iloc[test_index] - splits.append((fold, pipeline, fit, predict, y_test, static)) + pipeline = MLPipeline(template) + fit = pipeline.fit(X_train, y_train, output_=static - 1, + start_=preprocessing, **context) + predict = pipeline.predict(X_test, output_=static - 1, + start_=preprocessing, **context) - except Exception: - LOGGER.info('Could not generate splits for %', template_name) + splits.append((fold, pipeline, fit, predict, y_test, static)) return splits - def cross_validate(self, template_name, template_splits, params=None): + def _cross_validate(self, template_splits, hyperparams): + scores = [] + for fold, pipeline, fit, predict, y_test, static in template_splits: + LOGGER.debug('Scoring fold %s', fold) + + pipeline.set_hyperparameters(hyperparams) + pipeline.fit(start_=static, **fit) + predictions = pipeline.predict(start_=static, **predict) + + score = self._metric(y_test, predictions) + LOGGER.debug('Fold fold %s score: %s', fold, score) + scores.append(score) + + cv_score = np.mean(scores) + + return cv_score + + def _make_btb_scorer(self, target_times, readings, turbines): + + splits = {} + + def scorer(template_name, config): + + template_splits = splits.get(template_name) + if template_splits is None: + template_splits = self._generate_splits( + template_name, target_times, readings, turbines) + + splits[template_name] = template_splits + + cv_score = self._cross_validate(template_name, template_splits, config) + + if self._is_better(cv_score): + self.cv_score = cv_score + self._set_template(template_name) + self._set_hyperparameters(deepcopy(config)) + self._build_pipeline() + + return cv_score + + return scorer + + def cross_validate(self, target_times, readings, turbines, + template_name=None, hyperparams=None): """Compute cross validation score using the given data. If the splits have not been previously computed, compute them now. @@ -366,30 +416,17 @@ def cross_validate(self, template_name, template_splits, params=None): Computed cross validation score. This score is the average of the scores obtained accross all the cross validation folds. 
""" - scores = [] - - for fold, pipeline, fit, predict, y_test, static in template_splits: - LOGGER.debug('Scoring fold %s', fold) - - pipeline.set_hyperparameters(params) - - pipeline.fit(start_=static, **fit) - predictions = pipeline.predict(start_=static, **predict) - score = self._metric(y_test, predictions) + if not template_name: + template_name = self.template_name + if hyperparams is None: + hyperparams = self.get_hyperparams() - LOGGER.debug('Fold fold %s score: %s', fold, score) - scores.append(score) - - cv_score = np.mean(scores) + elif hyperparams is None: + hyperparams = {} - if self._is_better(cv_score): - self.cv_score = cv_score - self.template_name = template_name - self._hyperparameters = deepcopy(params) - self._build_pipeline(self._hyperparameters) - - return cv_score + template_splits = self._generate_splits(template_name, target_times, readings, turbines) + return self._cross_validate(template_splits, hyperparams) @staticmethod def _parse_params(param_details): @@ -420,25 +457,6 @@ def _get_tunables(cls, templates): return tunables - def _make_btb_scorer(self, target_times, readings, turbines): - - splits = { - template_name: self._generate_splits(template_name, target_times, readings, turbines) - for template_name in list(self.templates.keys()) - } - - def scorer(template_name, config): - template_splits = splits.get(template_name) - if template_splits: - score = self.cross_validate(template_name, template_splits, config) - - else: - return None - - return score - - return scorer - def tune(self, target_times, readings, turbines=None, iterations=10): """Tune this pipeline for the indicated number of iterations. From f83777ad369599fee330d63694f7b4a105aed9a3 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Wed, 3 Jun 2020 17:31:18 +0200 Subject: [PATCH 033/171] code improvements. --- greenguard/pipeline.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index a6d8a15..f81ae99 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -371,7 +371,7 @@ def scorer(template_name, config): splits[template_name] = template_splits - cv_score = self._cross_validate(template_name, template_splits, config) + cv_score = self._cross_validate(template_splits, config) if self._is_better(cv_score): self.cv_score = cv_score @@ -420,7 +420,7 @@ def cross_validate(self, target_times, readings, turbines, if not template_name: template_name = self.template_name if hyperparams is None: - hyperparams = self.get_hyperparams() + hyperparams = self.get_hyperparameters() elif hyperparams is None: hyperparams = {} From 60cde7ade9fb4f0fe3336cdbcba7021ae3e0b183 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 4 Jun 2020 16:18:38 +0200 Subject: [PATCH 034/171] Update notebook --- notebooks/1. GreenGuard Quickstart.ipynb | 304 +++++++++++++++-------- 1 file changed, 204 insertions(+), 100 deletions(-) diff --git a/notebooks/1. GreenGuard Quickstart.ipynb b/notebooks/1. GreenGuard Quickstart.ipynb index 9c0e2d7..be4b2f6 100644 --- a/notebooks/1. GreenGuard Quickstart.ipynb +++ b/notebooks/1. 
GreenGuard Quickstart.ipynb @@ -411,13 +411,13 @@ { "data": { "text/plain": [ - "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_double_144_lstm_timeseries_classifier',\n", + "['resample_600s_unstack_144_lstm_timeseries_classifier',\n", " 'resample_3600s_unstack_24_lstm_timeseries_classifier',\n", - " 'resample_3600s_unstack_double_24_lstm_timeseries_classifier',\n", " 'resample_600s_unstack_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_144_lstm_timeseries_classifier']" + " 'resample_600s_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_3600s_unstack_double_24_lstm_timeseries_classifier',\n", + " 'resample_600s_unstack_double_144_lstm_timeseries_classifier',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier']" ] }, "execution_count": 10, @@ -446,9 +446,9 @@ { "data": { "text/plain": [ - "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier']" + "['resample_600s_unstack_dfs_1d_xgb_classifier',\n", + " 'resample_600s_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier']" ] }, "execution_count": 11, @@ -476,9 +476,9 @@ { "data": { "text/plain": [ - "{'resample_600s_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json'}" + "{'resample_600s_unstack_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json'}" ] }, "execution_count": 12, @@ -495,15 +495,18 @@ "metadata": {}, "source": [ "For the rest of this tutorial, we will select and use the pipeline\n", - "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template.\n", + "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` and `resample_600s_normalize_dfs_1d_xgb_classifier`.\n", "\n", - "This templates contains the following steps:\n", + "The `resample_600s_unstack_normalize_dfs_1d_xgb_classifier` template contains the following steps:\n", "\n", "- Resample the data using a 10 minute average aggregation\n", "- Unstack the data by signal, so each signal is in a different column\n", "- Normalize the Turbine IDs into a new table to assist DFS aggregations\n", "- Use DFS on the readings based on the target_times cutoff times using a 1d window size\n", - "- Apply an XGBoost Classifier" + "- Apply an XGBoost Classifier\n", + "\n", + "And the `resample_600s_normalize_dfs_1d_xgb_classifier` template contains the above steps but without\n", + "unstacking the data by signal." 
]
  },
  {
@@ -512,7 +515,10 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "template = 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'"
+    "templates = [\n",
+    "    'resample_600s_unstack_normalize_dfs_1d_xgb_classifier', \n",
+    "    'resample_600s_normalize_dfs_1d_xgb_classifier'\n",
+    "]"
   ]
  },
  {
@@ -543,16 +549,21 @@
   "source": [
    "from greenguard.pipeline import GreenGuardPipeline\n",
    "\n",
-    "pipeline = GreenGuardPipeline(template, metric='f1', cv_splits=3)"
+    "pipeline = GreenGuardPipeline(templates, metric='f1', cv_splits=3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Once we have created the pipeline, we can call its `tune` method to find the best possible\n",
-    "hyperparameters for our data, passing the `target_times` and `readings` variables,\n",
-    "as well as an indication of the number of tuning iterations that we want to perform."
+    "Once we have created the pipeline, we can find which template and which combination of hyperparameters works best for our data by calling the `tune` method of our pipeline, passing the `target_times` and `readings` variables.\n",
+    "This method will return a `BTBSession` instance that will:\n",
+    "- Select and tune templates.\n",
+    "- If a template or a set of hyperparameters gets a higher score than the previous best, automatically update our pipeline so that it uses that template with those hyperparameters.\n",
+    "- Remove templates that don't work with the given data and focus on tuning only the ones that do.\n",
+    "\n",
+    "Also, if we specify `iterations`, the `tune` method will return a session that has already performed tuning\n",
+    "for the given number of iterations:"
   ]
  },
  {
@@ -564,8 +575,7 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "INFO:greenguard.pipeline:Scoring the default pipeline\n",
-      "INFO:greenguard.pipeline:Running static steps before cross validation\n"
+      "INFO:btb.session:Obtaining default configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 165 features\n",
-      "Elapsed: 00:47 | Progress: 100%|██████████\n",
-      "Elapsed: 00:24 | Progress: 100%|██████████\n",
+      "Elapsed: 01:46 | Progress: 100%|██████████\n",
+      "Elapsed: 00:43 | Progress: 100%|██████████\n",
      "Built 165 features\n",
-      "Elapsed: 00:50 | Progress: 100%|██████████\n",
-      "Elapsed: 00:23 | Progress: 100%|██████████\n",
+      "Elapsed: 00:57 | Progress: 100%|██████████\n",
+      "Elapsed: 00:27 | Progress: 100%|██████████\n",
      "Built 165 features\n",
-      "Elapsed: 00:46 | Progress: 100%|██████████\n",
-      "Elapsed: 00:23 | Progress: 100%|██████████\n"
+      "Elapsed: 00:54 | Progress: 100%|██████████\n",
+      "Elapsed: 00:24 | Progress: 100%|██████████\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.605187908496732\n",
      "INFO:btb.session:Obtaining default configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Built 99 features\n",
      "Elapsed: 03:44 | Progress: 100%|██████████\n",
      "Elapsed: 01:11 | Progress: 100%|██████████\n",
      "Built 99 features\n",
      "Elapsed: 02:24 | Progress: 100%|██████████\n",
      "Elapsed: 01:10 | Progress: 100%|██████████\n",
      "Built 99 features\n",
      "Elapsed: 02:55 | Progress: 100%|██████████\n",
      "Elapsed: 02:15 | Progress: 100%|██████████\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "INFO:greenguard.pipeline:Default Pipeline score: 0.605187908496732\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 1\n",
-      "INFO:btb:Using Uniform sampler as user specified r_minimum threshold is not met to start the GP based learning\n",
-      "INFO:greenguard.pipeline:Pipeline 1 score: 0.6188131761825791\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 2\n",
-      "INFO:greenguard.pipeline:Pipeline 2 score: 0.6271095502877767\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 3\n",
-      "INFO:greenguard.pipeline:Pipeline 3 score: 0.6305597783858653\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 4\n",
-      "INFO:greenguard.pipeline:Pipeline 4 score: 0.6024864024864024\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 5\n",
-      "INFO:greenguard.pipeline:Pipeline 5 score: 0.6141217155301661\n"
+      "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n",
+      "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6074772975193733\n",
+      "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n",
+      "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n"
     ]
    }
   ],
   "source": [
-    "pipeline.tune(target_times, readings, iterations=5)"
+    "session = pipeline.tune(target_times, readings, iterations=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "After the tuning process has finished, the hyperparameters have been already set in the classifier.\n",
+    "After the tuning process has finished, the template and the hyperparameters \n",
+    "that have obtained the best score have already been set in the classifier.\n",
     "\n",
-    "We can see the found hyperparameters by calling the `get_hyperparameters` method,\n",
-    "which will return a dictionary with the best hyperparameters found so far:"
+    "We can see the `best_proposal` that contains the template name, hyperparameters\n",
+    "and score by accessing `session.best_proposal`:"
   ]
  },
  {
@@ -624,12 +650,16 @@
   {
    "data": {
     "text/plain": [
-      "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 82},\n",
-      " 'xgboost.XGBClassifier#1': {'n_estimators': 785,\n",
-      "  'max_depth': 7,\n",
-      "  'learning_rate': 0.12220259756122442,\n",
-      "  'gamma': 0.07359343182340616,\n",
-      "  'min_child_weight': 9}}"
+      "{'id': 'c18f45d5e3bc2e41b3b3456b24d34add',\n",
+      " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n",
+      " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+      "   'max_labels'): 82,\n",
+      "  ('xgboost.XGBClassifier#1', 'n_estimators'): 940,\n",
+      "  ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n",
+      "  ('xgboost.XGBClassifier#1', 'learning_rate'): 0.5949116894971435,\n",
+      "  ('xgboost.XGBClassifier#1', 'gamma'): 0.14299079052852726,\n",
+      "  ('xgboost.XGBClassifier#1', 'min_child_weight'): 9},\n",
+      " 'score': 0.6074772975193733}"
     ]
    },
    "execution_count": 16,
    "metadata": {},
    "output_type": "execute_result"
   }
  ],
+   "source": [
+    "session.best_proposal"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "You can check that the new hyperparameters are already set by calling the `get_hyperparameters` method:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n",
+       "  'max_labels'): 82,\n",
+       "  ('xgboost.XGBClassifier#1', 'n_estimators'): 940,\n",
+       "  ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n",
+       "  ('xgboost.XGBClassifier#1', 'learning_rate'): 0.5949116894971435,\n",
+       "  ('xgboost.XGBClassifier#1', 'gamma'): 0.14299079052852726,\n",
+       "  ('xgboost.XGBClassifier#1', 'min_child_weight'): 9}"
+      ]
+     },
+     "execution_count": 17,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
    "pipeline.get_hyperparameters()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We can check the template name that is used to generate the pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pipeline.template_name"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
  },
  {
   "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
-       "0.6305597783858653"
+       "0.6074772975193733"
      ]
     },
-     "execution_count": 17,
+     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipeline.cv_score"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "**NOTE**: If the score is not good enough, we can call the `tune` method again as many times\n",
-    "as needed and the pipeline will continue its tuning process every time based on the previous\n",
-    "results!"
+    "**NOTE**: If the score is not good enough, we can call the `run` method of the `session` again,\n",
+    "specifying the number of iterations, and this will continue the tuning process from\n",
+    "the previous results!"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "INFO:greenguard.pipeline:Scoring pipeline 1\n",
-      "INFO:greenguard.pipeline:Pipeline 1 score: 0.6635006784260514\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 2\n",
-      "INFO:greenguard.pipeline:Pipeline 2 score: 0.6845139382452815\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 3\n",
-      "INFO:greenguard.pipeline:Pipeline 3 score: 0.6424425247954658\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 4\n",
-      "INFO:greenguard.pipeline:Pipeline 4 score: 0.6146558553876801\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 5\n",
-      "INFO:greenguard.pipeline:Pipeline 5 score: 0.6188226349516671\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 6\n",
-      "INFO:greenguard.pipeline:Pipeline 6 score: 0.6213326748609891\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 7\n",
-      "INFO:greenguard.pipeline:Pipeline 7 score: 0.6431577681577682\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 8\n",
-      "INFO:greenguard.pipeline:Pipeline 8 score: 0.6119918008302174\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 9\n",
-      "INFO:greenguard.pipeline:Pipeline 9 score: 0.670814479638009\n",
-      "INFO:greenguard.pipeline:Scoring pipeline 10\n",
-      "INFO:greenguard.pipeline:Pipeline 10 score: 0.6781385082782808\n"
+      "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n",
+      "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n",
+      "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6215756372962148\n",
+      "INFO:btb.session:Generating new proposal configuration for 
resample_600s_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6228241559394411\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:New optimal found: resample_600s_normalize_dfs_1d_xgb_classifier - 0.6310483870967741\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6421858959172391\n" ] + }, + { + "data": { + "text/plain": [ + "{'id': '597e7123769b671e0f0c964311ebc005',\n", + " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 5,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.8912106438743266,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.022878268134643553,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.6421858959172391}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "pipeline.tune(target_times, readings, iterations=10)" + "session.run(iterations=10)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6845139382452815" + "0.6421858959172391" ] }, - "execution_count": 19, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -736,21 +839,22 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'mlprimitives.custom.feature_extraction.CategoricalEncoder#1': {'max_labels': 84},\n", - " 'xgboost.XGBClassifier#1': {'n_estimators': 788,\n", - " 'max_depth': 4,\n", - " 'learning_rate': 0.13866846579555614,\n", - " 'gamma': 0.652732260680545,\n", - " 'min_child_weight': 10}}" + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 5,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.8912106438743266,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.022878268134643553,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 20, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -774,7 +878,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -782,7 +886,7 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:52 | Progress: 100%|██████████\n" + "Elapsed: 02:08 | Progress: 
100%|██████████\n" ] } ], @@ -801,14 +905,14 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:17 | Progress: 100%|██████████\n" + "Elapsed: 00:24 | Progress: 100%|██████████\n" ] } ], @@ -825,7 +929,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -834,7 +938,7 @@ "0.76" ] }, - "execution_count": 23, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -863,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -882,7 +986,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -898,14 +1002,14 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:17 | Progress: 100%|██████████\n" + "Elapsed: 00:22 | Progress: 100%|██████████\n" ] }, { @@ -914,7 +1018,7 @@ "array([0, 0, 0, 1, 0])" ] }, - "execution_count": 26, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -941,7 +1045,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.10" } }, "nbformat": 4, From bacc0eaaa587e7300de518ec45bb015f919ff7bd Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 5 Jun 2020 16:13:36 +0200 Subject: [PATCH 035/171] Fix CSV filename format. --- docs/advanced_usage/csv.md | 8 +-- greenguard/demo.py | 2 +- greenguard/loaders/csv.py | 2 +- notebooks/2. Extract Readings.ipynb | 76 ++++++++++++++--------------- 4 files changed, 44 insertions(+), 44 deletions(-) diff --git a/docs/advanced_usage/csv.md b/docs/advanced_usage/csv.md index c020832..c267807 100644 --- a/docs/advanced_usage/csv.md +++ b/docs/advanced_usage/csv.md @@ -26,10 +26,10 @@ following structure: * `readings/T001` * `readings/T002` * ... -* Inside each turbine folder one CSV file exists for each month, named `%Y-%m-.csv`. - * `readings/T001/2010-01-.csv` - * `readings/T001/2010-02-.csv` - * `readings/T001/2010-03-.csv` +* Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`. + * `readings/T001/2010-01.csv` + * `readings/T001/2010-02.csv` + * `readings/T001/2010-03.csv` * ... * Each CSV file contains three columns: * `signal_id`: name or id of the signal. 
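For reference, one file in this layout can be loaded with plain pandas; a
minimal sketch, assuming the example `readings/T001/2010-01.csv` path from
the docs above exists and that the readings use the loader's default
`%m/%d/%y %H:%M:%S` timestamp format:

    import pandas as pd

    # One turbine, one month: signal_id, timestamp and value columns.
    readings = pd.read_csv('readings/T001/2010-01.csv')
    readings['timestamp'] = pd.to_datetime(
        readings['timestamp'], format='%m/%d/%y %H:%M:%S')

    # 600 second (10 minute) averages per signal, mirroring what the
    # loader produces with rule='600s' and aggregation='mean'.
    resampled = (
        readings.set_index('timestamp')
        .groupby('signal_id')['value']
        .resample('600s')
        .mean()
        .reset_index()
    )

The CSVLoader changed below does the same work across all the turbine
folders, and additionally filters the readings down to the windows needed
by the target_times.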
diff --git a/greenguard/demo.py b/greenguard/demo.py index e15f71d..429e0e9 100644 --- a/greenguard/demo.py +++ b/greenguard/demo.py @@ -62,7 +62,7 @@ def generate_raw_readings(output_path='demo'): for month in range(1, 13): month_data = data[data.timestamp.dt.month == month].copy() month_data['timestamp'] = month_data['timestamp'].dt.strftime('%m/%d/%y %H:%M:%S') - month_path = os.path.join(turbine_path, '2013-{:02d}-.csv'.format(month)) + month_path = os.path.join(turbine_path, '2013-{:02d}.csv'.format(month)) LOGGER.info('Generating file %s', month_path) month_data[['signal_id', 'timestamp', 'value']].to_csv(month_path, index=False) diff --git a/greenguard/loaders/csv.py b/greenguard/loaders/csv.py index 6e3729a..97d33ee 100644 --- a/greenguard/loaders/csv.py +++ b/greenguard/loaders/csv.py @@ -37,7 +37,7 @@ class CSVLoader: """ DEFAULT_DATETIME_FMT = '%m/%d/%y %H:%M:%S' - DEFAULT_FILENAME_FMT = '%Y-%m-.csv' + DEFAULT_FILENAME_FMT = '%Y-%m.csv' def __init__(self, readings_path='.', rule=None, aggregation='mean', unstack=False, datetime_fmt=DEFAULT_DATETIME_FMT, filename_fmt=DEFAULT_FILENAME_FMT): diff --git a/notebooks/2. Extract Readings.ipynb b/notebooks/2. Extract Readings.ipynb index f8166a0..d306172 100644 --- a/notebooks/2. Extract Readings.ipynb +++ b/notebooks/2. Extract Readings.ipynb @@ -70,18 +70,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.demo:Generating file readings/T001/2013-01-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-02-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-03-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-04-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-05-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-06-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-07-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-08-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-09-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-10-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-11-.csv\n", - "INFO:greenguard.demo:Generating file readings/T001/2013-12-.csv\n" + "INFO:greenguard.demo:Generating file readings/T001/2013-01.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-02.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-03.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-04.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-05.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-06.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-07.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-08.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-09.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-10.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-11.csv\n", + "INFO:greenguard.demo:Generating file readings/T001/2013-12.csv\n" ] } ], @@ -108,7 +108,7 @@ "source": [ "import pandas as pd\n", "\n", - "readings_sample = pd.read_csv('readings/T001/2013-01-.csv')" + "readings_sample = pd.read_csv('readings/T001/2013-01.csv')" ] }, { @@ -206,10 +206,10 @@ " * `readings/T001`\n", " * `readings/T002`\n", " * ...\n", - "* Inside each turbine folder one CSV file exists for each month, named `%Y-%m-.csv`.\n", - " * `readings/T001/2010-01-.csv`\n", - " * `readings/T001/2010-02-.csv`\n", - " * 
`readings/T001/2010-03-.csv`\n", + "* Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`.\n", + " * `readings/T001/2010-01.csv`\n", + " * `readings/T001/2010-02.csv`\n", + " * `readings/T001/2010-03.csv`\n", " * ...\n", "* Each CSV file contains three columns:\n", " * `signal_id`: name or id of the signal.\n", @@ -763,30 +763,30 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", + "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", "INFO:greenguard.loaders.csv:115979 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", - "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", "INFO:greenguard.loaders.csv:Loaded 55250 readings from turbine T001\n", "INFO:greenguard.loaders.csv:Loaded 55250 turbine readings\n", "INFO:greenguard.targets:Dropped 12 targets without enough data. 
Final target_times size: 341\n" @@ -961,28 +961,28 @@ "text": [ "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", + "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", "INFO:greenguard.loaders.csv:115979 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", + "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", + "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", + "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", "INFO:greenguard.loaders.csv:Loaded 2125 readings from turbine T001\n", "INFO:greenguard.loaders.csv:Loaded 2125 turbine readings\n", "INFO:greenguard.targets:Dropped 12 targets without enough data. Final target_times size: 341\n" @@ -1366,7 +1366,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.8" + "version": "3.6.10" } }, "nbformat": 4, From bf6e56fab343d3dcebb83d3882b3f2e3016dbc14 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 5 Jun 2020 18:37:26 +0200 Subject: [PATCH 036/171] Update links. --- README.md | 4 ++-- notebooks/2. Extract Readings.ipynb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index bef0983..7bb64d4 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ pip install greenguard This will pull and install the latest stable release from [PyPi](https://pypi.org/). If you want to install from source or contribute to the project please read the -[Contributing Guide](https://d3-ai.github.io/GreenGuard/contributing.html#get-started). +[Contributing Guide](https://signals-dev.github.io/GreenGuard/contributing.html#get-started). 
## Docker usage @@ -153,7 +153,7 @@ A part from the in-memory data format explained above, which is limited by the m allocation capabilities of the system where it is run, **GreenGuard** is also prepared to load and work with data stored as a collection of CSV files, drastically increasing the amount of data which it can work with. Further details about this format can be found in the -[project documentation site](https://d3-ai.github.io/GreenGuard/advanced_usage/csv.html). +[project documentation site](https://signals-dev.github.io/GreenGuard/advanced_usage/csv.html). # Quickstart diff --git a/notebooks/2. Extract Readings.ipynb b/notebooks/2. Extract Readings.ipynb index d306172..db55927 100644 --- a/notebooks/2. Extract Readings.ipynb +++ b/notebooks/2. Extract Readings.ipynb @@ -10,7 +10,7 @@ "that contains readings in the raw CSV format.\n", "\n", "The Raw CSV format es briefly explained below, but more details can be found in [the documentation site](\n", - "https://d3-ai.github.io/GreenGuard/advanced_usage/csv.html)/n", + "https://signals-dev.github.io/GreenGuard/advanced_usage/csv.html)/n", "\n", "In this notebook we will:\n", "\n", From b6ca7c22a3a93f285916cf0f9e822555ef86f571 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 8 Jun 2020 18:09:57 +0200 Subject: [PATCH 037/171] PR review and comments. --- greenguard/pipeline.py | 194 ++++++++------------ notebooks/1. GreenGuard Quickstart.ipynb | 224 ++++++++++++++--------- 2 files changed, 213 insertions(+), 205 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index f81ae99..228c68c 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -158,8 +158,8 @@ def _get_init_params(self, template_name): if self._init_params is None: return {} - elif any(name in self._init_params for name in list(self.template_names.keys())): - return self._init_params.get(template_name) + elif any(name in self._init_params for name in list(self.templates.keys())): + return self._init_params.get(template_name) or {} return self._init_params @@ -167,8 +167,8 @@ def _set_hyperparameters(self, new_hyperparameters): self._hyperparameters = deepcopy(new_hyperparameters) def _set_template(self, template_name): - self.template_name = deepcopy(template_name) - self.template = self.templates[self.template_name] + self.template_name = template_name + self.template = self._template_dicts[self.template_name] def _get_preprocessing(self, template_name): if isinstance(self._preprocessing, int): @@ -189,25 +189,8 @@ def _update_params(old, new): for param, value in params.items(): block_params[param] = value - def _build_pipeline(self): - template_params = self.template.setdefault('init_params', dict()) - for name, params in list(template_params.items()): - if '#' not in name: - template_params[name + '#1'] = template_params.pop(name) - - init_params = self._get_init_params(self.template_name) - if init_params: - self._update_params(template_params, init_params) - - self._pipeline = MLPipeline(self.template) - - if self._hyperparameters: - self._pipeline.set_hyperparameters(self._hyperparameters) - - self.fitted = False - - @staticmethod - def _count_static_steps(pipeline): + def _count_static_steps(self, template_name): + pipeline = MLPipeline(self._template_dicts.get(template_name)) tunable_hyperparams = pipeline.get_tunable_hyperparameters() for index, block_name in enumerate(pipeline.blocks.keys()): if tunable_hyperparams[block_name]: @@ -215,14 +198,9 @@ def _count_static_steps(pipeline): return 
0 - @staticmethod - def _get_templates(template): - if not isinstance(template, list): - templates = [template] - else: - templates = template - - templates_dict = dict() + def _get_templates(self, templates): + template_dicts = dict() + template_names = list() for template in templates: if isinstance(template, str): template_name = template @@ -230,59 +208,70 @@ def _get_templates(template): else: template_name = md5(json.dumps(template)).digest() - templates_dict[template_name] = template + init_params = self._init_params.get(template_name, self._default_init_params) + self._update_params(template['init_params'], init_params) + template_dicts[template_name] = template + template_names.append(template_name) - return templates_dict + return template_names, template_dicts - def __init__(self, template, metric='accuracy', cost=False, init_params=None, stratify=True, - cv_splits=5, shuffle=True, random_state=0, preprocessing=0): + def _generate_init_params(self, init_params): + if not init_params: + self._init_params = {} + elif isinstance(init_params, list): + self._init_params = dict(zip(self._template_names, init_params)) + elif any(name in init_params for name in self._template_names): + self._init_params = init_params - self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state) + def _generate_preprocessing(self, preprocessing): + if isinstance(preprocessing, int): + self._preprocessing = {name: preprocessing for name in self._template_names} + else: + if isinstance(preprocessing, list): + preprocessing = dict(zip(self._temlpate_names, preprocessing)) + + self._preprocessing = { + name: preprocessing.get(name, 0) + for name in self._template_names + } + + def _build_pipeline(self): + self._pipeline = MLPipeline(self.template) + + if self._hyperparameters: + self._pipeline.set_hyperparameters(self._hyperparameters) + + self.fitted = False + + def __init__(self, templates, metric='accuracy', cost=False, init_params=None, stratify=True, + cv_splits=5, shuffle=True, random_state=0, preprocessing=0): if isinstance(metric, str): metric, cost = METRICS[metric] self._metric = metric self._cost = cost - self._init_params = init_params - self._preprocessing = preprocessing - + self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state) self.cv_score = np.inf if cost else -np.inf - self.templates = self._get_templates(template) - self._set_template(list(self.templates.keys())[0]) - - self._hyperparameters = dict() - self._build_pipeline() - - _static = self._count_static_steps(self._pipeline) - _preprocessing = self._get_preprocessing(self.template_name) + if not isinstance(templates, list): + templates = [templates] - if _preprocessing and (_preprocessing > _static): - raise ValueError('Preprocessing cannot be bigger than static') + self._default_init_params = {} + self._generate_init_params(init_params) - self.iterations = 0 + self.templates = templates + self._template_names, self._template_dicts = self._get_templates(templates) - def __repr__(self): - steps = self._pipeline.primitives.copy() - preprocessing = self._get_preprocessing(self.template_name) - static = self._count_static_steps() + self._generate_preprocessing(preprocessing) + self._static = { + name: self._count_static_steps(name) + for name in self._template_names + } - preprocessing_steps = steps[:preprocessing] - static_steps = steps[preprocessing:static] - tunable_steps = steps[static:] - - return ( - "GreenGuardPipeline({})\n" - " preprocessing:\n{}\n" - " static:\n{}\n" - " tunable:\n{}\n" - ).format( - 
self.template_name, - '\n'.join(' {}'.format(step) for step in preprocessing_steps), - '\n'.join(' {}'.format(step) for step in static_steps), - '\n'.join(' {}'.format(step) for step in tunable_steps), - ) + self._set_template(self._template_names[0]) + self._hyperparameters = dict() + self._build_pipeline() def get_hyperparameters(self): """Get the current hyperparameters. @@ -300,15 +289,14 @@ def _is_better(self, score): return score > self.cv_score def _generate_splits(self, template_name, target_times, readings, turbines=None): - template = self.templates.get(template_name) + template = self._template_dicts.get(template_name) + pipeline = MLPipeline(template) + preprocessing = self._preprocessing.get(template_name) + static = self._static.get(template_name) X = target_times[['turbine_id', 'cutoff_time']] y = target_times['target'] - pipeline = MLPipeline(template) - preprocessing = self._get_preprocessing(template_name) - static = self._count_static_steps(pipeline) - if preprocessing: if preprocessing > static: @@ -354,16 +342,13 @@ def _cross_validate(self, template_splits, hyperparams): LOGGER.debug('Fold fold %s score: %s', fold, score) scores.append(score) - cv_score = np.mean(scores) - - return cv_score + return np.mean(scores) def _make_btb_scorer(self, target_times, readings, turbines): splits = {} def scorer(template_name, config): - template_splits = splits.get(template_name) if template_splits is None: template_splits = self._generate_splits( @@ -372,11 +357,16 @@ def scorer(template_name, config): splits[template_name] = template_splits cv_score = self._cross_validate(template_splits, config) - if self._is_better(cv_score): + _config = '\n'.join(' {}: {}'.format(n, v) for n, v in config.items()) + LOGGER.info(('New configuration found:\n' + ' Template: %s \n' + ' Hyperparameters: \n' + '%s'), template_name, _config) + self.cv_score = cv_score self._set_template(template_name) - self._set_hyperparameters(deepcopy(config)) + self._set_hyperparameters(config) self._build_pipeline() return cv_score @@ -416,7 +406,6 @@ def cross_validate(self, target_times, readings, turbines, Computed cross validation score. This score is the average of the scores obtained accross all the cross validation folds. 
""" - if not template_name: template_name = self.template_name if hyperparams is None: @@ -428,37 +417,18 @@ def cross_validate(self, target_times, readings, turbines, template_splits = self._generate_splits(template_name, target_times, readings, turbines) return self._cross_validate(template_splits, hyperparams) - @staticmethod - def _parse_params(param_details): - param_type = param_details['type'] - param_details['type'] = 'str' if param_type == 'string' else param_type - - if param_details['type'] == 'bool': - param_details['range'] = [True, False] - else: - param_details['range'] = param_details.get('range') or param_details.get('values') - - if 'default' not in param_details: - param_details['default'] = param_details['range'][0] - - return param_details - @classmethod - def _get_tunables(cls, templates): - pipelines = {name: MLPipeline(template) for name, template in templates.items()} + def _get_tunables(cls, template_dicts): tunables = {} - - for pipeline_name, pipeline in pipelines.items(): - pipeline_tunables = {} - for name, param_details in pipeline.get_tunable_hyperparameters(flat=True).items(): - pipeline_tunables[name] = cls._parse_params(param_details) - - tunables[pipeline_name] = Tunable.from_dict(pipeline_tunables) + for name, template in template_dicts.items(): + pipeline = MLPipeline(template) + pipeline_tunables = pipeline.get_tunable_hyperparameters(flat=True) + tunables[name] = Tunable.from_dict(pipeline_tunables) return tunables - def tune(self, target_times, readings, turbines=None, iterations=10): - """Tune this pipeline for the indicated number of iterations. + def tune(self, target_times, readings, turbines=None): + """Create a tuning session object that tunes and selects the templates. Args: target_times (pandas.DataFrame): @@ -471,16 +441,10 @@ def tune(self, target_times, readings, turbines=None, iterations=10): turbines (pandas.DataFrame): ``turbines`` table. Only needed if the splits have not been previously computed. - iterations (int): - Number of iterations to perform. """ scoring_function = self._make_btb_scorer(target_times, readings, turbines) - tunables = self._get_tunables(self.templates) - session = BTBSession(tunables, scoring_function, maximize=not self._cost) - if iterations: - session.run(iterations) - - return session + tunables = self._get_tunables(self._template_dicts) + return BTBSession(tunables, scoring_function, maximize=not self._cost) def fit(self, target_times, readings, turbines=None): """Fit this pipeline to the given data. diff --git a/notebooks/1. GreenGuard Quickstart.ipynb b/notebooks/1. GreenGuard Quickstart.ipynb index be4b2f6..5415d8a 100644 --- a/notebooks/1. GreenGuard Quickstart.ipynb +++ b/notebooks/1. GreenGuard Quickstart.ipynb @@ -560,16 +560,30 @@ "This method will return a `BTBSession` session that will:\n", "- Select and tune templates.\n", "- If a template or hyperparameters that get a higher score than the previous one is found, automatically update our pipeline so that it uses that template with those hyperparameters.\n", - "- Remove templates that don't work with the given data and focus on tuning only the ones that do.\n", - "\n", - "Also, if we specify `iterations` the `tune` method will return a session that has already perfromed tuning\n", - "for the given amount of iterations:" + "- Remove templates that don't work with the given data and focus on tuning only the ones that do." 
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, + "outputs": [], + "source": [ + "session = pipeline.tune(target_times, readings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once we have our `session` we can call it's method `run` with the amount of\n", + "tuning iterations that we want to perform:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -583,21 +597,30 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 01:46 | Progress: 100%|██████████\n", - "Elapsed: 00:43 | Progress: 100%|██████████\n", + "Elapsed: 01:40 | Progress: 100%|██████████\n", + "Elapsed: 00:53 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:57 | Progress: 100%|██████████\n", - "Elapsed: 00:27 | Progress: 100%|██████████\n", + "Elapsed: 01:37 | Progress: 100%|██████████\n", + "Elapsed: 00:45 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:54 | Progress: 100%|██████████\n", - "Elapsed: 00:24 | Progress: 100%|██████████\n" + "Elapsed: 01:52 | Progress: 100%|██████████\n", + "Elapsed: 00:58 | Progress: 100%|██████████\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.605187908496732\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6013257575757575\n", "INFO:btb.session:Obtaining default configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" ] }, @@ -606,14 +629,14 @@ "output_type": "stream", "text": [ "Built 99 features\n", - "Elapsed: 03:44 | Progress: 100%|██████████\n", - "Elapsed: 01:11 | Progress: 100%|██████████\n", + "Elapsed: 05:21 | Progress: 100%|██████████\n", + "Elapsed: 02:31 | Progress: 100%|██████████\n", "Built 99 features\n", - "Elapsed: 02:24 | Progress: 100%|██████████\n", - "Elapsed: 01:10 | Progress: 100%|██████████\n", + "Elapsed: 02:37 | Progress: 100%|██████████\n", + "Elapsed: 01:14 | Progress: 100%|██████████\n", "Built 99 features\n", - "Elapsed: 02:55 | Progress: 100%|██████████\n", - "Elapsed: 02:15 | Progress: 100%|██████████\n" + "Elapsed: 02:17 | Progress: 100%|██████████\n", + "Elapsed: 01:10 | Progress: 100%|██████████\n" ] }, { @@ -621,48 +644,73 @@ "output_type": "stream", "text": [ "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6074772975193733\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 76\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 663\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 
0.6981330874338336\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6636363636363637\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n" ] + }, + { + "data": { + "text/plain": [ + "{'id': '73d39187c5c6ecfac03b05d407bf709d',\n", + " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 76,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 663,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.6636363636363637}" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "session = pipeline.tune(target_times, readings, iterations=5)" + "session.run(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After the tuning process has finished, the template and the hyperparameters \n", - "that have obtained the best score have been already set in the classifier.\n", - "\n", - "We can see the `best_proposal` that contains the tempalte name, hyperparameters\n", - "and score by accessing the `session.best_proposal`:" + "When this is done, the `best_proposal` will be printed out. We can access it anytime\n", + "using `session.best_proposal`:" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': 'c18f45d5e3bc2e41b3b3456b24d34add',\n", + "{'id': '73d39187c5c6ecfac03b05d407bf709d',\n", " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 82,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 940,\n", + " 'max_labels'): 76,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 663,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.5949116894971435,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.14299079052852726,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9},\n", - " 'score': 0.6074772975193733}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.6636363636363637}" ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -680,22 +728,22 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 82,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 940,\n", + " 'max_labels'): 76,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 663,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.5949116894971435,\n", - " ('xgboost.XGBClassifier#1', 
'gamma'): 0.14299079052852726,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9}" + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" ] }, - "execution_count": 17, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -713,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -722,7 +770,7 @@ "'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'" ] }, - "execution_count": 18, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -741,16 +789,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6074772975193733" + "0.6636363636363637" ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -770,7 +818,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -779,36 +827,42 @@ "text": [ "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6215756372962148\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6228241559394411\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:New optimal found: resample_600s_normalize_dfs_1d_xgb_classifier - 0.6310483870967741\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6421858959172391\n" + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 17\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 880\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3214711402471415\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9330408960929772\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6807110281923715\n" ] }, { "data": { "text/plain": [ - "{'id': '597e7123769b671e0f0c964311ebc005',\n", + "{'id': 
'f47187d007ea31262e087264580716c9',\n", " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 5,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.8912106438743266,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.022878268134643553,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.6421858959172391}" + " 'max_labels'): 17,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 880,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3214711402471415,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9330408960929772,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.6807110281923715}" ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -819,16 +873,16 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6421858959172391" + "0.6807110281923715" ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -839,22 +893,22 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 5,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 119,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.8912106438743266,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.022878268134643553,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + " 'max_labels'): 17,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 880,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3214711402471415,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9330408960929772,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -878,7 +932,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -886,7 +940,7 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 02:08 | Progress: 100%|██████████\n" + "Elapsed: 00:55 | Progress: 100%|██████████\n" ] } ], @@ -905,14 +959,14 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:24 | Progress: 100%|██████████\n" + "Elapsed: 00:17 | Progress: 100%|██████████\n" ] } ], @@ -929,7 +983,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 26, "metadata": {}, "outputs": [ { @@ -938,7 +992,7 @@ "0.76" ] }, - "execution_count": 25, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -967,7 +1021,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -986,7 +1040,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1002,25 +1056,15 @@ }, { "cell_type": "code", - 
"execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:22 | Progress: 100%|██████████\n" + "Elapsed: 00:01 | Progress: 11%|█ " ] - }, - { - "data": { - "text/plain": [ - "array([0, 0, 0, 1, 0])" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ From fc06af57fde8dbe69527406aea9dabfdd2d59feb Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 8 Jun 2020 19:58:42 +0200 Subject: [PATCH 038/171] Fix init params --- greenguard/pipeline.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 228c68c..a8e55d0 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -207,9 +207,6 @@ def _get_templates(self, templates): template = load_pipeline(template_name) else: template_name = md5(json.dumps(template)).digest() - - init_params = self._init_params.get(template_name, self._default_init_params) - self._update_params(template['init_params'], init_params) template_dicts[template_name] = template template_names.append(template_name) @@ -228,7 +225,7 @@ def _generate_preprocessing(self, preprocessing): self._preprocessing = {name: preprocessing for name in self._template_names} else: if isinstance(preprocessing, list): - preprocessing = dict(zip(self._temlpate_names, preprocessing)) + preprocessing = dict(zip(self._template_names, preprocessing)) self._preprocessing = { name: preprocessing.get(name, 0) @@ -257,11 +254,14 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s if not isinstance(templates, list): templates = [templates] + self.templates = templates + self._template_names, self._template_dicts = self._get_templates(templates) self._default_init_params = {} self._generate_init_params(init_params) - self.templates = templates - self._template_names, self._template_dicts = self._get_templates(templates) + for name, template in self._template_dicts.items(): + init_params = self._init_params.get(name, self._default_init_params) + self._update_params(template['init_params'], init_params) self._generate_preprocessing(preprocessing) self._static = { From 5fd72b27454f038aba3fbddfa74a3a41d290e171 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 8 Jun 2020 20:21:42 +0200 Subject: [PATCH 039/171] Lint and code improvement. 
---
 greenguard/pipeline.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index a8e55d0..852343b 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -142,10 +142,6 @@ class GreenGuardPipeline(object):
     _init_params = None
     _preprocessing = None
 
-    @staticmethod
-    def _clone_pipeline(pipeline):
-        return MLPipeline.from_dict(pipeline.to_dict())
-
     def _get_cv(self, stratify, cv_splits, shuffle, random_state):
         if stratify:
             cv_class = StratifiedKFold
@@ -298,7 +294,6 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None)
         y = target_times['target']
 
         if preprocessing:
-
             if preprocessing > static:
                 raise ValueError('Preprocessing cannot be bigger than static')
 
@@ -345,7 +340,6 @@ def _cross_validate(self, template_splits, hyperparams):
         return np.mean(scores)
 
     def _make_btb_scorer(self, target_times, readings, turbines):
-
         splits = {}
 
         def scorer(template_name, config):

From e132aac3f9321ea462eeddb05c5339473e0ef3db Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Mon, 8 Jun 2020 20:40:38 +0200
Subject: [PATCH 040/171] Fix test.

---
 greenguard/pipeline.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index 852343b..efcfa44 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -257,7 +257,8 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s
 
         for name, template in self._template_dicts.items():
             init_params = self._init_params.get(name, self._default_init_params)
-            self._update_params(template['init_params'], init_params)
+            template_params = template.setdefault('init_params', {})
+            self._update_params(template_params, init_params)
 
From a0040d43623051341d83d86b8faa8e5b8a17cd87 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 9 Jun 2020 14:08:51 +0200
Subject: [PATCH 041/171] Remove unnecessary code

---
 greenguard/pipeline.py | 29 ++---------------------------
 1 file changed, 2 insertions(+), 27 deletions(-)

diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index efcfa44..8bcb218 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -150,15 +150,6 @@ def _get_cv(self, stratify, cv_splits, shuffle, random_state):
 
         return cv_class(n_splits=cv_splits, shuffle=shuffle, random_state=random_state)
 
-    def _get_init_params(self, template_name):
-        if self._init_params is None:
-            return {}
-
-        elif any(name in self._init_params for name in list(self.templates.keys())):
-            return self._init_params.get(template_name) or {}
-
-        return self._init_params
-
     def _set_hyperparameters(self, new_hyperparameters):
         self._hyperparameters = deepcopy(new_hyperparameters)
 
@@ -166,15 +157,6 @@ def _set_template(self, template_name):
         self.template_name = template_name
         self.template = self._template_dicts[self.template_name]
 
-    def _get_preprocessing(self, template_name):
-        if isinstance(self._preprocessing, int):
-            return self._preprocessing
-
-        if isinstance(self._preprocessing, dict):
-            return self._preprocessing.get(template_name) or 0
-
-        return 0  # by default
-
     @staticmethod
     def _update_params(old, new):
         for name, params in new.items():
@@ -185,8 +167,7 @@ def _update_params(old, new):
             for param, value in params.items():
                 block_params[param] = value
 
-    def _count_static_steps(self, template_name):
-        pipeline = MLPipeline(self._template_dicts.get(template_name))
+    def _count_static_steps(self, pipeline):
tunable_hyperparams = pipeline.get_tunable_hyperparameters() for index, block_name in enumerate(pipeline.blocks.keys()): if tunable_hyperparams[block_name]: @@ -261,11 +242,6 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s self._update_params(template_params, init_params) self._generate_preprocessing(preprocessing) - self._static = { - name: self._count_static_steps(name) - for name in self._template_names - } - self._set_template(self._template_names[0]) self._hyperparameters = dict() self._build_pipeline() @@ -289,8 +265,7 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) template = self._template_dicts.get(template_name) pipeline = MLPipeline(template) preprocessing = self._preprocessing.get(template_name) - static = self._static.get(template_name) - + static = self._count_static_steps(pipeline) X = target_times[['turbine_id', 'cutoff_time']] y = target_times['target'] From 23c77e6a753deb44ebfd612b8cbf182426f64dca Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 15 Jun 2020 15:27:23 +0200 Subject: [PATCH 042/171] Update docstrings for the release v0.2.1 --- docs/advanced_usage/concepts.md | 15 +- greenguard/pipeline.py | 19 ++- notebooks/1. GreenGuard Quickstart.ipynb | 200 +++++++++++++---------- 3 files changed, 138 insertions(+), 96 deletions(-) diff --git a/docs/advanced_usage/concepts.md b/docs/advanced_usage/concepts.md index 302d34a..16c6f97 100644 --- a/docs/advanced_usage/concepts.md +++ b/docs/advanced_usage/concepts.md @@ -47,10 +47,23 @@ which hyperparameters are more likely to get the best results in the next iterat We call each one of these evaluations a **tuning iteration**. +## Tuning Session + +We call tuning session to the [BTBSession]( +https://hdi-project.github.io/BTB/tutorials/03_Session.html) instance generated for a given +collection of templates and data. This tuning session searches for the best solution for the +tuning problem by performing tuning and selection over the given templates, evaluating wich +template to try next according to their previous score using a [Multi-armed Bandit]( +https://en.wikipedia.org/wiki/Multi-armed_bandit) aproach. + +The tuning session is in charge of discarding the templates that are not useful, updating the +best template to be used and it's hyperparameters that have generated the best score for the +given data. + ## GreenGuardPipeline This class is the one in charge of loading the **MLBlocks Pipelines** configured in the system and use them to learn from the data and make predictions. -This class is also responsible for tuning the pipeline hyperparameters using [BTB]( +This class is also responsible for creating the tuning session with [BTB]( https://hdi-project.github.io/BTB/) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 8bcb218..f094d77 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -97,18 +97,22 @@ class GreenGuardPipeline(object): the tuning loop. Args: - template (str or MLPipeline): + templates (str, MLPipeline or list): Template to use. If a ``str`` is given, load the corresponding - ``MLPipeline``. + ``MLPipeline``. Also can be a list combining both. metric (str or function): Metric to use. If an ``str`` is give it must be one of the metrics defined in the ``greenguard.metrics.METRICS`` dictionary. cost (bool): Whether the metric is a cost function (the lower the better) or not. Defaults to ``False``. 
- init_params (dict): + init_params (dict or list): Initial parameters to pass to the underlying MLPipeline if something - other than the defaults need to be used. + other than the defaults need to be used. If a single dict is given + it will be used for all the templates. If is a list of dicts, those + will be matched by position with the templates. If a dict that has + as keys the name of the templates and as values a dict with init + params, those will be used for each corresponding template. Defaults to ``None``. stratify (bool): Whether to stratify the data when partitioning for cross validation. @@ -121,10 +125,13 @@ class GreenGuardPipeline(object): random_state (int or RandomState): random state to use for the cross validation partitioning. Defaults to ``0``. - preprocessing (int): + preprocessing (int, dict or list): Number of steps to execute during the preprocessing stage. The number of preprocessing steps cannot be higher than the - number of static steps in the given template. + number of static steps in the given template. If is a list of ints, + those will be matched by position with the templates. If a dict that + has as keys the name of the templates and as values a int those will be + used for each corresponding template. Defaults to ``0``. """ diff --git a/notebooks/1. GreenGuard Quickstart.ipynb b/notebooks/1. GreenGuard Quickstart.ipynb index 5415d8a..4fd08b8 100644 --- a/notebooks/1. GreenGuard Quickstart.ipynb +++ b/notebooks/1. GreenGuard Quickstart.ipynb @@ -14,9 +14,9 @@ "This notebook shows how to use GreenGuard to:\n", "\n", "- Load demo data\n", - "- Find available pipelines and load one as a template\n", - "- Tune the template arguments to generate the optimal pipeline\n", - "- Fit the pipeline to our data\n", + "- Find available pipelines and load two of them as templates\n", + "- Tune the templates to find the best template for the given data and its hyperparameters\n", + "- Fit the found pipeline to our data\n", "- Make predictions using the pipeline\n", "- Evaluate the goodness-of-fit" ] @@ -393,10 +393,10 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Finding a Template\n", + "## 3. 
Finding the Templates\n", "\n", - "The next step will be to select a template from the ones available in\n", - "GreenGuard.\n", + "The next step will be to select a collection of templates from the ones\n", + "available in GreenGuard.\n", "\n", "For this, we can use the `greenguard.get_pipelines` function, which will\n", "return us the list of all the available MLBlocks pipelines found in the\n", @@ -476,9 +476,9 @@ { "data": { "text/plain": [ - "{'resample_600s_unstack_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json'}" + "{'resample_600s_unstack_dfs_1d_xgb_classifier': '/home/pacho/Projects/mit/GreenGuard/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_normalize_dfs_1d_xgb_classifier': '/home/pacho/Projects/mit/GreenGuard/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/home/pacho/Projects/mit/GreenGuard/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json'}" ] }, "execution_count": 12, @@ -494,8 +494,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "For the rest of this tutorial, we will select and use the pipeline\n", - "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` and `resample_600s_normalize_dfs_1d_xgb_classifier`.\n", + "For the rest of this tutorial, we will select and use the templates\n", + "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` and\n", + "`resample_600s_normalize_dfs_1d_xgb_classifier`.\n", "\n", "The `resample_600s_unstack_normalize_dfs_1d_xgb_classifier` template contains the following steps:\n", "\n", @@ -525,11 +526,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Finding the best Pipeline\n", + "## 4. 
Finding the best Pipeline\n", "\n", "Once we have loaded the data, we create a **GreenGuardPipeline** instance by passing:\n", "\n", - "* `template (string)`: the name of a template or the path to a template json file.\n", + "* `templates (string or list)`: the name of a template, the path to a template json file or\n", + "a list that can combine both of them.\n", "* `metric (string or function)`: The name of the metric to use or a metric function to use.\n", "* `cost (bool)`: Whether the metric is a cost function to be minimized or a score to be maximized.\n", "\n", @@ -597,14 +599,14 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 01:40 | Progress: 100%|██████████\n", - "Elapsed: 00:53 | Progress: 100%|██████████\n", + "Elapsed: 00:41 | Progress: 100%|██████████\n", + "Elapsed: 00:18 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 01:37 | Progress: 100%|██████████\n", - "Elapsed: 00:45 | Progress: 100%|██████████\n", + "Elapsed: 00:37 | Progress: 100%|██████████\n", + "Elapsed: 00:18 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 01:52 | Progress: 100%|██████████\n", - "Elapsed: 00:58 | Progress: 100%|██████████\n" + "Elapsed: 00:37 | Progress: 100%|██████████\n", + "Elapsed: 00:18 | Progress: 100%|██████████\n" ] }, { @@ -620,7 +622,7 @@ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6013257575757575\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6079987550575785\n", "INFO:btb.session:Obtaining default configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" ] }, @@ -629,13 +631,13 @@ "output_type": "stream", "text": [ "Built 99 features\n", - "Elapsed: 05:21 | Progress: 100%|██████████\n", - "Elapsed: 02:31 | Progress: 100%|██████████\n", + "Elapsed: 02:06 | Progress: 100%|██████████\n", + "Elapsed: 01:02 | Progress: 100%|██████████\n", "Built 99 features\n", - "Elapsed: 02:37 | Progress: 100%|██████████\n", - "Elapsed: 01:14 | Progress: 100%|██████████\n", + "Elapsed: 01:53 | Progress: 100%|██████████\n", + "Elapsed: 00:54 | Progress: 100%|██████████\n", "Built 99 features\n", - "Elapsed: 02:17 | Progress: 100%|██████████\n", + "Elapsed: 01:55 | Progress: 100%|██████████\n", "Elapsed: 01:10 | Progress: 100%|██████████\n" ] }, @@ -647,30 +649,40 @@ "INFO:greenguard.pipeline:New configuration found:\n", " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 76\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 663\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 9\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 28\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6636363636363637\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3977560491030686\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.19143248884807773\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 8\n", + 
"INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6418782052584869\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n" + "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 14\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 18\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6671775409915827\n" ] }, { "data": { "text/plain": [ - "{'id': '73d39187c5c6ecfac03b05d407bf709d',\n", + "{'id': '2a494af25e2d986c9178fd47820d4b00',\n", " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 76,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 663,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.6636363636363637}" + " 'max_labels'): 14,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 18,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9},\n", + " 'score': 0.6671775409915827}" ] }, "execution_count": 16, @@ -698,16 +710,16 @@ { "data": { "text/plain": [ - "{'id': '73d39187c5c6ecfac03b05d407bf709d',\n", + "{'id': '2a494af25e2d986c9178fd47820d4b00',\n", " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 76,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 663,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.6636363636363637}" + " 'max_labels'): 14,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 18,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9},\n", + " 'score': 0.6671775409915827}" ] }, "execution_count": 17, @@ -723,7 +735,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "You can check that the new hyperparameters are already set by callgin `get_hyperparameters` method: " + "You can check that the new hyperparameters are already set by calling `get_hyperparameters` method: 
" ] }, { @@ -735,12 +747,12 @@ "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 76,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 663,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6981330874338336,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.42260412740973985,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" + " 'max_labels'): 14,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 18,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9}" ] }, "execution_count": 18, @@ -756,7 +768,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can check the template name that is used to generate the pipeline:" + "We can also check the template name that is used to generate the pipeline:" ] }, { @@ -795,7 +807,7 @@ { "data": { "text/plain": [ - "0.6636363636363637" + "0.6671775409915827" ] }, "execution_count": 20, @@ -812,7 +824,7 @@ "metadata": {}, "source": [ "**NOTE**: If the score is not good enough, we can call the `run` method of the `session` again,\n", - "specifying the amount of iterations, and this will continue its tuning process continuing from\n", + "specifying the amount of iterations, and this will resume its tuning process continuing from\n", "the previous results!" ] }, @@ -827,39 +839,39 @@ "text": [ "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 99\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 143\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 9\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.06337107325877978\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.932864412690726\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", + "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6854149434794596\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", - " 
Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 17\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 880\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3214711402471415\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9330408960929772\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6807110281923715\n" + "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" ] }, { "data": { "text/plain": [ - "{'id': 'f47187d007ea31262e087264580716c9',\n", + "{'id': '9999fcb9fdc53cf7bf8f1398cea07fab',\n", " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 17,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 880,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3214711402471415,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9330408960929772,\n", + " 'max_labels'): 99,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 143,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.06337107325877978,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.932864412690726,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.6807110281923715}" + " 'score': 0.6854149434794596}" ] }, "execution_count": 21, @@ -879,7 +891,7 @@ { "data": { "text/plain": [ - "0.6807110281923715" + "0.6854149434794596" ] }, "execution_count": 22, @@ -900,11 +912,11 @@ "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 17,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 880,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3214711402471415,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9330408960929772,\n", + " 'max_labels'): 99,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 143,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.06337107325877978,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.932864412690726,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" ] }, @@ -921,7 +933,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 4. Fitting the pipeline\n", + "## 5. Fitting the pipeline\n", "\n", "Once we are satisfied with the obtained cross validation score, we can proceed to call\n", "the `fit` method passing again the same data elements.\n", @@ -940,7 +952,7 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:55 | Progress: 100%|██████████\n" + "Elapsed: 00:48 | Progress: 100%|██████████\n" ] } ], @@ -952,7 +964,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 5. Use the fitted pipeline\n", + "## 6. Use the fitted pipeline\n", "\n", "After fitting the pipeline, we are ready to make predictions on new data:" ] @@ -989,7 +1001,7 @@ { "data": { "text/plain": [ - "0.76" + "0.7346938775510203" ] }, "execution_count": 26, @@ -1007,7 +1019,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 6. Save and load the pipeline\n", + "## 7. 
Save and load the pipeline\n", "\n", "Since the tuning and fitting process takes time to execute and requires a lot of data, you\n", "will probably want to save a fitted instance and load it later to analyze new signals\n", @@ -1056,15 +1068,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:01 | Progress: 11%|█ " + "Elapsed: 00:19 | Progress: 100%|██████████\n" ] + }, + { + "data": { + "text/plain": [ + "array([0, 0, 0, 1, 0])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ From 0b9c4dec5193f402604715a09f669a29e3b1fe1d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Mon, 15 Jun 2020 16:26:50 +0200 Subject: [PATCH 043/171] Improve phrasing --- docs/advanced_usage/concepts.md | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/docs/advanced_usage/concepts.md b/docs/advanced_usage/concepts.md index 16c6f97..f39bffa 100644 --- a/docs/advanced_usage/concepts.md +++ b/docs/advanced_usage/concepts.md @@ -38,8 +38,8 @@ https://hdi-project.github.io/MLBlocks), or using the **GreenGuardPipeline**. ## Tuning -We call tuning the process of, given a dataset and a template, finding the pipeline derived from -the template that gets the best possible score on the dataset. +We call tuning the process of, given a dataset and a collection of templates, finding the pipeline +derived from the templates that gets the best possible score on the dataset. This process usually involves fitting and evaluating multiple pipelines with different hyperparameter configurations on the same data while using optimization algorithms to deduce @@ -47,23 +47,15 @@ which hyperparameters are more likely to get the best results in the next iterat We call each one of these evaluations a **tuning iteration**. -## Tuning Session - -We call tuning session to the [BTBSession]( -https://hdi-project.github.io/BTB/tutorials/03_Session.html) instance generated for a given -collection of templates and data. This tuning session searches for the best solution for the -tuning problem by performing tuning and selection over the given templates, evaluating wich -template to try next according to their previous score using a [Multi-armed Bandit]( -https://en.wikipedia.org/wiki/Multi-armed_bandit) aproach. - -The tuning session is in charge of discarding the templates that are not useful, updating the -best template to be used and it's hyperparameters that have generated the best score for the -given data. +The process of selecting and tuning the templates is handled by a [BTBSession]( +https://hdi-project.github.io/BTB/tutorials/03_Session.html), which is responsible for +discarding the templates that do not work on the given data and for keeping +track of the template and hyperparameters that obtain the best performance. ## GreenGuardPipeline This class is the one in charge of loading the **MLBlocks Pipelines** configured in the system and use them to learn from the data and make predictions. -This class is also responsible for creating the tuning session with [BTB]( -https://hdi-project.github.io/BTB/) +This class is also responsible for creating the BTBSession that will handle the +selection and tuning of the templates. From 2fe6bf69723f5f8b5a3390ac52ab8bd23ed370d7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 15 Jun 2020 18:35:04 +0200 Subject: [PATCH 044/171] Anon path. --- notebooks/1. 
GreenGuard Quickstart.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/notebooks/1. GreenGuard Quickstart.ipynb b/notebooks/1. GreenGuard Quickstart.ipynb index 4fd08b8..ec7c0a7 100644 --- a/notebooks/1. GreenGuard Quickstart.ipynb +++ b/notebooks/1. GreenGuard Quickstart.ipynb @@ -476,9 +476,9 @@ { "data": { "text/plain": [ - "{'resample_600s_unstack_dfs_1d_xgb_classifier': '/home/pacho/Projects/mit/GreenGuard/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_normalize_dfs_1d_xgb_classifier': '/home/pacho/Projects/mit/GreenGuard/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/home/pacho/Projects/mit/GreenGuard/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json'}" + "{'resample_600s_unstack_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json'}" ] }, "execution_count": 12, From 76142d3ca35f3b5e39e310074dbde178bb4a282b Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 15 Jun 2020 21:54:11 +0200 Subject: [PATCH 045/171] Improve pipeline docstring --- greenguard/pipeline.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index f094d77..135ebbd 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -107,12 +107,14 @@ class GreenGuardPipeline(object): Whether the metric is a cost function (the lower the better) or not. Defaults to ``False``. init_params (dict or list): - Initial parameters to pass to the underlying MLPipeline if something - other than the defaults need to be used. If a single dict is given - it will be used for all the templates. If is a list of dicts, those - will be matched by position with the templates. If a dict that has - as keys the name of the templates and as values a dict with init - params, those will be used for each corresponding template. + There are three possible values for init_params: + + * Init params ``dict``: It will be used for all templates. + * ``dict`` with the name of the template as a key and dictionary with its + init params. + * ``list``: each value will be assigned to the corresponding position of + self.templates. + Defaults to ``None``. stratify (bool): Whether to stratify the data when partitioning for cross validation. @@ -126,12 +128,14 @@ class GreenGuardPipeline(object): random state to use for the cross validation partitioning. Defaults to ``0``. preprocessing (int, dict or list): - Number of steps to execute during the preprocessing stage. - The number of preprocessing steps cannot be higher than the - number of static steps in the given template. If is a list of ints, - those will be matched by position with the templates. If a dict that - has as keys the name of the templates and as values a int those will be - used for each corresponding template. + There are three possible values for preprocessing: + + * ``int``: the value will be used for all templates. + * ``dict`` with the template name as a key and a number as a value, will + be used for that template. 
+ * ``list``: each value will be assigned to the corresponding position of
+ self.templates.
+
 Defaults to ``0``.
 """

From 58c75f2cb0d39760ef8a5fd0ed5f2b08cd3430ef Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 16 Jun 2020 11:06:11 +0200
Subject: [PATCH 046/171] Add release notes for v0.2.1

---
 HISTORY.md | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 90dec27..c5a9de0 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,30 @@
 # History

+## 0.2.1 - 2020-06-16
+
+This release adds the possibility to specify more than one template when creating a
+GreenGuardPipeline. When its `tune` method is called, a BTBSession instance is returned,
+which is in charge of selecting the templates and tuning their hyperparameters until the
+best possible pipeline is found.
+
+### Internal Improvements
+
+* Resample by filename inside the `CSVLoader` to avoid oversampling of data that will not be used.
+* Select targets now allows them to be equal.
+* Fixed the csv filename format.
+* Upgraded to BTB.
+
+### Bug Fixes
+
+* Issue #33: Wrong default datetime format
+
+### Resolved Issues
+
+* Issue #35: Select targets is too strict
+* Issue #36: resample by filename inside csvloader
+* Issue #39: Upgrade BTB
+* Issue #41: Fix CSV filename format
+
 ## 0.2.0 - 2020-02-14

 First stable release:

From 43e9a3ee13dafa37541be2dbd876252c7996f7ee Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 16 Jun 2020 11:06:15 +0200
Subject: [PATCH 047/171] =?UTF-8?q?Bump=20version:=200.2.1.dev1=20?=
 =?UTF-8?q?=E2=86=92=200.2.1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 greenguard/__init__.py | 2 +-
 setup.cfg | 3 +--
 setup.py | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/greenguard/__init__.py b/greenguard/__init__.py
index df69d4a..db65211 100644
--- a/greenguard/__init__.py
+++ b/greenguard/__init__.py
@@ -4,7 +4,7 @@

 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.1.dev1'
+__version__ = '0.2.1'

 import os

diff --git a/setup.cfg b/setup.cfg
index f6734b1..aa01481 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.1.dev1
+current_version = 0.2.1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))? 
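For reference, the `parse` expression above is what drives the version bumps in this patch: it splits a version string into its numbered parts plus an optional pre-release suffix. Below is a minimal sketch of how it behaves, assuming the standard `bumpversion` group names shown above:

```python
import re

# Same pattern as the setup.cfg `parse` option above.
PATTERN = re.compile(
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
)

# Development versions populate the optional pre-release groups:
print(PATTERN.match('0.2.1.dev1').groupdict())
# {'major': '0', 'minor': '2', 'patch': '1', 'release': 'dev', 'candidate': '1'}

# Stable releases leave them as None:
print(PATTERN.match('0.2.1').groupdict())
# {'major': '0', 'minor': '2', 'patch': '1', 'release': None, 'candidate': None}
```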
@@ -45,4 +45,3 @@ test = pytest [tool:pytest] collect_ignore = ['setup.py'] - diff --git a/setup.py b/setup.py index 89083b4..accc5d1 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.1.dev1', + version='0.2.1', zip_safe=False, ) From dba471652ad7e1833732e7531b91d7cb6698d8bd Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 16 Jun 2020 11:06:54 +0200 Subject: [PATCH 048/171] =?UTF-8?q?Bump=20version:=200.2.1=20=E2=86=92=200?= =?UTF-8?q?.2.2.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index db65211..c9e61c2 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.1' +__version__ = '0.2.2.dev0' import os diff --git a/setup.cfg b/setup.cfg index aa01481..7f91cdd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.1 +current_version = 0.2.2.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index accc5d1..67516b5 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.1', + version='0.2.2.dev0', zip_safe=False, ) From fb289e8e89306fa31371171eb689cb8e1bc3d5db Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 22 Jun 2020 10:52:24 +0200 Subject: [PATCH 049/171] Rearrange docker --- .dockerignore | 2 - Makefile | 64 +++++++------ docker/.dockerignore | 2 + Dockerfile => docker/Dockerfile | 0 docker/Makefile | 45 ++++++++++ DOCKER.md => docker/README.md | 89 ++++++++++++++----- .../docker-compose.yml | 0 docker/greenguard-deployment.yml | 34 +++++++ 8 files changed, 187 insertions(+), 49 deletions(-) delete mode 100644 .dockerignore create mode 100644 docker/.dockerignore rename Dockerfile => docker/Dockerfile (100%) create mode 100644 docker/Makefile rename DOCKER.md => docker/README.md (50%) rename docker-compose.yml => docker/docker-compose.yml (100%) create mode 100644 docker/greenguard-deployment.yml diff --git a/.dockerignore b/.dockerignore deleted file mode 100644 index 7ea8e51..0000000 --- a/.dockerignore +++ /dev/null @@ -1,2 +0,0 @@ -notebooks-private/ -.tox/ diff --git a/Makefile b/Makefile index 7683786..32a5ba3 100644 --- a/Makefile +++ b/Makefile @@ -213,30 +213,40 @@ release-major: check-release bumpversion-major release # DOCKER TARGETS -.PHONY: docker-jupyter-clean -docker-jupyter-clean: ## Remove the greenguard-jupyter docker image - docker rmi -f greenguard-jupyter - -.PHONY: docker-jupyter-build -docker-jupyter-build: ## Build the greenguard-jupyter docker image using repo2docker - docker build -t greenguard-jupyter . 
- -.PHONY: docker-jupyter-save -docker-jupyter-save: docker-jupyter-build ## Build the greenguard-jupyter image and save it as greenguard-jupyter.tar - docker save --output greenguard-jupyter.tar greenguard-jupyter - -.PHONY: docker-jupyter-load -docker-jupyter-load: ## Load the greenguard-jupyter image from greenguard-jupyter.tar - docker load --input greenguard-jupyter.tar - -.PHONY: docker-jupyter-run -docker-jupyter-run: ## Run the greenguard-jupyter image in editable mode - docker run --rm -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard-jupyter greenguard-jupyter - -.PHONY: docker-jupyter-start -docker-jupyter-start: ## Start the greenguard-jupyter image as a daemon - docker run --rm -d -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard-jupyter greenguard-jupyter - -.PHONY: docker-jupyter-stop -docker-jupyter-stop: ## Stop the greenguard-jupyter daemon - docker stop greenguard-jupyter +.PHONY: docker-clean +docker-clean: ## Remove the greenguard docker image + docker rmi -f greenguard + +.PHONY: docker-build +docker-build: + docker build -f docker/Dockerfile -t greenguard . + +.PHONY: docker-save +docker-save: docker-build ## Build the greenguard image and save it as greenguard.tar + docker save --output greenguard.tar greenguard + +.PHONY: docker-load +docker-load: ## Load the greenguard image from greenguard.tar + docker load --input greenguard.tar + +.PHONY: docker-run +docker-run: ## Run the greenguard image in editable mode + docker run --rm -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard greenguard + +.PHONY: docker-start +docker-start: ## Start the greenguard image as a daemon + docker run --rm -d -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard greenguard + +.PHONY: docker-stop +docker-stop: ## Stop the greenguard daemon + docker stop greenguard + +.PHONY: docker-login +docker-login: + docker login + +.PHONY: docker-push +docker-push: docker-login docker-build + @$(eval VERSION := $(shell python -c 'import greenguard; print(greenguard.__version__)')) + docker tag greenguard signals-dev/greenguard:$(VERSION) + docker push signals-dev/greenguard:$(VERSION) diff --git a/docker/.dockerignore b/docker/.dockerignore new file mode 100644 index 0000000..a87abf0 --- /dev/null +++ b/docker/.dockerignore @@ -0,0 +1,2 @@ +../notebooks-private/ +../.tox/ diff --git a/Dockerfile b/docker/Dockerfile similarity index 100% rename from Dockerfile rename to docker/Dockerfile diff --git a/docker/Makefile b/docker/Makefile new file mode 100644 index 0000000..afc04f0 --- /dev/null +++ b/docker/Makefile @@ -0,0 +1,45 @@ +.DEFAULT_GOAL := help + +define BROWSER_PYSCRIPT +import os, webbrowser, sys + +try: + from urllib import pathname2url +except: + from urllib.request import pathname2url + +webbrowser.open("file://" + pathname2url(http://github.com/os.path.abspath(sys.argv[1]))) +endef +export BROWSER_PYSCRIPT + +define PRINT_HELP_PYSCRIPT +import re, sys + +for line in sys.stdin: + match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) + if match: + target, help = match.groups() + print("%-20s %s" % (target, help)) +endef +export PRINT_HELP_PYSCRIPT + +BROWSER := python -c "$$BROWSER_PYSCRIPT" + +.PHONY: help +help: + @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) + +# DOCKER TARGET +.PHONY: docker-login +docker-login: + docker login + +.PHONY: docker-build +docker-build: + docker build -t greenguard-jupyter . 
+ +.PHONY: docker-push +docker-push: docker-login docker-build + @$(eval VERSION := $(shell python -c 'import greenguard; print(greenguard.__version__)')) + docker tag greenguard-jupyter signals-dev/greenguard:$(VERSION) + docker push signals-dev/greenguard:$(VERSION) diff --git a/DOCKER.md b/docker/README.md similarity index 50% rename from DOCKER.md rename to docker/README.md index 91d1a28..19c12b3 100644 --- a/DOCKER.md +++ b/docker/README.md @@ -9,36 +9,38 @@ installed. The only requirement in order to run the GreenGuard Docker image is to have Docker installed and that the user has enough permissions to run it. -Installation instructions for any possible system compatible can be found [here](https://docs.docker.com/install/) +Installation instructions for any possible system compatible can be found [here]( +https://docs.docker.com/install/). Additionally, the system that builds the GreenGuard Docker image will also need to have a working internet connection that allows downloading the base image and the additional python depenedencies. ## Building the GreenGuard Docker Image -After having cloned the **GreenGuard** repository, all you have to do in order to build the GreenGuard Docker -Image is running this command: +After having cloned the **GreenGuard** repository, all you have to do in order to build the +GreenGuard Docker Image is running this command: ```bash -make docker-jupyter-build +make docker-build ``` -After a few minutes, the new image, called `greenguard-jupyter`, will have been built into the system +After a few minutes, the new image, called `greenguard`, will have been built into the system and will be ready to be used or distributed. ## Distributing the GreenGuard Docker Image -Once the `greenguard-jupyter` image is built, it can be distributed in several ways. +Once the `greenguard` image is built, it can be distributed in several ways. ### Distributing using a Docker registry -The simplest way to distribute the recently created image is [using a registry](https://docs.docker.com/registry/). +The simplest way to distribute the recently created image is [using a registry]( +https://docs.docker.com/registry/). In order to do so, we will need to have write access to a public or private registry (remember to [login](https://docs.docker.com/engine/reference/commandline/login/)!) and execute these commands: ```bash -docker tag greenguard-jupyter:latest your-registry-name:some-tag +docker tag greenguard:latest your-registry-name:some-tag docker push your-registry-name:some-tag ``` @@ -46,7 +48,7 @@ Afterwards, in the receiving machine: ```bash docker pull your-registry-name:some-tag -docker tag your-registry-name:some-tag greenguard-jupyter:latest +docker tag your-registry-name:some-tag greenguard:latest ``` ### Distributing as a file @@ -57,32 +59,32 @@ using the following command. In the system that already has the image: ```bash -docker save --output greenguard-jupyter.tar greenguard-jupyter +docker save --output greenguard.tar greenguard ``` -Then copy over the file `greenguard-jupyter.tar` to the new system and there, run: +Then copy over the file `greenguard.tar` to the new system and there, run: ```bash -docker load --input greenguard-jupyter.tar +docker load --input greenguard.tar ``` -After these commands, the `greenguard-jupyter` image should be available and ready to be used in the +After these commands, the `greenguard` image should be available and ready to be used in the new system. 
-## Running the greenguard-jupyter image +## Running the greenguard image -Once the `greenguard-jupyter` image has been built, pulled or loaded, it is ready to be run. +Once the `greenguard` image has been built, pulled or loaded, it is ready to be run. This can be done in two ways: -### Running greenguard-jupyter with the code +### Running greenguard with the code If the GreenGuard source code is available in the system, running the image is as simple as running this command from within the root of the project: ```bash -make docker-jupyter-run +make docker-run ``` This will start a jupyter notebook using the docker image, which you can access by pointing your @@ -93,20 +95,67 @@ which means that any changes that you do in your local code will immediately be within your notebooks, and that any notebook that you create within jupyter will also show up in your `notebooks` folder! -### Running greenguard-jupyter without the greenguard code +### Running greenguard without the greenguard code If the GreenGuard source code is not available in the system and only the Docker Image is, you can still run the image by using this command: ```bash -docker run -ti -p 8888:8888 greenguard-jupyter +docker run -ti -p 8888:8888 greenguard ``` In this case, the code changes and the notebooks that you create within jupyter will stay inside the container and you will only be able to access and download them through the jupyter interface. +## Running the greenguard image on kubernetes + +### Running as pod + +There is a possiblity to run GreenGuard's docker image on a local kubernetes cluster. Once you have +created the docker image (locally or remotely) and you have [kubernetes]( +https://kubernetes.io/docs/home/) properly setup at your local environment, copy and paste the +following pod configuration into a `yml` file: + +```yml +apiVersion: v1 +kind: Pod +metadata: + name: greenguard +spec: + containers: + - name: greenguard + image: signals-dev/greenguard-jupyter:0.2.2.dev0 + ports: + - containerPort: 8888 +``` + +**Note** If you would like to use your local image that you created previously, or an image +from another repository that's not the official one, change the `image` value to the one that +corresponds to yours. + +Once you have created the `yml` file, you can run the following command to launch the pod: + +```bash +kubectl apply -f file.yml +``` + +This will create a pod named `greenguard` and in order to access it, we will have to forward +the port 8888 from the pod to our localhost. To do so, just run the following command: + +```bash +kubectl port-forward greeguard 8888 +``` + +Finally we can point our browser to http://localhost:8888 and use the GreenGuard software. + +### Running GreenGuard a service + +Kubernetes allows the posibility to run a docker image as a services, inside this folder you +will find a `greenguard-deployment.yml` file, ready to use as an deployment service, which has +the port forwarded to the `30088`. You can use this template to adapt it to your needs. + ## What's next? For more details about **GreenGuard** and all its possibilities and features, please check the -[project documentation site](https://D3-AI.github.io/GreenGuard/)! +[project documentation site](https://signals-dev.github.io/GreenGuard/)! 
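Following up on the `greenguard-deployment.yml` template described in the section above, this is one possible way to deploy it and reach the exposed NodePort. It is only a sketch: it assumes a local cluster where NodePort services are reachable, and the actual node IP depends on your setup.

```bash
# Create the Deployment and the Service defined in the template
kubectl apply -f docker/greenguard-deployment.yml

# Confirm that the service exposes Jupyter on NodePort 30088
kubectl get service greenguard-jupyter

# Then point your browser at http://<node-ip>:30088
```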
diff --git a/docker-compose.yml b/docker/docker-compose.yml similarity index 100% rename from docker-compose.yml rename to docker/docker-compose.yml diff --git a/docker/greenguard-deployment.yml b/docker/greenguard-deployment.yml new file mode 100644 index 0000000..a51139a --- /dev/null +++ b/docker/greenguard-deployment.yml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: Service +metadata: + name: greenguard-jupyter +spec: + ports: + - name: jupyter + port: 8888 + nodePort: 30088 + selector: + app: greenguard-jupyter + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: greenguard-jupyter +spec: + selector: + matchLabels: + app: greenguard-jupyter + strategy: + type: Recreate + template: + metadata: + labels: + app: greenguard-jupyter + spec: + containers: + - image: pvkdev/greenguard-jupyter:0.2.2.dev0 + name: greenguard-jupyter + ports: + - containerPort: 8888 + name: jupyter From 859627746456a12a10a280de4b71f9365bd8b93f Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Mon, 22 Jun 2020 10:53:05 +0200 Subject: [PATCH 050/171] Delete Makefile --- docker/Makefile | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 docker/Makefile diff --git a/docker/Makefile b/docker/Makefile deleted file mode 100644 index afc04f0..0000000 --- a/docker/Makefile +++ /dev/null @@ -1,45 +0,0 @@ -.DEFAULT_GOAL := help - -define BROWSER_PYSCRIPT -import os, webbrowser, sys - -try: - from urllib import pathname2url -except: - from urllib.request import pathname2url - -webbrowser.open("file://" + pathname2url(http://github.com/os.path.abspath(sys.argv[1]))) -endef -export BROWSER_PYSCRIPT - -define PRINT_HELP_PYSCRIPT -import re, sys - -for line in sys.stdin: - match = re.match(r'^([a-zA-Z_-]+):.*?## (.*)$$', line) - if match: - target, help = match.groups() - print("%-20s %s" % (target, help)) -endef -export PRINT_HELP_PYSCRIPT - -BROWSER := python -c "$$BROWSER_PYSCRIPT" - -.PHONY: help -help: - @python -c "$$PRINT_HELP_PYSCRIPT" < $(MAKEFILE_LIST) - -# DOCKER TARGET -.PHONY: docker-login -docker-login: - docker login - -.PHONY: docker-build -docker-build: - docker build -t greenguard-jupyter . - -.PHONY: docker-push -docker-push: docker-login docker-build - @$(eval VERSION := $(shell python -c 'import greenguard; print(greenguard.__version__)')) - docker tag greenguard-jupyter signals-dev/greenguard:$(VERSION) - docker push signals-dev/greenguard:$(VERSION) From 5881709e524d4569fa2d99bc64f967f0a202706f Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 22 Jun 2020 13:02:17 +0200 Subject: [PATCH 051/171] Updated structure --- Makefile | 28 +------ README.md | 23 +----- docker/README.md | 160 ++++++++++---------------------------- docker/docker-compose.yml | 11 --- 4 files changed, 45 insertions(+), 177 deletions(-) delete mode 100644 docker/docker-compose.yml diff --git a/Makefile b/Makefile index 32a5ba3..96cfec3 100644 --- a/Makefile +++ b/Makefile @@ -213,34 +213,10 @@ release-major: check-release bumpversion-major release # DOCKER TARGETS -.PHONY: docker-clean -docker-clean: ## Remove the greenguard docker image - docker rmi -f greenguard - .PHONY: docker-build docker-build: docker build -f docker/Dockerfile -t greenguard . 
-.PHONY: docker-save -docker-save: docker-build ## Build the greenguard image and save it as greenguard.tar - docker save --output greenguard.tar greenguard - -.PHONY: docker-load -docker-load: ## Load the greenguard image from greenguard.tar - docker load --input greenguard.tar - -.PHONY: docker-run -docker-run: ## Run the greenguard image in editable mode - docker run --rm -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard greenguard - -.PHONY: docker-start -docker-start: ## Start the greenguard image as a daemon - docker run --rm -d -v $(shell pwd):/greenguard -ti -p8888:8888 --name greenguard greenguard - -.PHONY: docker-stop -docker-stop: ## Stop the greenguard daemon - docker stop greenguard - .PHONY: docker-login docker-login: docker login @@ -248,5 +224,5 @@ docker-login: .PHONY: docker-push docker-push: docker-login docker-build @$(eval VERSION := $(shell python -c 'import greenguard; print(greenguard.__version__)')) - docker tag greenguard signals-dev/greenguard:$(VERSION) - docker push signals-dev/greenguard:$(VERSION) + docker tag greenguard signalsdev/greenguard:$(VERSION) + docker push signalsdev/greenguard:$(VERSION) diff --git a/README.md b/README.md index 7bb64d4..8a3c794 100644 --- a/README.md +++ b/README.md @@ -69,26 +69,9 @@ If you want to install from source or contribute to the project please read the ## Docker usage -Alternatively, **GreenGuard** is prepared to be run inside a docker environment using -`docker-compose`. - -For this, make sure to have both [docker](https://docs.docker.com/install/) and [docker-compose]( -https://docs.docker.com/compose/install/) installed on your system and then follow these steps: - -1. Clone this repository and go into the `GreenGuard` folder: - -```bash -git clone git@github.com:signals-dev/GreenGuard.git -cd GreenGuard -``` - -2. Start a Jupyter Notebook inside a docker container. - -```bash -docker-compose up --build -``` - -3. Point your browser at http://127.0.0.1:8888 +**GreenGuard** is prepared to be run inside a docker environment. Please check the +[docker documantation](https://github.com/signals-dev/GreenGuard/blob/master/docker/README.md) +about how run **GreenGuard** using docker. # Data Format diff --git a/docker/README.md b/docker/README.md index 19c12b3..564ca4c 100644 --- a/docker/README.md +++ b/docker/README.md @@ -1,160 +1,80 @@ -# Docker Usage +# Run GreenGuard using Docker -**GreenGuard** comes configured and ready to be distributed and run as a docker image which starts -a jupyter notebook already configured to use greenguard, with all the required dependencies already -installed. +GreenGuard is prepared to be run using [Docker](https://docker.com/). -## Docker Requirements - -The only requirement in order to run the GreenGuard Docker image is to have Docker installed and -that the user has enough permissions to run it. - -Installation instructions for any possible system compatible can be found [here]( -https://docs.docker.com/install/). - -Additionally, the system that builds the GreenGuard Docker image will also need to have a working -internet connection that allows downloading the base image and the additional python depenedencies. - -## Building the GreenGuard Docker Image - -After having cloned the **GreenGuard** repository, all you have to do in order to build the -GreenGuard Docker Image is running this command: +This is the command needed to start a Docker container locally that runs a [Jupyter Notebook]( +https://jupyter.org/) already configured to run GreenGuard. 
```bash -make docker-build +docker run -ti -p8888:8888 signals-dev/greenguard:latest ``` -After a few minutes, the new image, called `greenguard`, will have been built into the system -and will be ready to be used or distributed. - -## Distributing the GreenGuard Docker Image +Further details about the usage of this image can be found [here]( +https://hub.docker.com/repository/docker/signalsdev/greenguard). -Once the `greenguard` image is built, it can be distributed in several ways. +## Run GreenGuard on Kubernetes -### Distributing using a Docker registry +GreenGuard can also be started using [Kubernetes](https://kubernetes.io/). -The simplest way to distribute the recently created image is [using a registry]( -https://docs.docker.com/registry/). +Here are the minimum steps to do so: -In order to do so, we will need to have write access to a public or private registry (remember to -[login](https://docs.docker.com/engine/reference/commandline/login/)!) and execute these commands: +1. Create a POD yaml file with the these contents: -```bash -docker tag greenguard:latest your-registry-name:some-tag -docker push your-registry-name:some-tag -``` - -Afterwards, in the receiving machine: - -```bash -docker pull your-registry-name:some-tag -docker tag your-registry-name:some-tag greenguard:latest +```yml +apiVersion: v1 +kind: Pod +metadata: + name: greenguard +spec: + containers: + - name: greenguard + image: signalsdev/greenguard:latest + ports: + - containerPort: 8888 ``` -### Distributing as a file - -If the distribution of the image has to be done offline for any reason, it can be achieved -using the following command. - -In the system that already has the image: +2. Start the POD locally ```bash -docker save --output greenguard.tar greenguard +kubectl apply -f pod-file.yml ``` -Then copy over the file `greenguard.tar` to the new system and there, run: +3. Forward the port 8888 ```bash -docker load --input greenguard.tar +kubectl port-forward greenguard 8888 ``` -After these commands, the `greenguard` image should be available and ready to be used in the -new system. - - -## Running the greenguard image - -Once the `greenguard` image has been built, pulled or loaded, it is ready to be run. +4. Point your browser at http://localhost:8888 -This can be done in two ways: +On the other hand, if you are planing to run GreenGuard on a distributed service, we provided a +[template file]( +https://github.com/signals-dev/GreenGuard/blob/master/docker/greenguard-deployment.yml) +that you can use to achieve so. -### Running greenguard with the code +## Building the Docker image from scratch -If the GreenGuard source code is available in the system, running the image is as simple as running -this command from within the root of the project: +In order to build the Docker image from scratch you will need to: -```bash -make docker-run -``` - -This will start a jupyter notebook using the docker image, which you can access by pointing your -browser at http://127.0.0.1:8888 - -In this case, the local version of the project will also mounted within the Docker container, -which means that any changes that you do in your local code will immediately be available -within your notebooks, and that any notebook that you create within jupyter will also show -up in your `notebooks` folder! - -### Running greenguard without the greenguard code - -If the GreenGuard source code is not available in the system and only the Docker Image is, you can -still run the image by using this command: +1. 
Clone the repository ```bash -docker run -ti -p 8888:8888 greenguard +git clone git@github.com:signals-dev/GreenGuard.git +cd GreenGuard ``` -In this case, the code changes and the notebooks that you create within jupyter will stay -inside the container and you will only be able to access and download them through the -jupyter interface. - -## Running the greenguard image on kubernetes - -### Running as pod - -There is a possiblity to run GreenGuard's docker image on a local kubernetes cluster. Once you have -created the docker image (locally or remotely) and you have [kubernetes]( -https://kubernetes.io/docs/home/) properly setup at your local environment, copy and paste the -following pod configuration into a `yml` file: - -```yml -apiVersion: v1 -kind: Pod -metadata: - name: greenguard -spec: - containers: - - name: greenguard - image: signals-dev/greenguard-jupyter:0.2.2.dev0 - ports: - - containerPort: 8888 -``` - -**Note** If you would like to use your local image that you created previously, or an image -from another repository that's not the official one, change the `image` value to the one that -corresponds to yours. - -Once you have created the `yml` file, you can run the following command to launch the pod: +2. Build the docker image ```bash -kubectl apply -f file.yml +make docker-build ``` -This will create a pod named `greenguard` and in order to access it, we will have to forward -the port 8888 from the pod to our localhost. To do so, just run the following command: +3. If you are generating a new release, you can push to Docker hub using: ```bash -kubectl port-forward greeguard 8888 +make docker-push ``` -Finally we can point our browser to http://localhost:8888 and use the GreenGuard software. - -### Running GreenGuard a service - -Kubernetes allows the posibility to run a docker image as a services, inside this folder you -will find a `greenguard-deployment.yml` file, ready to use as an deployment service, which has -the port forwarded to the `30088`. You can use this template to adapt it to your needs. - ## What's next? For more details about **GreenGuard** and all its possibilities and features, please check the diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml deleted file mode 100644 index dfb7aed..0000000 --- a/docker/docker-compose.yml +++ /dev/null @@ -1,11 +0,0 @@ -version: '3' -services: - jupyter: - build: - context: . - args: - - UID=${UID:-1000} - ports: - - "8888:8888" - volumes: - - .:/app From 5b7733018e1b255a60c50c3a21402b440c565ba4 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 23 Jun 2020 15:14:38 +0200 Subject: [PATCH 052/171] Fix docker hub link / version. --- README.md | 2 +- docker/README.md | 4 ++-- docker/greenguard-deployment.yml | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 8a3c794..88f2402 100644 --- a/README.md +++ b/README.md @@ -71,7 +71,7 @@ If you want to install from source or contribute to the project please read the **GreenGuard** is prepared to be run inside a docker environment. Please check the [docker documantation](https://github.com/signals-dev/GreenGuard/blob/master/docker/README.md) -about how run **GreenGuard** using docker. +about how to run **GreenGuard** using docker. 
# Data Format diff --git a/docker/README.md b/docker/README.md index 564ca4c..30232c7 100644 --- a/docker/README.md +++ b/docker/README.md @@ -6,7 +6,7 @@ This is the command needed to start a Docker container locally that runs a [Jupy https://jupyter.org/) already configured to run GreenGuard. ```bash -docker run -ti -p8888:8888 signals-dev/greenguard:latest +docker run -ti -p8888:8888 signals-dev/greenguard:0.2.2.dev0 ``` Further details about the usage of this image can be found [here]( @@ -28,7 +28,7 @@ metadata: spec: containers: - name: greenguard - image: signalsdev/greenguard:latest + image: signalsdev/greenguard:0.2.2.dev0 ports: - containerPort: 8888 ``` diff --git a/docker/greenguard-deployment.yml b/docker/greenguard-deployment.yml index a51139a..ff195ae 100644 --- a/docker/greenguard-deployment.yml +++ b/docker/greenguard-deployment.yml @@ -1,34 +1,34 @@ apiVersion: v1 kind: Service metadata: - name: greenguard-jupyter + name: greenguard spec: ports: - name: jupyter port: 8888 nodePort: 30088 selector: - app: greenguard-jupyter + app: greenguard type: NodePort --- apiVersion: apps/v1 kind: Deployment metadata: - name: greenguard-jupyter + name: greenguard spec: selector: matchLabels: - app: greenguard-jupyter + app: greenguard strategy: type: Recreate template: metadata: labels: - app: greenguard-jupyter + app: greenguard spec: containers: - - image: pvkdev/greenguard-jupyter:0.2.2.dev0 - name: greenguard-jupyter + - image: signalsdev/greenguard:0.2.2.dev0 + name: greenguard ports: - containerPort: 8888 name: jupyter From 0827312f80e3a92fd1d15f6dfe49ff94a2748a90 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 25 Jun 2020 10:12:27 +0200 Subject: [PATCH 053/171] Add relative link to docker readme. --- README.md | 2 +- docker/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 88f2402..bbb62f1 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ If you want to install from source or contribute to the project please read the ## Docker usage **GreenGuard** is prepared to be run inside a docker environment. Please check the -[docker documantation](https://github.com/signals-dev/GreenGuard/blob/master/docker/README.md) +[docker documantation](docker/README.md) about how to run **GreenGuard** using docker. # Data Format diff --git a/docker/README.md b/docker/README.md index 30232c7..6b15766 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,7 +2,7 @@ GreenGuard is prepared to be run using [Docker](https://docker.com/). -This is the command needed to start a Docker container locally that runs a [Jupyter Notebook]( +This are the commands needed to start a Docker container locally that runs a [Jupyter Notebook]( https://jupyter.org/) already configured to run GreenGuard. ```bash From fba74a8c592d2731bc7a4d92eca41b46a196f525 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 25 Jun 2020 13:44:23 +0200 Subject: [PATCH 054/171] Syntax --- docker/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/README.md b/docker/README.md index 6b15766..8706034 100644 --- a/docker/README.md +++ b/docker/README.md @@ -2,7 +2,7 @@ GreenGuard is prepared to be run using [Docker](https://docker.com/). -This are the commands needed to start a Docker container locally that runs a [Jupyter Notebook]( +These are the commands needed to start a Docker container locally that runs a [Jupyter Notebook]( https://jupyter.org/) already configured to run GreenGuard. 
```bash From a582f4beb41471cdf3cb8b1867bd03bd71ec798d Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 25 Jun 2020 15:19:45 +0200 Subject: [PATCH 055/171] Pr Review and comments --- Makefile | 3 +++ README.md | 3 +-- docker/README.md | 41 +++++++++++++++++++++++------------------ 3 files changed, 27 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 96cfec3..b614603 100644 --- a/Makefile +++ b/Makefile @@ -226,3 +226,6 @@ docker-push: docker-login docker-build @$(eval VERSION := $(shell python -c 'import greenguard; print(greenguard.__version__)')) docker tag greenguard signalsdev/greenguard:$(VERSION) docker push signalsdev/greenguard:$(VERSION) + docker tag greenguard signalsdev/greenguard + docker push signalsdev/greenguard + diff --git a/README.md b/README.md index bbb62f1..14dd621 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,7 @@ If you want to install from source or contribute to the project please read the ## Docker usage **GreenGuard** is prepared to be run inside a docker environment. Please check the -[docker documantation](docker/README.md) -about how to run **GreenGuard** using docker. +[docker documantation](docker/README.md) for details about how to run **GreenGuard** using docker. # Data Format diff --git a/docker/README.md b/docker/README.md index 8706034..631f877 100644 --- a/docker/README.md +++ b/docker/README.md @@ -6,9 +6,12 @@ These are the commands needed to start a Docker container locally that runs a [J https://jupyter.org/) already configured to run GreenGuard. ```bash -docker run -ti -p8888:8888 signals-dev/greenguard:0.2.2.dev0 +docker run -ti -p8888:8888 signalsdev/greenguard:latest ``` +This will start a Jupyter Notebook instance on your computer already configured to use GreenGuard. +You can access it by pointing your browser at http://127.0.0.1:8888 + Further details about the usage of this image can be found [here]( https://hub.docker.com/repository/docker/signalsdev/greenguard). @@ -16,9 +19,11 @@ https://hub.docker.com/repository/docker/signalsdev/greenguard). GreenGuard can also be started using [Kubernetes](https://kubernetes.io/). -Here are the minimum steps to do so: +Here are the minimum steps required to create a POD in a local Kubernetes cluster: + +1. Create a yaml file with these contents: -1. Create a POD yaml file with the these contents: +For this example, we are assuming that the yaml file is named `greegunard-pod.yml`. ```yml apiVersion: v1 @@ -28,33 +33,39 @@ metadata: spec: containers: - name: greenguard - image: signalsdev/greenguard:0.2.2.dev0 + image: signalsdev/greenguard:latest ports: - containerPort: 8888 ``` -2. Start the POD locally +2. Create a POD: + +After creating the yaml file, you can create a POD in your Kubernetes cluster using the `kubectl` +command: ```bash -kubectl apply -f pod-file.yml +kubectl apply -f greenguard-pod.yml ``` 3. Forward the port 8888 +After the POD is started, you still need to forward a local port to it in order to access the +Jupyter instance. + ```bash kubectl port-forward greenguard 8888 ``` 4. Point your browser at http://localhost:8888 -On the other hand, if you are planing to run GreenGuard on a distributed service, we provided a -[template file]( -https://github.com/signals-dev/GreenGuard/blob/master/docker/greenguard-deployment.yml) -that you can use to achieve so. +> **NOTE**: If GreenGuard is run in a production environment we recommend you to use a service and +a deployment instead of just a simple POD. 
You can find a template of this setup [here]( +greenguard-deployment.yml) ## Building the Docker image from scratch -In order to build the Docker image from scratch you will need to: +If you want to build the Docker image from scratch instead of using the dockerhub image +you will need to: 1. Clone the repository @@ -63,18 +74,12 @@ git clone git@github.com:signals-dev/GreenGuard.git cd GreenGuard ``` -2. Build the docker image +2. Build the docker image using the GreenGuard make command. ```bash make docker-build ``` -3. If you are generating a new release, you can push to Docker hub using: - -```bash -make docker-push -``` - ## What's next? For more details about **GreenGuard** and all its possibilities and features, please check the From 0479cb1a2fbe92118fb7d7a43510e3d952293241 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 25 Jun 2020 15:40:42 +0200 Subject: [PATCH 056/171] Update image version --- docker/greenguard-deployment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/greenguard-deployment.yml b/docker/greenguard-deployment.yml index ff195ae..4736ce5 100644 --- a/docker/greenguard-deployment.yml +++ b/docker/greenguard-deployment.yml @@ -27,7 +27,7 @@ spec: app: greenguard spec: containers: - - image: signalsdev/greenguard:0.2.2.dev0 + - image: signalsdev/greenguard:latest name: greenguard ports: - containerPort: 8888 From 18be5ae009c7807d53fc2bcf313816111639ed1e Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 25 Jun 2020 17:45:02 +0200 Subject: [PATCH 057/171] Fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 14dd621..85ec4ee 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ If you want to install from source or contribute to the project please read the ## Docker usage **GreenGuard** is prepared to be run inside a docker environment. Please check the -[docker documantation](docker/README.md) for details about how to run **GreenGuard** using docker. +[docker documentation](docker/README.md) for details about how to run **GreenGuard** using docker. 
# Data Format From d84b83dfc74402b90a5740cd38bcf6f1d54ea910 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 3 Jul 2020 19:28:39 +0200 Subject: [PATCH 058/171] Allow saving splits on disk --- greenguard/pipeline.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 135ebbd..d97ec2a 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -3,6 +3,7 @@ import json import logging import os +import pickle from copy import deepcopy from hashlib import md5 @@ -307,15 +308,22 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) predict = pipeline.predict(X_test, output_=static - 1, start_=preprocessing, **context) - splits.append((fold, pipeline, fit, predict, y_test, static)) + os.makedirs('splits', exist_ok=True) + export_path = os.path.join('splits', '{}_{}.pkl'.format(template_name, fold)) + with open(export_path, 'wb') as split_file: + pickle.dump((fold, pipeline, fit, predict, y_test, static), split_file) + + splits.append(export_path) return splits def _cross_validate(self, template_splits, hyperparams): scores = [] - for fold, pipeline, fit, predict, y_test, static in template_splits: - LOGGER.debug('Scoring fold %s', fold) + for split_path in template_splits: + with open(split_path, 'rb') as split_file: + fold, pipeline, fit, predict, y_test, static = pickle.load(split_file) + LOGGER.debug('Scoring fold %s', fold) pipeline.set_hyperparameters(hyperparams) pipeline.fit(start_=static, **fit) predictions = pipeline.predict(start_=static, **predict) From 39c6c3d7f2f8441cec3820ed7c77613f101e469d Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Sun, 5 Jul 2020 19:51:52 +0200 Subject: [PATCH 059/171] Allow saving splits on disk --- greenguard/pipeline.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index d97ec2a..e7c17a9 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -1,5 +1,6 @@ # -*- coding: utf-8 -*- +import gc import json import logging import os @@ -314,6 +315,8 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) pickle.dump((fold, pipeline, fit, predict, y_test, static), split_file) splits.append(export_path) + del fold, pipeline, fit, predict, y_test + gc.collect() return splits From 446b6da61ddcb2eafa68927242f02416158098e1 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 7 Jul 2020 09:53:57 +0200 Subject: [PATCH 060/171] Allow optionally to save splits on disk. --- greenguard/pipeline.py | 51 ++++++++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index e7c17a9..288da57 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -139,6 +139,9 @@ class GreenGuardPipeline(object): self.templates. Defaults to ``0``. + cache_path (str): + If given, cache the generated cross validation splits in this folder. + Defatuls to ``None``. 
""" template = None @@ -231,7 +234,7 @@ def _build_pipeline(self): self.fitted = False def __init__(self, templates, metric='accuracy', cost=False, init_params=None, stratify=True, - cv_splits=5, shuffle=True, random_state=0, preprocessing=0): + cv_splits=5, shuffle=True, random_state=0, preprocessing=0, cache_path=None): if isinstance(metric, str): metric, cost = METRICS[metric] @@ -258,6 +261,9 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s self._set_template(self._template_names[0]) self._hyperparameters = dict() self._build_pipeline() + self._cache_path = cache_path + if cache_path: + os.makedirs(cache_path, exist_ok=True) def get_hyperparameters(self): """Get the current hyperparameters. @@ -291,6 +297,8 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) turbines=turbines, output_=preprocessing - 1) del context['X'] del context['y'] + gc.collect() + else: context = { 'readings': readings, @@ -300,6 +308,7 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) splits = list() for fold, (train_index, test_index) in enumerate(self._cv.split(X, y)): LOGGER.debug('Running static steps for fold %s', fold) + gc.collect() X_train, X_test = X.iloc[train_index], X.iloc[test_index] y_train, y_test = y.iloc[train_index], y.iloc[test_index] @@ -309,22 +318,31 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None) predict = pipeline.predict(X_test, output_=static - 1, start_=preprocessing, **context) - os.makedirs('splits', exist_ok=True) - export_path = os.path.join('splits', '{}_{}.pkl'.format(template_name, fold)) - with open(export_path, 'wb') as split_file: - pickle.dump((fold, pipeline, fit, predict, y_test, static), split_file) + split = (fold, pipeline, fit, predict, y_test, static) - splits.append(export_path) - del fold, pipeline, fit, predict, y_test - gc.collect() + if self._cache_path: + split_name = '{}_{}.pkl'.format(template_name, fold) + split_path = os.path.join(self._cache_path, split_name) + + with open(split_path, 'wb') as split_file: + pickle.dump(split, split_file) + + split = split_path + splits.append(split) + + gc.collect() return splits def _cross_validate(self, template_splits, hyperparams): scores = [] - for split_path in template_splits: - with open(split_path, 'rb') as split_file: - fold, pipeline, fit, predict, y_test, static = pickle.load(split_file) + for split in template_splits: + gc.collect() + if self._cache_path: + with open(split, 'rb') as split_file: + split = pickle.load(split_file) + + fold, pipeline, fit, predict, y_test, static = split LOGGER.debug('Scoring fold %s', fold) pipeline.set_hyperparameters(hyperparams) @@ -339,15 +357,14 @@ def _cross_validate(self, template_splits, hyperparams): def _make_btb_scorer(self, target_times, readings, turbines): splits = {} + for name in self._template_names: + splits[name] = self._generate_splits(name, target_times, readings, turbines) + + del target_times, readings, turbines + gc.collect() def scorer(template_name, config): template_splits = splits.get(template_name) - if template_splits is None: - template_splits = self._generate_splits( - template_name, target_times, readings, turbines) - - splits[template_name] = template_splits - cv_score = self._cross_validate(template_splits, config) if self._is_better(cv_score): _config = '\n'.join(' {}: {}'.format(n, v) for n, v in config.items()) From 8e49e9f56d2fa59a347c300c89dc01c1b9e8820e Mon Sep 17 00:00:00 2001 From: Plamen Valentinov 
Kolev
Date: Fri, 10 Jul 2020 11:16:02 +0200
Subject: [PATCH 061/171] Patch keras to be picklable on splits

---
 greenguard/pipeline.py | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index 288da57..8735e7b 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -5,15 +5,18 @@
 import logging
 import os
 import pickle
+import tempfile
 from copy import deepcopy
 from hashlib import md5

 import cloudpickle
+import keras
 import numpy as np
 from btb import BTBSession
 from btb.tuning import Tunable
 from mlblocks import MLPipeline
 from mlblocks.discovery import load_pipeline
+from mlprimitives.adapters.keras import Sequential
 from sklearn.exceptions import NotFittedError
 from sklearn.model_selection import KFold, StratifiedKFold

@@ -25,6 +28,32 @@
 PIPELINES_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), 'pipelines'))


+# Patch Keras so that Sequential adapter instances can be pickled, storing the model as HDF5 bytes
+def __getstate__(self):
+ state = self.__dict__.copy()
+ if 'model' in state:
+ with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
+ keras.models.save_model(state.pop('model'), fd.name, overwrite=True)
+ state['model_str'] = fd.read()
+
+ return state
+
+
+def __setstate__(self, state):
+ if 'model_str' in state:
+ with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd:
+ fd.write(state.pop('model_str'))
+ fd.flush()
+
+ state['model'] = keras.models.load_model(fd.name)
+
+ self.__dict__ = state
+
+
+Sequential.__getstate__ = __getstate__
+Sequential.__setstate__ = __setstate__
+
+
 def get_pipelines(pattern='', path=False, unstacked=False):

From cc20522a1c23198f88435dcf2fe0bd90d5360e77 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Fri, 10 Jul 2020 11:36:22 +0200
Subject: [PATCH 062/171] Add keras as a dependency, since we import it in our
 module to patch it.

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 67516b5..c02e9a9 100644
--- a/setup.py
+++ b/setup.py
@@ -17,6 +17,7 @@

 install_requires = [
 'mlblocks>=0.3.4,<0.4',
+ 'Keras>=2.1.6,<2.4',
 'mlprimitives>=0.2.4,<0.3',
 'scipy>=1.0.1,<1.4.0',
 'baytune>=0.3.9,<0.4',

From eb3d5c9b68dcbe74c3efe798d3100c57f06de3a1 Mon Sep 17 00:00:00 2001
From: Carles Sala
Date: Fri, 10 Jul 2020 11:43:38 +0200
Subject: [PATCH 063/171] Move notebooks to tutorials

---
 .../01_GreenGuard_Machine_Learning.ipynb | 349 +++++++---------
 .../02_Extract_Readings.ipynb | 8 +-
 2 files changed, 63 insertions(+), 294 deletions(-)
 rename notebooks/1. GreenGuard Quickstart.ipynb => tutorials/01_GreenGuard_Machine_Learning.ipynb (63%)
 rename notebooks/2. Extract Readings.ipynb => tutorials/02_Extract_Readings.ipynb (99%)

diff --git a/notebooks/1. GreenGuard Quickstart.ipynb b/tutorials/01_GreenGuard_Machine_Learning.ipynb
index ec7c0a7..7738871 100644
--- a/notebooks/1. 
GreenGuard Quickstart.ipynb +++ b/tutorials/01_GreenGuard_Machine_Learning.ipynb @@ -4,21 +4,24 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# GreenGuard Quickstart" + "# GreenGuard Machine Learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "This notebook shows how to use GreenGuard to:\n", + "In this tutorial we will show you how to use GreenGuard to solve a Machine Learning problem\n", + "defined via a Target Times table.\n", "\n", - "- Load demo data\n", + "During the next steps we will:\n", + "\n", + "- Load demo target times and readings\n", "- Find available pipelines and load two of them as templates\n", - "- Tune the templates to find the best template for the given data and its hyperparameters\n", - "- Fit the found pipeline to our data\n", - "- Make predictions using the pipeline\n", - "- Evaluate the goodness-of-fit" + "- Use GreenGuard AutoML to select the best template and hyperparameters for our problem\n", + "- Build and fit a Machine Learning pipeline based on the found template and hyperparameters\n", + "- Make predictions using the fitted pipeline\n", + "- Evaluate how good the predictions are" ] }, { @@ -393,7 +396,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Finding the Templates\n", + "## 3. Finding the available Templates\n", "\n", "The next step will be to select a collection of templates from the ones\n", "available in GreenGuard.\n", @@ -411,13 +414,13 @@ { "data": { "text/plain": [ - "['resample_600s_unstack_144_lstm_timeseries_classifier',\n", + "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_double_144_lstm_timeseries_classifier',\n", " 'resample_3600s_unstack_24_lstm_timeseries_classifier',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier',\n", - " 'resample_600s_normalize_dfs_1d_xgb_classifier',\n", " 'resample_3600s_unstack_double_24_lstm_timeseries_classifier',\n", - " 'resample_600s_unstack_double_144_lstm_timeseries_classifier',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier']" + " 'resample_600s_unstack_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_144_lstm_timeseries_classifier']" ] }, "execution_count": 10, @@ -446,9 +449,9 @@ { "data": { "text/plain": [ - "['resample_600s_unstack_dfs_1d_xgb_classifier',\n", - " 'resample_600s_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier']" + "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'resample_600s_unstack_dfs_1d_xgb_classifier']" ] }, "execution_count": 11, @@ -476,9 +479,9 @@ { "data": { "text/plain": [ - "{'resample_600s_unstack_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/app/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json'}" + "{'resample_600s_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", + " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json',\n", + " 
'resample_600s_unstack_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json'}" ] }, "execution_count": 12, @@ -584,14 +587,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Obtaining default configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n" + "2020-07-10 11:39:49,290 - INFO - session - Obtaining default configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n" ] }, { @@ -599,21 +602,21 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:41 | Progress: 100%|██████████\n", - "Elapsed: 00:18 | Progress: 100%|██████████\n", + "Elapsed: 00:32 | Progress: 100%|██████████\n", + "Elapsed: 00:16 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:37 | Progress: 100%|██████████\n", - "Elapsed: 00:18 | Progress: 100%|██████████\n", + "Elapsed: 00:32 | Progress: 100%|██████████\n", + "Elapsed: 00:16 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:37 | Progress: 100%|██████████\n", - "Elapsed: 00:18 | Progress: 100%|██████████\n" + "Elapsed: 00:32 | Progress: 100%|██████████\n", + "Elapsed: 00:15 | Progress: 100%|██████████\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.pipeline:New configuration found:\n", + "2020-07-10 11:42:19,561 - INFO - pipeline - New configuration found:\n", " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", @@ -622,8 +625,8 @@ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6079987550575785\n", - "INFO:btb.session:Obtaining default configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" + "2020-07-10 11:42:19,563 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.604136604136604\n", + "2020-07-10 11:42:19,565 - INFO - session - Obtaining default configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" ] }, { @@ -631,63 +634,8 @@ "output_type": "stream", "text": [ "Built 99 features\n", - "Elapsed: 02:06 | Progress: 100%|██████████\n", - "Elapsed: 01:02 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:53 | Progress: 100%|██████████\n", - "Elapsed: 00:54 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:55 | Progress: 100%|██████████\n", - "Elapsed: 01:10 | Progress: 100%|██████████\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 9\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 28\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3977560491030686\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.19143248884807773\n", - " 
('xgboost.XGBClassifier#1', 'min_child_weight'): 8\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6418782052584869\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 14\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 18\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 5\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6671775409915827\n" + "Elapsed: 00:45 | Progress: 53%|█████▎ " ] - }, - { - "data": { - "text/plain": [ - "{'id': '2a494af25e2d986c9178fd47820d4b00',\n", - " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 14,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 18,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9},\n", - " 'score': 0.6671775409915827}" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" } ], "source": [ @@ -704,29 +652,9 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'id': '2a494af25e2d986c9178fd47820d4b00',\n", - " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 14,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 18,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9},\n", - " 'score': 0.6671775409915827}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "session.best_proposal" ] @@ -740,26 +668,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 14,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 18,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 5,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39294364912150626,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.3393295330438333,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9}" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.get_hyperparameters()" ] @@ -773,20 +684,9 @@ }, { "cell_type": "code", - "execution_count": 19, + 
"execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.template_name" ] @@ -801,20 +701,9 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.6671775409915827" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.cv_score" ] @@ -830,101 +719,27 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 99\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 143\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 9\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.06337107325877978\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.932864412690726\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", - "INFO:btb.session:New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6854149434794596\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" - ] - }, - { - "data": { - "text/plain": [ - "{'id': '9999fcb9fdc53cf7bf8f1398cea07fab',\n", - " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 99,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 143,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.06337107325877978,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.932864412690726,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.6854149434794596}" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "session.run(iterations=10)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "metadata": {}, - "outputs": [ - { - 
"data": { - "text/plain": [ - "0.6854149434794596" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.cv_score" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 99,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 143,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.06337107325877978,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.932864412690726,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "pipeline.get_hyperparameters()" ] @@ -944,18 +759,9 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 165 features\n", - "Elapsed: 00:48 | Progress: 100%|██████████\n" - ] - } - ], + "outputs": [], "source": [ "pipeline.fit(train, readings)" ] @@ -971,17 +777,9 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elapsed: 00:17 | Progress: 100%|██████████\n" - ] - } - ], + "outputs": [], "source": [ "predictions = pipeline.predict(test, readings)" ] @@ -995,20 +793,9 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.7346938775510203" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "from sklearn.metrics import f1_score\n", "\n", @@ -1033,7 +820,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1052,7 +839,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1068,27 +855,9 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elapsed: 00:19 | Progress: 100%|██████████\n" - ] - }, - { - "data": { - "text/plain": [ - "array([0, 0, 0, 1, 0])" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "predictions = new_pipeline.predict(test, readings)\n", "predictions[0:5]" @@ -1111,7 +880,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.9" } }, "nbformat": 4, diff --git a/notebooks/2. Extract Readings.ipynb b/tutorials/02_Extract_Readings.ipynb similarity index 99% rename from notebooks/2. Extract Readings.ipynb rename to tutorials/02_Extract_Readings.ipynb index db55927..a454648 100644 --- a/notebooks/2. 
Extract Readings.ipynb +++ b/tutorials/02_Extract_Readings.ipynb @@ -6,13 +6,13 @@ "source": [ "# Extract Readings\n", "\n", - "This notebook shows how to use the CSVLoader class to load the readings table from a folder\n", - "that contains readings in the raw CSV format.\n", + "In this tutorial we will show you how to use the CSVLoader class to load the readings table\n", + "from a folder that contains readings in the raw CSV format.\n", "\n", "The Raw CSV format es briefly explained below, but more details can be found in [the documentation site](\n", "https://signals-dev.github.io/GreenGuard/advanced_usage/csv.html)/n", "\n", - "In this notebook we will:\n", + "During the next steps we will:\n", "\n", "- Generate a folder with readings in the raw format based on the demo data\n", "- Explore the raw format\n", @@ -1366,7 +1366,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.10" + "version": "3.6.9" } }, "nbformat": 4, From f4c9a62ad451ad646ca33d40da58abf75e6c47eb Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 10 Jul 2020 11:47:09 +0200 Subject: [PATCH 064/171] Update repo config and add github actions --- .github/workflows/docs.yml | 29 ++++++++++++++ .github/workflows/tests.yml | 40 +++++++++++++++++++ .gitignore | 1 + .travis.yml | 18 ++------- Makefile | 78 ++++++++++++++++++++++++++++--------- docs/conf.py | 4 ++ docs/index.rst | 7 ++++ setup.py | 44 ++++++++++----------- tox.ini | 41 ++++++++----------- 9 files changed, 182 insertions(+), 80 deletions(-) create mode 100644 .github/workflows/docs.yml create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000..02e92fd --- /dev/null +++ b/.github/workflows/docs.yml @@ -0,0 +1,29 @@ +name: Generate Docs + +on: + push: + branches: [ master ] + +jobs: + + docs: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + + - name: Python + uses: actions/setup-python@v1 + with: + python-version: '3.7' + + - name: Build + run: | + sudo apt install pandoc + python -m pip install --upgrade pip + pip install -e .[dev] + make docs + - name: Deploy + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{secrets.GITHUB_TOKEN}} + publish_dir: docs/_build/html diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..29c71ac --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,40 @@ +name: Run Tests + +on: + push: + branches: [ '*' ] + pull_request: + branches: [ master ] + +jobs: + build: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.5, 3.6, 3.7] + os: [ubuntu-latest, macos-latest] + + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + + - if: matrix.os == 'ubuntu-latest' + name: Install graphviz - Ubuntu + run: | + sudo apt-get install pandoc + + - if: matrix.os == 'macos-latest' + name: Install graphviz - MacOS + run: | + brew install pandoc + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + + - name: Test with tox + run: tox diff --git a/.gitignore b/.gitignore index fc59bb2..bcf1b75 100644 --- a/.gitignore +++ b/.gitignore @@ -65,6 +65,7 @@ instance/ # Sphinx documentation docs/_build/ docs/api/ +docs/tutorials/ # PyBuilder target/ diff --git a/.travis.yml b/.travis.yml index 9cbca5a..4cefe52 100644 --- 
a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ # Config file for automatic testing at travis-ci.org -dist: xenial +dist: bionic language: python python: - 3.7 @@ -7,21 +7,11 @@ python: - 3.5 # Command to install dependencies -install: pip install -U tox-travis codecov +install: + - sudo apt-get install pandoc + - pip install -U tox-travis codecov after_success: codecov # Command to run tests script: tox - -deploy: - - - provider: pages - skip-cleanup: true - github-token: "$GITHUB_TOKEN" - keep-history: true - local-dir: docs/_build/html - target-branch: gh-pages - on: - branch: master - python: 3.6 diff --git a/Makefile b/Makefile index b614603..b489087 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,7 @@ clean-pyc: ## remove Python file artifacts .PHONY: clean-docs clean-docs: ## remove previously built docs rm -f docs/api/*.rst + rm -rf docs/tutorials -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed .PHONY: clean-coverage @@ -91,24 +92,32 @@ lint: ## check style with flake8 and isort .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort - find greenguard -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive greenguard - isort --apply --atomic --recursive greenguard - - find tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive tests - isort --apply --atomic --recursive tests + find greenguard tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables + autopep8 --in-place --recursive --aggressive greenguard tests + isort --apply --atomic --recursive greenguard tests # TEST TARGETS -.PHONY: test -test: ## run tests quickly with the default Python +.PHONY: test-unit +test-unit: ## run tests quickly with the default Python python -m pytest --basetemp=${ENVTMPDIR} --cov=greenguard .PHONY: test-readme test-readme: ## run the readme snippets - rundoc run --single-session python3 -t python3 README.md + rm -rf tests/readme_test && mkdir tests/readme_test + cd tests/readme_test && rundoc run --single-session python3 -t python3 ../../README.md + rm -rf tests/readme_test + +.PHONY: test-tutorials +test-tutorials: ## run the tutorial notebooks + jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 tutorials/*.ipynb --stdout > /dev/null + +.PHONY: test +test: test-unit test-readme ## test everything that needs test dependencies + +.PHONY: test-devel +test-devel: lint docs ## test everything that needs development dependencies .PHONY: test-all test-all: ## run tests on every Python version with tox @@ -126,6 +135,7 @@ coverage: ## check code coverage quickly with the default Python .PHONY: docs docs: clean-docs ## generate Sphinx HTML documentation, including API docs + cp -r tutorials docs/tutorials sphinx-apidoc --separate --no-toc -o docs/api/ greenguard $(MAKE) -C docs html @@ -146,12 +156,19 @@ dist: clean ## builds source and wheel package python setup.py bdist_wheel ls -l dist -.PHONY: test-publish -test-publish: dist ## package and upload a release on TestPyPI +.PHONY: publish-confirm +publish-confirm: + @echo "WARNING: This will irreversibly upload a new version to PyPI!" 
+ @echo -n "Please type 'confirm' to proceed: " \ + && read answer \ + && [ "$${answer}" = "confirm" ] + +.PHONY: publish-test +publish-test: dist publish-confirm ## package and upload a release on TestPyPI twine upload --repository-url https://test.pypi.org/legacy/ dist/* .PHONY: publish -publish: dist ## package and upload a release +publish: dist publish-confirm ## package and upload a release twine upload dist/* .PHONY: bumpversion-release @@ -161,6 +178,13 @@ bumpversion-release: ## Merge master to stable and bumpversion release bumpversion release git push --tags origin stable +.PHONY: bumpversion-release-test +bumpversion-release-test: ## Merge master to stable and bumpversion release + git checkout stable || git checkout -b stable + git merge --no-ff master -m"make release-tag: Merge branch 'master' into stable" + bumpversion release --no-tag + @echo git push --tags origin stable + .PHONY: bumpversion-patch bumpversion-patch: ## Merge stable to master and bumpversion patch git checkout master @@ -168,6 +192,10 @@ bumpversion-patch: ## Merge stable to master and bumpversion patch bumpversion --no-tag patch git push +.PHONY: bumpversion-candidate +bumpversion-candidate: ## Bump the version to the next candidate + bumpversion candidate --no-tag + .PHONY: bumpversion-minor bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion --no-tag minor @@ -176,13 +204,21 @@ bumpversion-minor: ## Bump the version the next minor skipping the release bumpversion-major: ## Bump the version the next major skipping the release bumpversion --no-tag major -.PHONY: bumpversion-candidate -bumpversion-candidate: ## Bump the version to the next candidate - bumpversion candidate --no-tag +.PHONY: bumpversion-revert +bumpversion-revert: ## Undo a previous bumpversion-release + git checkout master + git branch -D stable +CLEAN_DIR := $(shell git status --short | grep -v ??) CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l) +.PHONY: check-clean +check-clean: ## Check if the directory has uncommitted changes +ifneq ($(CLEAN_DIR),) + $(error There are uncommitted changes) +endif + .PHONY: check-master check-master: ## Check if we are in master branch ifneq ($(CURRENT_BRANCH),master) @@ -196,14 +232,21 @@ ifeq ($(CHANGELOG_LINES),0) endif .PHONY: check-release -check-release: check-master check-history ## Check if the release can be made +check-release: check-clean check-master check-history ## Check if the release can be made + @echo "A new release can be made" .PHONY: release release: check-release bumpversion-release publish bumpversion-patch +.PHONY: release-test +release-test: check-release bumpversion-release-test publish-test bumpversion-revert + .PHONY: release-candidate release-candidate: check-master publish bumpversion-candidate +.PHONY: release-candidate-test +release-candidate-test: check-clean check-master publish-test + .PHONY: release-minor release-minor: check-release bumpversion-minor release @@ -228,4 +271,3 @@ docker-push: docker-login docker-build docker push signalsdev/greenguard:$(VERSION) docker tag greenguard signalsdev/greenguard docker push signalsdev/greenguard - diff --git a/docs/conf.py b/docs/conf.py index 733e1a9..9e23c07 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -32,6 +32,7 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 
 extensions = [
     'm2r',
+    'nbsphinx',
     'sphinx.ext.autodoc',
     'sphinx.ext.githubpages',
     'sphinx.ext.viewcode',
@@ -53,6 +54,9 @@
 # The master toctree document.
 master_doc = 'index'
 
+# Jupyter Notebooks
+nbsphinx_execute = 'never'
+
 # General information about the project.
 project = 'GreenGuard'
 slug = 'greenguard'
diff --git a/docs/index.rst b/docs/index.rst
index a654f0e..dad6c5f 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -6,6 +6,13 @@
 Overview
 
+.. toctree::
+    :caption: Tutorials
+    :hidden:
+
+    tutorials/01_GreenGuard_Machine_Learning
+    tutorials/02_Extract_Readings
+
 .. toctree::
     :caption: Advanced Usage
     :hidden:
diff --git a/setup.py b/setup.py
index 67516b5..51f9880 100644
--- a/setup.py
+++ b/setup.py
@@ -4,13 +4,13 @@
 from setuptools import setup, find_packages
 
 try:
-    with open('README.md') as readme_file:
+    with open('README.md', encoding='utf-8') as readme_file:
         readme = readme_file.read()
 except IOError:
     readme = ''
 
 try:
-    with open('HISTORY.md') as history_file:
+    with open('HISTORY.md', encoding='utf-8') as history_file:
         history = history_file.read()
 except IOError:
     history = ''
@@ -33,44 +33,42 @@
 tests_require = [
     'pytest>=3.4.2',
     'pytest-cov>=2.6.0',
-    'rundoc>=0.4.3'
+    'jupyter>=1.0.0,<2',
+    'rundoc>=0.4.3,<0.5',
 ]
 
 development_requires = [
     # general
-    'bumpversion>=0.5.3',
+    'bumpversion>=0.5.3,<0.6',
     'pip>=9.0.1',
-    'watchdog>=0.8.3',
+    'watchdog>=0.8.3,<0.11',
 
     # docs
-    'm2r>=0.2.0',
-    'Sphinx>=1.7.1,<2.4',
-    'sphinx_rtd_theme>=0.2.4',
+    'm2r>=0.2.0,<0.3',
+    'nbsphinx>=0.5.0,<0.7',
+    'Sphinx>=1.7.1,<3',
+    'sphinx_rtd_theme>=0.2.4,<0.5',
     'autodocsumm>=0.1.10',
 
     # style check
-    'flake8>=3.7.7',
-    'isort>=4.3.4',
+    'flake8>=3.7.7,<4',
+    'isort>=4.3.4,<5',
 
     # fix style issues
-    'autoflake>=1.2',
-    'autopep8>=1.4.3',
+    'autoflake>=1.1,<2',
+    'autopep8>=1.4.3,<2',
 
     # distribute on PyPI
-    'twine>=1.10.0',
+    'twine>=1.10.0,<4',
     'wheel>=0.30.0',
 
     # Advanced testing
-    'coverage>=4.5.1',
-    'tox>=2.9.1',
-
-    # Jupyter
-    'jupyter>=1.0.0',
+    'coverage>=4.5.1,<6',
+    'tox>=2.9.1,<4',
 ]
 
-
 setup(
-    author="MIT Data To AI Lab",
+    author='MIT Data To AI Lab',
     author_email='dailabmit@gmail.com',
     classifiers=[
         'Development Status :: 2 - Pre-Alpha',
@@ -82,7 +80,7 @@
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
     ],
-    description="AutoML for Renewable Energy Industries.",
+    description='AutoML for Renewable Energy Industries.',
     entry_points={
         'mlblocks': [
             'pipelines=greenguard:MLBLOCKS_PIPELINES'
@@ -95,12 +93,12 @@
     include_package_data=True,
     install_requires=install_requires,
     keywords='wind machine learning greenguard',
-    license="MIT license",
+    license='MIT license',
     long_description=readme + '\n\n' + history,
     long_description_content_type='text/markdown',
     name='greenguard',
     packages=find_packages(include=['greenguard', 'greenguard.*']),
-    python_requires='>=3.5',
+    python_requires='>=3.5,<3.8',
     setup_requires=setup_requires,
     test_suite='tests',
     tests_require=tests_require,
diff --git a/tox.ini b/tox.ini
index de5cd07..31724c5 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,38 +1,29 @@
 [tox]
-envlist = py35, py36, py37, lint, docs, readme
-
+envlist = py{35,36,37}, test-devel
 
 [travis]
 python =
-    3.7: py37
-    3.6: py36, docs, lint, readme
+    3.7: py37, test-devel
+    3.6: py36
     3.5: py35
 
+[gh-actions]
+python =
+    3.7: py37, test-devel
+    3.6: py36
+    3.5: py35
+
 [testenv]
 passenv = CI TRAVIS TRAVIS_*
-setenv =
-    PYTHONPATH = {toxinidir}
-extras = test
-commands =
-    /usr/bin/env make test
-
-
-[testenv:lint]
 skipsdist = true
-extras = dev
+skip_install = true
+commands_pre =
+
/usr/bin/env pip install .[test] commands = - /usr/bin/env make lint - - -[testenv:docs] -skipsdist = true -extras = dev -commands = - /usr/bin/env make docs - + /usr/bin/env make test -[testenv:readme] -skipsdist = true +[testenv:test-devel] +commands_pre = + /usr/bin/env pip install .[dev] commands = - /usr/bin/env make test-readme + /usr/bin/env make test-devel From 17bd968535ddded981bbfd0129971b753b35f26a Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 10 Jul 2020 11:51:04 +0200 Subject: [PATCH 065/171] Update tutorial --- .gitignore | 1 + .../01_GreenGuard_Machine_Learning.ipynb | 286 ++++++++++++++++-- 2 files changed, 261 insertions(+), 26 deletions(-) diff --git a/.gitignore b/.gitignore index bcf1b75..f0a4be1 100644 --- a/.gitignore +++ b/.gitignore @@ -111,3 +111,4 @@ notebooks/ notebooks-private/ scripts/ dask-worker-space/ +tutorials/*.pkl diff --git a/tutorials/01_GreenGuard_Machine_Learning.ipynb b/tutorials/01_GreenGuard_Machine_Learning.ipynb index 7738871..e17f0b2 100644 --- a/tutorials/01_GreenGuard_Machine_Learning.ipynb +++ b/tutorials/01_GreenGuard_Machine_Learning.ipynb @@ -587,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -634,8 +634,53 @@ "output_type": "stream", "text": [ "Built 99 features\n", - "Elapsed: 00:45 | Progress: 53%|█████▎ " + "Elapsed: 01:28 | Progress: 100%|██████████\n", + "Elapsed: 00:45 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 01:29 | Progress: 100%|██████████\n", + "Elapsed: 00:47 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 01:32 | Progress: 100%|██████████\n", + "Elapsed: 00:48 | Progress: 100%|██████████\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-07-10 11:49:21,971 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:22,446 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:22,682 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:22,862 - INFO - pipeline - New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 16\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 82\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + "2020-07-10 11:49:22,864 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6110894266631971\n" ] + }, + { + "data": { + "text/plain": [ + "{'id': '6cbe94178d761b5c263dc2f7ab1f8205',\n", + " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 16,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 82,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.6110894266631971}" + 
] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -652,9 +697,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'id': '6cbe94178d761b5c263dc2f7ab1f8205',\n", + " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 16,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 82,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.6110894266631971}" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "session.best_proposal" ] @@ -668,9 +733,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 16,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 82,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.get_hyperparameters()" ] @@ -684,9 +766,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.template_name" ] @@ -701,9 +794,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.6110894266631971" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.cv_score" ] @@ -719,27 +823,111 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-07-10 11:49:22,952 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:23,246 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:23,464 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:23,668 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:23,791 - INFO - pipeline - New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 80\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 31\n", + " 
('xgboost.XGBClassifier#1', 'max_depth'): 4\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.32814385597842255\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.19795099494663482\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + "2020-07-10 11:49:23,792 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6297458681170419\n", + "2020-07-10 11:49:23,796 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:23,955 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:24,191 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:24,403 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:24,546 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:25,544 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", + "2020-07-10 11:49:25,698 - INFO - pipeline - New configuration found:\n", + " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 96\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 36\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 9\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3256576169027807\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.1061546068995437\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + "2020-07-10 11:49:25,699 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6306697372853741\n" + ] + }, + { + "data": { + "text/plain": [ + "{'id': '157087395a2643c9ecc4a2b3549a1fc9',\n", + " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 96,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 36,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3256576169027807,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.1061546068995437,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", + " 'score': 0.6306697372853741}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "session.run(iterations=10)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.6306697372853741" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.cv_score" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", + " 'max_labels'): 96,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 36,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3256576169027807,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.1061546068995437,\n", + " 
('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.get_hyperparameters()" ] @@ -759,9 +947,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 24, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 165 features\n", + "Elapsed: 00:37 | Progress: 100%|██████████\n" + ] + } + ], "source": [ "pipeline.fit(train, readings)" ] @@ -777,9 +974,17 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elapsed: 00:12 | Progress: 100%|██████████\n" + ] + } + ], "source": [ "predictions = pipeline.predict(test, readings)" ] @@ -793,9 +998,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "0.7307692307692306" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "from sklearn.metrics import f1_score\n", "\n", @@ -820,7 +1036,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -839,7 +1055,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -855,9 +1071,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Elapsed: 00:12 | Progress: 100%|██████████\n" + ] + }, + { + "data": { + "text/plain": [ + "array([0, 0, 0, 1, 0])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "predictions = new_pipeline.predict(test, readings)\n", "predictions[0:5]" From 4c017e32da5096488be26c5053acf83b1840362b Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 10 Jul 2020 11:55:59 +0200 Subject: [PATCH 066/171] Remove macos from github actions testing --- .github/workflows/tests.yml | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 29c71ac..093fa94 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -12,7 +12,7 @@ jobs: strategy: matrix: python-version: [3.5, 3.6, 3.7] - os: [ubuntu-latest, macos-latest] + os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -21,18 +21,9 @@ jobs: with: python-version: ${{ matrix.python-version }} - - if: matrix.os == 'ubuntu-latest' - name: Install graphviz - Ubuntu - run: | - sudo apt-get install pandoc - - - if: matrix.os == 'macos-latest' - name: Install graphviz - MacOS - run: | - brew install pandoc - - name: Install dependencies run: | + sudo apt-get install pandoc python -m pip install --upgrade pip pip install tox tox-gh-actions From bb7e23a1261cddec94795afb79e1e64e28e55c43 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 10 Jul 2020 12:15:06 +0200 Subject: [PATCH 067/171] Reverse split generation on the scorer --- greenguard/pipeline.py | 11 ++++++----- setup.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 8735e7b..b4c9c4c 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -386,14 
+386,15 @@ def _cross_validate(self, template_splits, hyperparams): def _make_btb_scorer(self, target_times, readings, turbines): splits = {} - for name in self._template_names: - splits[name] = self._generate_splits(name, target_times, readings, turbines) - - del target_times, readings, turbines - gc.collect() def scorer(template_name, config): template_splits = splits.get(template_name) + if template_splits is None: + template_splits = self._generate_splits( + template_name, target_times, readings, turbines) + + splits[template_name] = template_splits + cv_score = self._cross_validate(template_splits, config) if self._is_better(cv_score): _config = '\n'.join(' {}: {}'.format(n, v) for n, v in config.items()) diff --git a/setup.py b/setup.py index c02e9a9..d78848e 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ # style check 'flake8>=3.7.7', - 'isort>=4.3.4', + 'isort>=4.3.4,<5', # fix style issues 'autoflake>=1.2', From ce24e261207b6ab8def66a10997a48c5d751d19d Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 10 Jul 2020 12:15:59 +0200 Subject: [PATCH 068/171] Update tutorials link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 85ec4ee..db84cb8 100644 --- a/README.md +++ b/README.md @@ -281,5 +281,5 @@ f1_score(test_targets, predictions) For more details about **GreenGuard** and all its possibilities and features, please check the [project documentation site](https://signals-dev.github.io/GreenGuard/) -Also do not forget to have a look at the [notebook tutorials]( -https://github.com/signals-dev/GreenGuard/tree/master/notebooks)! +Also do not forget to have a look at the [tutorials]( +https://github.com/signals-dev/GreenGuard/tree/master/tutorials)! From 8323ec16fc09c4c298491ad9c4f0d09778e06d95 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 10 Jul 2020 12:58:05 +0200 Subject: [PATCH 069/171] Add release notes for v0.2.2 --- HISTORY.md | 10 ++++++++++ Makefile | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index c5a9de0..7e1e8ae 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,15 @@ # History +## 0.2.2 - 2020-07-10 + +### Internam Imrpovements + +* Added github actions. 
+ +### Resolved Issues + +* Issue #27: Cache Splits pre-processed data on disk + ## 0.2.1 - 2020-06-16 With this release we give the possibility to the user to specify more than one template when diff --git a/Makefile b/Makefile index b489087..0ee820a 100644 --- a/Makefile +++ b/Makefile @@ -236,7 +236,7 @@ check-release: check-clean check-master check-history ## Check if the release ca @echo "A new release can be made" .PHONY: release -release: check-release bumpversion-release publish bumpversion-patch +release: check-release bumpversion-release docker-push publish bumpversion-patch .PHONY: release-test release-test: check-release bumpversion-release-test publish-test bumpversion-revert From 8e7867edc3f7c97737fbf72d2fb2d88c8d7c6498 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 10 Jul 2020 12:58:12 +0200 Subject: [PATCH 070/171] =?UTF-8?q?Bump=20version:=200.2.2.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index c9e61c2..6ae815d 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.2.dev0' +__version__ = '0.2.2' import os diff --git a/setup.cfg b/setup.cfg index 7f91cdd..924ad00 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.2.dev0 +current_version = 0.2.2 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 9ae475d..cbb9883 100644 --- a/setup.py +++ b/setup.py @@ -104,6 +104,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.2.dev0', + version='0.2.2', zip_safe=False, ) From 437887858fd4254ad78a0521119bb997509bc0a1 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 10 Jul 2020 13:36:41 +0200 Subject: [PATCH 071/171] =?UTF-8?q?Bump=20version:=200.2.2=20=E2=86=92=200?= =?UTF-8?q?.2.3.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 6ae815d..17dc390 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.2' +__version__ = '0.2.3.dev0' import os diff --git a/setup.cfg b/setup.cfg index 924ad00..2c808a1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.2 +current_version = 0.2.3.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py
index cbb9883..0a39dce 100644
--- a/setup.py
+++ b/setup.py
@@ -104,6 +104,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.2',
+    version='0.2.3.dev0',
     zip_safe=False,
 )

From cba9b36dc69d4e6a32a94b4be84930189730cfe7 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Fri, 10 Jul 2020 13:47:53 +0200
Subject: [PATCH 072/171] Fix typo

---
 HISTORY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index 7e1e8ae..d9c599b 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -2,7 +2,7 @@
 
 ## 0.2.2 - 2020-07-10
 
-### Internam Imrpovements
+### Internal Improvements
 
 * Added github actions.
 
From 6212f0320f0350a5cf50b6447d5ad730d4511c2c Mon Sep 17 00:00:00 2001
From: joanvaquer
Date: Wed, 5 Aug 2020 18:50:26 +0200
Subject: [PATCH 073/171] renaming pipelines

---
 ...json => normalize_dfs_xgb_classifier.json} |   0
 ...nstack_144_lstm_timeseries_classifier.json | 119 ------------------
 ...double_144_lstm_timeseries_classifier.json | 119 ------------------
 ...r.json => unstack_dfs_xgb_classifier.json} |   0
 ...ck_double_lstm_timeseries_classifier.json} |   0
 ...> unstack_lstm_timeseries_classifier.json} |   2 +-
 ...unstack_normalize_dfs_xgb_classifier.json} |   0
 7 files changed, 1 insertion(+), 239 deletions(-)
 rename greenguard/pipelines/{resample_600s_normalize_dfs_1d_xgb_classifier.json => normalize_dfs_xgb_classifier.json} (100%)
 delete mode 100644 greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json
 delete mode 100644 greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json
 rename greenguard/pipelines/{resample_600s_unstack_dfs_1d_xgb_classifier.json => unstack_dfs_xgb_classifier.json} (100%)
 rename greenguard/pipelines/{resample_3600s_unstack_double_24_lstm_timeseries_classifier.json => unstack_double_lstm_timeseries_classifier.json} (100%)
 rename greenguard/pipelines/{resample_3600s_unstack_24_lstm_timeseries_classifier.json => unstack_lstm_timeseries_classifier.json} (98%)
 rename greenguard/pipelines/{resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json => unstack_normalize_dfs_xgb_classifier.json} (100%)

diff --git a/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json b/greenguard/pipelines/normalize_dfs_xgb_classifier.json
similarity index 100%
rename from greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json
rename to greenguard/pipelines/normalize_dfs_xgb_classifier.json
diff --git a/greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json b/greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json
deleted file mode 100644
index b54702b..0000000
--- a/greenguard/pipelines/resample_600s_unstack_144_lstm_timeseries_classifier.json
+++ /dev/null
@@ -1,119 +0,0 @@
-{
-    "primitives": [
-        "pandas.DataFrame.resample",
-        "pandas.DataFrame.unstack",
-        "pandas.DataFrame.pop",
-        "pandas.DataFrame.pop",
-        "sklearn.impute.SimpleImputer",
-        "sklearn.preprocessing.MinMaxScaler",
-        "pandas.DataFrame",
-        "pandas.DataFrame.set",
-        "pandas.DataFrame.set",
-        "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences",
-        "keras.Sequential.LSTMTimeSeriesClassifier"
-    ],
-    "init_params": {
-        "pandas.DataFrame.resample#1": {
-            "rule": "600s",
-            "on": "timestamp",
-            "groupby": [
-                "turbine_id",
-                "signal_id"
-            ],
-            "aggregation": "mean",
-            "reset_index": false
-        },
-        "pandas.DataFrame.unstack#1": {
-            "level": "signal_id",
-
"reset_index": true - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "feature_range": [ - -1, - 1 - ] - }, - "pandas.DataFrame#1": { - "index": null, - "columns": null - }, - "pandas.DataFrame.set#1": { - "key": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "key": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 144, - "cutoff_time": "cutoff_time", - "time_index": "timestamp" - }, - "keras.Sequential.LSTMTimeSeriesClassifier": { - "epochs": 35, - "verbose": false - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#2": { - "X": "readings" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - }, - "pandas.DataFrame.set#1": { - "X": "readings", - "value": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "X": "readings", - "value": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "timeseries": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - } - } -} diff --git a/greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json b/greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json deleted file mode 100644 index 368dd4d..0000000 --- a/greenguard/pipelines/resample_600s_unstack_double_144_lstm_timeseries_classifier.json +++ /dev/null @@ -1,119 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "pandas.DataFrame.pop", - "pandas.DataFrame.pop", - "sklearn.impute.SimpleImputer", - "sklearn.preprocessing.MinMaxScaler", - "pandas.DataFrame", - "pandas.DataFrame.set", - "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", - "keras.Sequential.DoubleLSTMTimeSeriesClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "feature_range": [ - -1, - 1 - ] - }, - "pandas.DataFrame#1": { - "index": null, - "columns": null - }, - "pandas.DataFrame.set#1": { - "key": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "key": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 144, - "cutoff_time": "cutoff_time", - "time_index": "timestamp" - }, - "keras.Sequential.DoubleLSTMTimeSeriesClassifier": { - "epochs": 35, - "verbose": false - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - 
"pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#2": { - "X": "readings" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - }, - "pandas.DataFrame.set#1": { - "X": "readings", - "value": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "X": "readings", - "value": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "timeseries": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - } - } -} diff --git a/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json b/greenguard/pipelines/unstack_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json rename to greenguard/pipelines/unstack_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/resample_3600s_unstack_double_24_lstm_timeseries_classifier.json b/greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/resample_3600s_unstack_double_24_lstm_timeseries_classifier.json rename to greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json b/greenguard/pipelines/unstack_lstm_timeseries_classifier.json similarity index 98% rename from greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json rename to greenguard/pipelines/unstack_lstm_timeseries_classifier.json index 7e494d5..ab9dd99 100644 --- a/greenguard/pipelines/resample_3600s_unstack_24_lstm_timeseries_classifier.json +++ b/greenguard/pipelines/unstack_lstm_timeseries_classifier.json @@ -54,7 +54,7 @@ "cutoff_time": "cutoff_time", "time_index": "timestamp" }, - "keras.Sequential.LSTMTimeSeriesClassifier": { + "keras.Sequential.LSTMTimeSeriesClassifier#1": { "epochs": 35, "verbose": false } diff --git a/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json b/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json rename to greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json From 720d02e439d71ea714094493b911e35adb14875e Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Wed, 5 Aug 2020 18:52:07 +0200 Subject: [PATCH 074/171] Updating setup, Makefile and README --- Makefile | 2 +- README.md | 16 +++++++--------- setup.py | 2 +- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 0ee820a..ea625f3 100644 --- a/Makefile +++ b/Makefile @@ -101,7 +101,7 @@ fix-lint: ## fix lint issues using autoflake, autopep8, and isort .PHONY: test-unit test-unit: ## run tests quickly with the default Python - python -m pytest --basetemp=${ENVTMPDIR} --cov=greenguard + python -m pytest --cov=greenguard .PHONY: test-readme test-readme: ## run the readme snippets diff --git a/README.md b/README.md index 
db84cb8..9c37cb4 100644 --- a/README.md +++ b/README.md @@ -219,20 +219,18 @@ The returned `pipeline` variable will be `list` containing the names of all the available in the GreenGuard system: ``` -['resample_600s_normalize_dfs_1d_xgb_classifier', - 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier', - 'resample_600s_unstack_double_144_lstm_timeseries_classifier', - 'resample_3600s_unstack_24_lstm_timeseries_classifier', - 'resample_3600s_unstack_double_24_lstm_timeseries_classifier', - 'resample_600s_unstack_dfs_1d_xgb_classifier', - 'resample_600s_unstack_144_lstm_timeseries_classifier'] +['unstack_double_lstm_timeseries_classifier', + 'unstack_lstm_timeseries_classifier', + 'unstack_normalize_dfs_xgb_classifier', + 'unstack_dfs_xgb_classifier', + 'normalize_dfs_xgb_classifier'] ``` For the rest of this tutorial, we will select and use the pipeline -`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` as our template. +`normalize_dfs_xgb_classifier` as our template. ```python3 -pipeline_name = 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier' +pipeline_name = 'normalize_dfs_xgb_classifier' ``` ## 3. Fitting the Pipeline diff --git a/setup.py b/setup.py index 0a39dce..4c9c640 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ install_requires = [ 'Keras>=2.1.6,<2.4', 'mlblocks>=0.3.4,<0.4', - 'mlprimitives>=0.2.4,<0.3', + 'mlprimitives>=0.2.5,<0.3', 'scipy>=1.0.1,<1.4.0', 'baytune>=0.3.9,<0.4', 'numpy>=1.15.4,<1.17', From 465610b08b61193c9adbb4602b2ed7433858e96f Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Wed, 5 Aug 2020 18:53:14 +0200 Subject: [PATCH 075/171] Improving pipeline.py --- greenguard/pipeline.py | 58 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index b4c9c4c..785a747 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -93,6 +93,29 @@ def get_pipelines(pattern='', path=False, unstacked=False): return pipelines +def generate_init_params(template_names, init_params): + """Generate init_params dicts. + + The output will be a dict that contains one entry for each template + with a dict indicating the init_params to use with that template. + """ + if not init_params: + init_params = {} + elif isinstance(init_params, list): + init_params = dict(zip(template_names, init_params)) + + if not any(name in init_params for name in template_names): + return { + name: deepcopy(init_params) + for name in template_names + } + else: + return { + name: deepcopy(init_params.get(name, {})) + for name in template_names + } + + class GreenGuardPipeline(object): """Main Machine Learning component in the GreenGuard project. @@ -132,7 +155,7 @@ class GreenGuardPipeline(object): Template to use. If a ``str`` is given, load the corresponding ``MLPipeline``. Also can be a list combining both. metric (str or function): - Metric to use. If an ``str`` is give it must be one of the metrics + Metric to use. If an ``str`` is given it must be one of the metrics defined in the ``greenguard.metrics.METRICS`` dictionary. cost (bool): Whether the metric is a cost function (the lower the better) or not. 
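As a quick aside, here is a minimal usage sketch of the new `generate_init_params` helper (the template names are hypothetical, and the outputs simply follow the two branches of the function added in the hunk above):

    from greenguard.pipeline import generate_init_params

    names = ['template_a', 'template_b']  # hypothetical template names

    # A dict that mentions no template name is broadcast (deep-copied) to every template:
    generate_init_params(names, {'xgboost.XGBClassifier#1': {'n_estimators': 50}})
    # {'template_a': {'xgboost.XGBClassifier#1': {'n_estimators': 50}},
    #  'template_b': {'xgboost.XGBClassifier#1': {'n_estimators': 50}}}

    # A list is zipped against the template names, so each template gets its own entry:
    generate_init_params(names, [{'epochs': 5}, {'epochs': 10}])
    # {'template_a': {'epochs': 5}, 'template_b': {'epochs': 10}}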
@@ -226,23 +249,44 @@ def _get_templates(self, templates): for template in templates: if isinstance(template, str): template_name = template - template = load_pipeline(template_name) + template = deepcopy(load_pipeline(template_name)) else: template_name = md5(json.dumps(template)).digest() + template_dicts[template_name] = template template_names.append(template_name) return template_names, template_dicts def _generate_init_params(self, init_params): + """Generate init_params dicts. + + The output will be a dict that contains one entry for each template + with a dict indicating the init_params to use with that template. + """ if not init_params: - self._init_params = {} + init_params = {} elif isinstance(init_params, list): - self._init_params = dict(zip(self._template_names, init_params)) - elif any(name in init_params for name in self._template_names): - self._init_params = init_params + init_params = dict(zip(self._template_names, init_params)) + + if not any(name in init_params for name in self._template_names): + self._init_params = { + name: deepcopy(init_params) + for name in self._template_names + } + else: + self._init_params = { + name: deepcopy(init_params.get(name, {})) + for name in self._template_names + } def _generate_preprocessing(self, preprocessing): + """Generate preprocessing dict. + + The preprocessing dict contains one entry for each template and + an integer indicating the number of preprocessing steps for that + template. + """ if isinstance(preprocessing, int): self._preprocessing = {name: preprocessing for name in self._template_names} else: @@ -279,7 +323,7 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s self.templates = templates self._template_names, self._template_dicts = self._get_templates(templates) self._default_init_params = {} - self._generate_init_params(init_params) + self._init_params = generate_init_params(self._template_names, init_params) for name, template in self._template_dicts.items(): init_params = self._init_params.get(name, self._default_init_params) From f9564a3db36c5db0c5f1982bce014376e155fdda Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Wed, 5 Aug 2020 19:00:50 +0200 Subject: [PATCH 076/171] New tutorial --- tutorials/03_Benchmarking.ipynb | 220 ++++++++++++++++++++++++++++++++ 1 file changed, 220 insertions(+) create mode 100644 tutorials/03_Benchmarking.ipynb diff --git a/tutorials/03_Benchmarking.ipynb b/tutorials/03_Benchmarking.ipynb new file mode 100644 index 0000000..ad3c41d --- /dev/null +++ b/tutorials/03_Benchmarking.ipynb @@ -0,0 +1,220 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging;\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "logging.getLogger().setLevel(level=logging.ERROR)\n", + "logging.getLogger('greenguard').setLevel(level=logging.INFO)\n", + "\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + }, + { + "data": { + "text/plain": [ + "['unstack_double_lstm_timeseries_classifier',\n", + " 'unstack_lstm_timeseries_classifier']" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from greenguard import get_pipelines\n", + "\n", + "get_pipelines('lstm')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"metadata": {}, + "outputs": [], + "source": [ + "from greenguard.benchmark import evaluate_templates" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2020-08-05 17:14:08,860 - INFO - greenguard.pipeline - New configuration found:\n", + " Template: unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", + "2020-08-05 17:14:16,974 - INFO - greenguard.pipeline - New configuration found:\n", + " Template: unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 397\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.38706239055719976\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 367\n", + "2020-08-05 17:14:36,898 - INFO - greenguard.pipeline - New configuration found:\n", + " Template: unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): most_frequent\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 90\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.7472037016839137\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 215\n", + "2020-08-05 17:15:00,145 - INFO - greenguard.pipeline - New configuration found:\n", + " Template: unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", + "2020-08-05 17:15:14,558 - INFO - greenguard.pipeline - New configuration found:\n", + " Template: unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): most_frequent\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 245\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.23326913705083852\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 425\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
templatewindow_sizeresample_ruledefault_testdefault_cvtuned_cvtuned_teststatus
0unstack_lstm_timeseries_classifier1d1h0.6037740.6249280.6388710.666667OK
1unstack_lstm_timeseries_classifier3d4h0.7083330.6079780.6400480.709677OK
\n", + "
" + ], + "text/plain": [ + " template window_size resample_rule default_test \\\n", + "0 unstack_lstm_timeseries_classifier 1d 1h 0.603774 \n", + "1 unstack_lstm_timeseries_classifier 3d 4h 0.708333 \n", + "\n", + " default_cv tuned_cv tuned_test status \n", + "0 0.624928 0.638871 0.666667 OK \n", + "1 0.607978 0.640048 0.709677 OK " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "evaluate_templates(\n", + " templates=[\n", + " 'unstack_lstm_timeseries_classifier',\n", + " ],\n", + " window_size_rule=[\n", + " ('1d', '1h'),\n", + " ('3d', '4h'),\n", + " ],\n", + " init_params={\n", + " 'keras.Sequential.LSTMTimeSeriesClassifier#1': {\n", + " 'epochs': 1,\n", + " }\n", + " },\n", + " tuning_iterations=3,\n", + " cv_splits=3,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.9" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 7f974efd8ec715f11c1053299d86aab148540156 Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Wed, 5 Aug 2020 19:01:11 +0200 Subject: [PATCH 077/171] New test --- tests/test_benchmark.py | 58 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 tests/test_benchmark.py diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py new file mode 100644 index 0000000..8a446e0 --- /dev/null +++ b/tests/test_benchmark.py @@ -0,0 +1,58 @@ +"""Tests for `greenguard.benchmark` module.""" +from sklearn.metrics import f1_score + +from greenguard.benchmark import evaluate_templates +from greenguard.demo import load_demo + + +def test_predict(): + # setup + templates = [ + 'unstack_lstm_timeseries_classifier' + ] + + window_size_rule = [ + ('1d', '1h') + ] + + target_times, readings = load_demo() + target_times = target_times.head(10) + readings = readings.head(1000) + + # run + scores_df = evaluate_templates( + target_times=target_times, + readings=readings, + templates=templates, + window_size_rule=window_size_rule, + metric=f1_score, + tuning_iterations=1, + cv_splits=2 + ) + + # assert + expected_columns = [ + 'template', + 'window_size', + 'resample_rule', + 'default_test', + 'default_cv', + 'tuned_cv', + 'tuned_test', + 'status' + ] + + expected_dtypes = [ + 'object', + 'object', + 'object', + 'float64', + 'float64', + 'float64', + 'float64', + 'object' + ] + + assert (scores_df.columns.to_list() == expected_columns) + assert (scores_df.tuned_test.notnull) + assert (scores_df.dtypes.to_list() == expected_dtypes) From 9936f4f928f68be08c402ebd8f3163cc30438edd Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Wed, 5 Aug 2020 19:09:41 +0200 Subject: [PATCH 078/171] benchmark.py --- greenguard/benchmark.py | 278 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 278 insertions(+) create mode 100644 greenguard/benchmark.py diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py new file mode 100644 index 0000000..67e8b37 --- /dev/null +++ b/greenguard/benchmark.py @@ -0,0 +1,278 @@ +import logging +from itertools import product + +import pandas as pd +from sklearn.model_selection import train_test_split + +from greenguard.demo import load_demo +from greenguard.metrics import METRICS +from greenguard.pipeline import GreenGuardPipeline, 
+
+LOGGER = logging.getLogger(__name__)
+
+
+def _generate_init_params(templates, init_params):
+    if not init_params:
+        init_params = {}
+    elif isinstance(init_params, list):
+        init_params = dict(zip(templates, init_params))
+    elif any(name in init_params for name in templates):
+        init_params = init_params
+    else:
+        init_params = {template: init_params for template in templates}
+
+    return init_params
+
+
+def _build_init_params(template, window_size, rule, template_params):
+    if 'dfs' in template:
+        window_size_rule_params = {
+            'pandas.DataFrame.resample#1': {
+                'rule': rule,
+            },
+            'featuretools.dfs.json#1': {
+                'training_window': window_size,
+            }
+        }
+    elif 'lstm' in template:
+        window_size_rule_params = {
+            'pandas.DataFrame.resample#1': {
+                'rule': rule,
+            },
+            'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1': {
+                'window_size': window_size,
+            }
+        }
+
+    for primitive, params in window_size_rule_params.items():
+        primitive_params = template_params.get(primitive, {})
+        primitive_params.update(params)
+
+    return template_params
+
+
+def _build_init_preprocessing(templates, template, preprocessing):
+    if isinstance(preprocessing, int):
+        return preprocessing
+    elif isinstance(preprocessing, list):
+        preprocessing = dict(zip(templates, preprocessing))
+
+    return preprocessing.get(template, 0)
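Before the scoring functions, a short sketch of what `_build_init_params` does with the window size and resample rule (the template name and values are illustrative). Note that the helper updates the primitive entries in place, so the injected values are only kept when the corresponding primitive keys already exist in `template_params`; the fallback dict created by `.get(primitive, {})` is never stored back:

    template = 'unstack_normalize_dfs_xgb_classifier'
    template_params = {
        'pandas.DataFrame.resample#1': {},
        'featuretools.dfs.json#1': {},
    }
    _build_init_params(template, '1d', '1h', template_params)
    # template_params is now:
    # {'pandas.DataFrame.resample#1': {'rule': '1h'},
    #  'featuretools.dfs.json#1': {'training_window': '1d'}}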
+ """ + scores = dict() + + train, test = train_test_split(target_times, test_size=test_size, random_state=random_state) + + if isinstance(metric, str): + metric, cost = METRICS[metric] + + pipeline = GreenGuardPipeline(template, metric, cost=cost, cv_splits=cv_splits, + init_params=init_params, preprocessing=preprocessing) + + # Computing the default test score + pipeline.fit(train, readings) + predictions = pipeline.predict(test, readings) + + scores['default_test'] = metric(test['target'], predictions) + + # Computing the default cross validation score + session = pipeline.tune(train, readings) + session.run(1) + + scores['default_cv'] = pipeline.cv_score + + # Computing the cross validation score with tuned hyperparameters + session.run(tuning_iterations) + + scores['tuned_cv'] = pipeline.cv_score + + # Computing the test score with tuned hyperparameters + pipeline.fit(train, readings) + predictions = pipeline.predict(test, readings) + + scores['tuned_test'] = metric(test['target'], predictions) + + return scores + + +def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iterations=50, + init_params=None, target_times=None, readings=None, preprocessing=0, + cost=False, test_size=0.25, cv_splits=3, random_state=0, output_path=None): + """Execute the benchmark process and optionally store the result as a ``CSV``. + + Args: + templates (list): + List of templates to try. + window_size_rule (list): + List of tupples (int, str or Timedelta object). + metric (function or str). + Metric to use. If an ``str`` is give it must be one of the metrics + defined in the ``greenguard.metrics.METRICS`` dictionary. + tuning_iterations (int): + Number of iterations to be used. + target_times (DataFrame): + Contains the specefication problem that we are solving, which has three columns: + + * turbine_id: Unique identifier of the turbine which this label corresponds to. + * cutoff_time: Time associated with this target. + * target: The value that we want to predict. This can either be a numerical value + or a categorical label. This column can also be skipped when preparing + data that will be used only to make predictions and not to fit any + pipeline. + + readings (DataFrame): + Contains the signal data from different sensors, with the following columns: + + * turbine_id: Unique identifier of the turbine which this reading comes from. + * signal_id: Unique identifier of the signal which this reading comes from. + * timestamp (datetime): Time where the reading took place, as a datetime. + * value (float): Numeric value of this reading. + + preprocessing (int, list or dict): + Type of preprocessing to be used. + cost (bool): + Wheter the metric is a cost function (the lower the better) or not. + test_size (float): + Percentage of the data set to be used for the test. + cv_splits (int): + Amount of splits to create. + random_state (int): + Random number of train_test split. + output_path (str): + Path where to save the benchmark report. + + Returns: + pandas.DataFrame or None: + If ``output_path`` is ``None`` it will return a ``pandas.DataFrame`` object, + else it will dump the results in the specified ``output_path``. + + Example: + >>> from sklearn.metrics import f1_score + >>> templates = [ + ... 'normalize_dfs_xgb_classifier', + ... 'unstack_lstm_timeseries_classifier' + ... ] + >>> window_size_rule = [ + ... ('30d','12h'), + ... ('7d','4h') + ... ] + >>> preprocessing = [0, 1] + >>> scores_df = evaluate_templates( + ... templates=templates, + ... 
window_size_rule=window_size_rule, + ... metric=f1_score, + ... tuning_iterations=5, + ... preprocessing=preprocessing, + ... cost=False, + ... test_size=0.25, + ... cv_splits=3, + ... random_state=0 + ... ) + >>> scores_df + template window_size resample_rule default_test default_cv tuned_cv tuned_test status + 0 unstack_lstm_timeseries_classifier 30d 12h 0.720000 0.593634 0.627883 0.775510 OK + 1 unstack_lstm_timeseries_classifier 7d 4h 0.723404 0.597440 0.610766 0.745098 OK + 2 normalize_dfs_xgb_classifier 30d 12h 0.581818 0.619698 0.637123 0.596491 OK + 3 normalize_dfs_xgb_classifier 7d 4h 0.581818 0.619698 0.650367 0.603774 OK + + """ # noqa + + if readings is None and target_times is None: + target_times, readings = load_demo() + + init_params = generate_init_params(templates, init_params) + + scores_list = [] + for template, window_rule in product(templates, window_size_rule): + window_size, rule = window_rule + + scores = dict() + scores['template'] = template + scores['window_size'] = window_size + scores['resample_rule'] = rule + + try: + template_params = init_params[template] + template_params = _build_init_params(template, window_size, rule, template_params) + init_preprocessing = _build_init_preprocessing(templates, template, preprocessing) + + result = evaluate_template( + template=template, + target_times=target_times, + readings=readings, + metric=metric, + tuning_iterations=tuning_iterations, + preprocessing=init_preprocessing, + init_params=template_params, + cost=cost, + test_size=test_size, + cv_splits=cv_splits, + random_state=random_state) + + scores.update(result) + scores['status'] = 'OK' + + except Exception: + scores['status'] = 'ERRORED' + LOGGER.exception('Could not score template %s ', template) + + scores_list.append(scores) + + results = pd.DataFrame.from_records(scores_list) + results = results.reindex(['template', 'window_size', 'resample_rule', 'default_test', + 'default_cv', 'tuned_cv', 'tuned_test', 'status'], axis=1) + + if output_path: + LOGGER.info('Saving benchmark report to %s', output_path) + results.to_csv(output_path) + else: + return results From d9b6222f1358f56283a44591b0ca5bf94838dcaf Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Thu, 6 Aug 2020 16:44:29 +0200 Subject: [PATCH 079/171] Removing py3.5 --- .github/workflows/tests.yml | 2 +- .travis.yml | 1 - setup.py | 3 +-- tox.ini | 14 +++++--------- 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 093fa94..135d2a5 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.5, 3.6, 3.7] + python-version: [3.6, 3.7] os: [ubuntu-latest] steps: diff --git a/.travis.yml b/.travis.yml index 4cefe52..641dff9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -4,7 +4,6 @@ language: python python: - 3.7 - 3.6 - - 3.5 # Command to install dependencies install: diff --git a/setup.py b/setup.py index 4c9c640..6cb3298 100644 --- a/setup.py +++ b/setup.py @@ -77,7 +77,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', ], @@ -99,7 +98,7 @@ long_description_content_type='text/markdown', name='greenguard', packages=find_packages(include=['greenguard', 'greenguard.*']), - python_requires='>=3.5,<3.8', + python_requires='>=3.6,<3.8', 
setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index 31724c5..91af938 100644 --- a/tox.ini +++ b/tox.ini @@ -1,29 +1,25 @@ [tox] -envlist = py{35,36,37}, test-devel +envlist = py{36,37}, test-devel [travis] python = 3.7: py37, test-devel 3.6: py36 - 3.5: py35 [gh-actions] python = 3.7: py37, test-devel 3.6: py36 - 3.5: py35 [testenv] passenv = CI TRAVIS TRAVIS_* -skipsdist = true -skip_install = true -commands_pre = - /usr/bin/env pip install .[test] +skipsdist = false +skip_install = false +extras = test commands = /usr/bin/env make test [testenv:test-devel] -commands_pre = - /usr/bin/env pip install .[dev] +extras = dev commands = /usr/bin/env make test-devel From 73c3c9e0d75be7256edd2988e99790844acdd2c8 Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Thu, 6 Aug 2020 16:44:59 +0200 Subject: [PATCH 080/171] Reducing readings --- tests/test_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 8a446e0..4dfe576 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -17,7 +17,7 @@ def test_predict(): target_times, readings = load_demo() target_times = target_times.head(10) - readings = readings.head(1000) + readings = readings.head(100) # run scores_df = evaluate_templates( From a71c6cfab585235300220881527ff8320e34d72f Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Thu, 6 Aug 2020 18:19:44 +0200 Subject: [PATCH 081/171] Fixing pipeline --- .../pipelines/unstack_double_lstm_timeseries_classifier.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json b/greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json index 7f4e8a6..dede502 100644 --- a/greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json +++ b/greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json @@ -54,7 +54,7 @@ "cutoff_time": "cutoff_time", "time_index": "timestamp" }, - "keras.Sequential.DoubleLSTMTimeSeriesClassifier": { + "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { "epochs": 35, "verbose": false } From 13715fa4ed6e0202de43837fa7131b6d02efdd3c Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Thu, 6 Aug 2020 18:22:07 +0200 Subject: [PATCH 082/171] Function generate_preprocessing --- greenguard/benchmark.py | 26 ++--------------- greenguard/pipeline.py | 62 ++++++++++++++--------------------------- 2 files changed, 23 insertions(+), 65 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index 67e8b37..bd1cdb3 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -6,24 +6,11 @@ from greenguard.demo import load_demo from greenguard.metrics import METRICS -from greenguard.pipeline import GreenGuardPipeline, generate_init_params +from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing LOGGER = logging.getLogger(__name__) -def _generate_init_params(templates, init_params): - if not init_params: - init_params = {} - elif isinstance(init_params, list): - init_params = dict(zip(templates, init_params)) - elif any(name in init_params for name in templates): - init_params = init_params - else: - init_params = {template: init_params for template in templates} - - return init_params - - def _build_init_params(template, window_size, rule, template_params): if 'dfs' in template: window_size_rule_params = { @@ -51,15 +38,6 @@ def _build_init_params(template, window_size, rule, 
template_params): return template_params -def _build_init_preprocessing(templates, template, preprocessing): - if isinstance(preprocessing, int): - return preprocessing - elif isinstance(preprocessing, list): - preprocessing = dict(zip(templates, preprocessing)) - - return preprocessing.get(template, 0) - - def evaluate_template(template, target_times, readings, metric='f1', tuning_iterations=50, preprocessing=0, init_params=None, cost=False, test_size=0.25, cv_splits=3, random_state=0): @@ -243,7 +221,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio try: template_params = init_params[template] template_params = _build_init_params(template, window_size, rule, template_params) - init_preprocessing = _build_init_preprocessing(templates, template, preprocessing) + init_preprocessing = generate_preprocessing(templates, template, preprocessing) result = evaluate_template( template=template, diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 785a747..335b67b 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -116,6 +116,26 @@ def generate_init_params(template_names, init_params): } +def generate_preprocessing(templates_names, template, preprocessing): + """Generate preprocessing dict. + + The preprocessing dict contains one entry for each template and + an integer indicating the number of preprocessing steps for that + template. + """ + if isinstance(preprocessing, int): + preprocessing = {template: preprocessing for template in templates_names} + else: + if isinstance(preprocessing, list): + preprocessing = dict(zip(templates_names, preprocessing)) + + preprocessing = { + template: preprocessing.get(template, 0) + for name in templates_names + } + return preprocessing + + class GreenGuardPipeline(object): """Main Machine Learning component in the GreenGuard project. @@ -258,46 +278,6 @@ def _get_templates(self, templates): return template_names, template_dicts - def _generate_init_params(self, init_params): - """Generate init_params dicts. - - The output will be a dict that contains one entry for each template - with a dict indicating the init_params to use with that template. - """ - if not init_params: - init_params = {} - elif isinstance(init_params, list): - init_params = dict(zip(self._template_names, init_params)) - - if not any(name in init_params for name in self._template_names): - self._init_params = { - name: deepcopy(init_params) - for name in self._template_names - } - else: - self._init_params = { - name: deepcopy(init_params.get(name, {})) - for name in self._template_names - } - - def _generate_preprocessing(self, preprocessing): - """Generate preprocessing dict. - - The preprocessing dict contains one entry for each template and - an integer indicating the number of preprocessing steps for that - template. 
- """ - if isinstance(preprocessing, int): - self._preprocessing = {name: preprocessing for name in self._template_names} - else: - if isinstance(preprocessing, list): - preprocessing = dict(zip(self._template_names, preprocessing)) - - self._preprocessing = { - name: preprocessing.get(name, 0) - for name in self._template_names - } - def _build_pipeline(self): self._pipeline = MLPipeline(self.template) @@ -330,7 +310,7 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s template_params = template.setdefault('init_params', {}) self._update_params(template_params, init_params) - self._generate_preprocessing(preprocessing) + generate_preprocessing(self._template_names, self.templates, preprocessing) self._set_template(self._template_names[0]) self._hyperparameters = dict() self._build_pipeline() From 25c9b8cc72b57d574f67fe3ca1cf8d0bb28412c9 Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Fri, 7 Aug 2020 14:18:20 +0200 Subject: [PATCH 083/171] Fix error with lstm pipelines --- greenguard/pipeline.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 335b67b..db84050 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -131,7 +131,7 @@ def generate_preprocessing(templates_names, template, preprocessing): preprocessing = { template: preprocessing.get(template, 0) - for name in templates_names + for template in templates_names } return preprocessing @@ -310,7 +310,8 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s template_params = template.setdefault('init_params', {}) self._update_params(template_params, init_params) - generate_preprocessing(self._template_names, self.templates, preprocessing) + self._preprocessing = generate_preprocessing( + self._template_names, self.templates, preprocessing) self._set_template(self._template_names[0]) self._hyperparameters = dict() self._build_pipeline() From 35b2e1171c1e8ff4fc36c77cc258dd5aa23a1922 Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Fri, 7 Aug 2020 14:20:29 +0200 Subject: [PATCH 084/171] Fix lint --- greenguard/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index db84050..c38bc16 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -311,7 +311,7 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s self._update_params(template_params, init_params) self._preprocessing = generate_preprocessing( - self._template_names, self.templates, preprocessing) + self._template_names, self.templates, preprocessing) self._set_template(self._template_names[0]) self._hyperparameters = dict() self._build_pipeline() From eea55c53bc7e5c22206b16a216be7adc0d8c3414 Mon Sep 17 00:00:00 2001 From: joanvaquer Date: Fri, 7 Aug 2020 15:20:18 +0200 Subject: [PATCH 085/171] Updating notebook tutorial --- .../01_GreenGuard_Machine_Learning.ipynb | 320 +++++++++--------- 1 file changed, 159 insertions(+), 161 deletions(-) diff --git a/tutorials/01_GreenGuard_Machine_Learning.ipynb b/tutorials/01_GreenGuard_Machine_Learning.ipynb index e17f0b2..7fab764 100644 --- a/tutorials/01_GreenGuard_Machine_Learning.ipynb +++ b/tutorials/01_GreenGuard_Machine_Learning.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": 
{}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -157,7 +157,7 @@ "4 T001 2013-01-16 0" ] }, - "execution_count": 3, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -177,7 +177,7 @@ "(353, 3)" ] }, - "execution_count": 4, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -188,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -200,7 +200,7 @@ "dtype: object" ] }, - "execution_count": 5, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -211,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -290,7 +290,7 @@ "4 T001 S05 2013-01-10 273.0" ] }, - "execution_count": 6, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -301,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -310,7 +310,7 @@ "(1313540, 4)" ] }, - "execution_count": 7, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -334,7 +334,7 @@ "dtype: object" ] }, - "execution_count": 8, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -408,22 +408,20 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_double_144_lstm_timeseries_classifier',\n", - " 'resample_3600s_unstack_24_lstm_timeseries_classifier',\n", - " 'resample_3600s_unstack_double_24_lstm_timeseries_classifier',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_144_lstm_timeseries_classifier']" + "['normalize_dfs_xgb_classifier',\n", + " 'unstack_normalize_dfs_xgb_classifier',\n", + " 'unstack_dfs_xgb_classifier',\n", + " 'unstack_lstm_timeseries_classifier',\n", + " 'unstack_double_lstm_timeseries_classifier']" ] }, - "execution_count": 10, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -443,18 +441,18 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['resample_600s_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier']" + "['normalize_dfs_xgb_classifier',\n", + " 'unstack_normalize_dfs_xgb_classifier',\n", + " 'unstack_dfs_xgb_classifier']" ] }, - "execution_count": 11, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -473,18 +471,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'resample_600s_normalize_dfs_1d_xgb_classifier': 
'/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_normalize_dfs_1d_xgb_classifier.json',\n", - " 'resample_600s_unstack_dfs_1d_xgb_classifier': '/home/xals/Projects/MIT/GreenGuard/greenguard/pipelines/resample_600s_unstack_dfs_1d_xgb_classifier.json'}" + "{'normalize_dfs_xgb_classifier': '/home/usuario/Projects/GreenGuard/greenguard/pipelines/normalize_dfs_xgb_classifier.json',\n", + " 'unstack_normalize_dfs_xgb_classifier': '/home/usuario/Projects/GreenGuard/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json',\n", + " 'unstack_dfs_xgb_classifier': '/home/usuario/Projects/GreenGuard/greenguard/pipelines/unstack_dfs_xgb_classifier.json'}" ] }, - "execution_count": 12, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -498,10 +496,10 @@ "metadata": {}, "source": [ "For the rest of this tutorial, we will select and use the templates\n", - "`resample_600s_unstack_normalize_dfs_1d_xgb_classifier` and\n", - "`resample_600s_normalize_dfs_1d_xgb_classifier`.\n", + "`unstack_normalize_dfs_xgb_classifier` and\n", + "`normalize_dfs_xgb_classifier`.\n", "\n", - "The `resample_600s_unstack_normalize_dfs_1d_xgb_classifier` template contains the following steps:\n", + "The `unstack_normalize_dfs_xgb_classifier` template contains the following steps:\n", "\n", "- Resample the data using a 10 minute average aggregation\n", "- Unstack the data by signal, so each signal is in a different column\n", @@ -509,19 +507,19 @@ "- Use DFS on the readings based on the target_times cutoff times using a 1d window size\n", "- Apply an XGBoost Classifier\n", "\n", - "And the `resample_600s_normalize_dfs_1d_xgb_classifier` template contains the above steps but without\n", + "And the `normalize_dfs_xgb_classifier` template contains the above steps but without\n", "unstacking the data by signal." 
] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "templates = [\n", - " 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier', \n", - " 'resample_600s_normalize_dfs_1d_xgb_classifier'\n", + " 'unstack_normalize_dfs_xgb_classifier', \n", + " 'normalize_dfs_xgb_classifier'\n", "]" ] }, @@ -548,7 +546,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -570,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -587,14 +585,14 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2020-07-10 11:39:49,290 - INFO - session - Obtaining default configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n" + "INFO:btb.session:Obtaining default configuration for unstack_normalize_dfs_xgb_classifier\n" ] }, { @@ -602,22 +600,22 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:32 | Progress: 100%|██████████\n", - "Elapsed: 00:16 | Progress: 100%|██████████\n", + "Elapsed: 00:34 | Progress: 100%|██████████\n", + "Elapsed: 00:18 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:32 | Progress: 100%|██████████\n", - "Elapsed: 00:16 | Progress: 100%|██████████\n", + "Elapsed: 00:36 | Progress: 100%|██████████\n", + "Elapsed: 00:17 | Progress: 100%|██████████\n", "Built 165 features\n", - "Elapsed: 00:32 | Progress: 100%|██████████\n", - "Elapsed: 00:15 | Progress: 100%|██████████\n" + "Elapsed: 00:38 | Progress: 100%|██████████\n", + "Elapsed: 00:17 | Progress: 100%|██████████\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2020-07-10 11:42:19,561 - INFO - pipeline - New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", @@ -625,8 +623,8 @@ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "2020-07-10 11:42:19,563 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.604136604136604\n", - "2020-07-10 11:42:19,565 - INFO - session - Obtaining default configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n" + "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.605187908496732\n", + "INFO:btb.session:Obtaining default configuration for normalize_dfs_xgb_classifier\n" ] }, { @@ -634,51 +632,51 @@ "output_type": "stream", "text": [ "Built 99 features\n", - "Elapsed: 01:28 | Progress: 100%|██████████\n", - "Elapsed: 00:45 | Progress: 100%|██████████\n", + "Elapsed: 01:44 | Progress: 100%|██████████\n", + "Elapsed: 00:52 | Progress: 100%|██████████\n", "Built 99 features\n", - "Elapsed: 01:29 | Progress: 100%|██████████\n", - "Elapsed: 00:47 | Progress: 100%|██████████\n", + "Elapsed: 01:38 | Progress: 100%|██████████\n", + "Elapsed: 00:52 | Progress: 100%|██████████\n", "Built 99 features\n", - "Elapsed: 01:32 | Progress: 100%|██████████\n", - "Elapsed: 00:48 | Progress: 100%|██████████\n" + 
"Elapsed: 01:39 | Progress: 100%|██████████\n", + "Elapsed: 00:49 | Progress: 100%|██████████\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "2020-07-10 11:49:21,971 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:22,446 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:22,682 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:22,862 - INFO - pipeline - New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 16\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 82\n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 20\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 234\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "2020-07-10 11:49:22,864 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6110894266631971\n" + "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.6106037764640573\n", + "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n" ] }, { "data": { "text/plain": [ - "{'id': '6cbe94178d761b5c263dc2f7ab1f8205',\n", - " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + "{'id': '28d8ebbde404a0e501262a652c4d9aa5',\n", + " 'name': 'unstack_normalize_dfs_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 16,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 82,\n", + " 'max_labels'): 20,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 234,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.6110894266631971}" + " 'score': 0.6106037764640573}" ] }, - "execution_count": 16, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -697,25 +695,25 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '6cbe94178d761b5c263dc2f7ab1f8205',\n", - " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + "{'id': '28d8ebbde404a0e501262a652c4d9aa5',\n", + " 'name': 
'unstack_normalize_dfs_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 16,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 82,\n", + " 'max_labels'): 20,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 234,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.6110894266631971}" + " 'score': 0.6106037764640573}" ] }, - "execution_count": 17, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -733,22 +731,22 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 16,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 82,\n", + " 'max_labels'): 20,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 234,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.39699298238763425,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.06238180737748478,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 18, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -766,16 +764,16 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'resample_600s_unstack_normalize_dfs_1d_xgb_classifier'" + "'unstack_normalize_dfs_xgb_classifier'" ] }, - "execution_count": 19, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -794,16 +792,16 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6110894266631971" + "0.6106037764640573" ] }, - "execution_count": 20, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -823,61 +821,61 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2020-07-10 11:49:22,952 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:23,246 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:23,464 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:23,668 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:23,791 - INFO - pipeline - New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new 
proposal configuration for normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 80\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 31\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 4\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.32814385597842255\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.19795099494663482\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "2020-07-10 11:49:23,792 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6297458681170419\n", - "2020-07-10 11:49:23,796 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:23,955 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:24,191 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:24,403 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:24,546 - INFO - session - Generating new proposal configuration for resample_600s_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:25,544 - INFO - session - Generating new proposal configuration for resample_600s_unstack_normalize_dfs_1d_xgb_classifier\n", - "2020-07-10 11:49:25,698 - INFO - pipeline - New configuration found:\n", - " Template: resample_600s_unstack_normalize_dfs_1d_xgb_classifier \n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 32\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.11814847201162682\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9589332448610124\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6\n", + "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.640497737556561\n", + "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 96\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 36\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 9\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3256576169027807\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.1061546068995437\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "2020-07-10 11:49:25,699 - INFO - session - New optimal found: resample_600s_unstack_normalize_dfs_1d_xgb_classifier - 0.6306697372853741\n" + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 98\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 34\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3652063328881058\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.8627183599656656\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6\n", + "INFO:btb.session:New optimal found: 
unstack_normalize_dfs_xgb_classifier - 0.6592605156037993\n", + "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n" ] }, { "data": { "text/plain": [ - "{'id': '157087395a2643c9ecc4a2b3549a1fc9',\n", - " 'name': 'resample_600s_unstack_normalize_dfs_1d_xgb_classifier',\n", + "{'id': 'f6b410d303a1cfeafdcfe0dbcf330767',\n", + " 'name': 'unstack_normalize_dfs_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 96,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 36,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3256576169027807,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.1061546068995437,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.6306697372853741}" + " 'max_labels'): 98,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 34,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3652063328881058,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.8627183599656656,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n", + " 'score': 0.6592605156037993}" ] }, - "execution_count": 21, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -888,16 +886,16 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6306697372853741" + "0.6592605156037993" ] }, - "execution_count": 22, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -908,22 +906,22 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 96,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 36,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 9,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3256576169027807,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.1061546068995437,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + " 'max_labels'): 98,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 34,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3652063328881058,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.8627183599656656,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6}" ] }, - "execution_count": 23, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -947,7 +945,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -955,7 +953,7 @@ "output_type": "stream", "text": [ "Built 165 features\n", - "Elapsed: 00:37 | Progress: 100%|██████████\n" + "Elapsed: 00:39 | Progress: 100%|██████████\n" ] } ], @@ -974,14 +972,14 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Elapsed: 00:12 | Progress: 100%|██████████\n" + "Elapsed: 00:14 | Progress: 100%|██████████\n" ] } ], @@ 
-998,16 +996,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 38,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.7307692307692306"
+       "0.693877551020408"
       ]
      },
-     "execution_count": 26,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1036,7 +1034,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 39,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1055,7 +1053,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 40,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1071,14 +1069,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 41,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Elapsed: 00:12 | Progress: 100%|██████████\n"
+      "Elapsed: 00:14 | Progress: 100%|██████████\n"
      ]
     },
     {
@@ -1087,7 +1085,7 @@
      "array([0, 0, 0, 1, 0])"
     ]
    },
-   "execution_count": 29,
+   "execution_count": 41,
    "metadata": {},
    "output_type": "execute_result"
    }

From 69b23373ac80fc52b9b451b2adf58865751a79fb Mon Sep 17 00:00:00 2001
From: joanvaquer
Date: Fri, 7 Aug 2020 16:39:37 +0200
Subject: [PATCH 086/171] Updating docstring

---
 greenguard/benchmark.py | 17 ++++++++++-------
 greenguard/pipeline.py  | 35 ++++++++++++++++++++++++-----------
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index bd1cdb3..7b6fdc5 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -46,9 +46,6 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
     Args:
         template (str):
             Given template to evaluate.
-        metric (function or str):
-            Metric to use. If a ``str`` is given it must be one of the metrics
-            defined in the ``greenguard.metrics.METRICS`` dictionary.
         target_times (DataFrame):
            Contains the specification of the problem that we are solving, which has
            three columns:
@@ -67,10 +64,13 @@
           * timestamp (datetime): Time where the reading took place, as a datetime.
           * value (float): Numeric value of this reading.
+        metric (function or str):
+            Metric to use. If a ``str`` is given it must be one of the metrics
+            defined in the ``greenguard.metrics.METRICS`` dictionary.
         tuning_iterations (int):
             Number of iterations to be used.
         preprocessing (int, list or dict):
-            Type of preprocessing to be used.
+            Number of preprocessing steps to be used.
         init_params (list):
             Initialization parameters for the pipeline.
         cost (bool):
            Whether the metric is a cost function (the lower the better) or not.
@@ -137,6 +137,8 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
         defined in the ``greenguard.metrics.METRICS`` dictionary.
     tuning_iterations (int):
         Number of iterations to be used.
+    init_params (dict):
+        Initialization parameters for the pipelines.
     target_times (DataFrame):
         Contains the specification of the problem that we are solving, which has
        three columns:
@@ -156,7 +158,7 @@
        * value (float): Numeric value of this reading.
     preprocessing (int, list or dict):
-        Type of preprocessing to be used.
+        Number of preprocessing steps to be used.
     cost (bool):
         Whether the metric is a cost function (the lower the better) or not.
test_size (float): @@ -208,6 +210,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio target_times, readings = load_demo() init_params = generate_init_params(templates, init_params) + preprocessing = generate_preprocessing(templates, preprocessing) scores_list = [] for template, window_rule in product(templates, window_size_rule): @@ -221,7 +224,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio try: template_params = init_params[template] template_params = _build_init_params(template, window_size, rule, template_params) - init_preprocessing = generate_preprocessing(templates, template, preprocessing) + template_preprocessing = preprocessing[template] result = evaluate_template( template=template, @@ -229,7 +232,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio readings=readings, metric=metric, tuning_iterations=tuning_iterations, - preprocessing=init_preprocessing, + preprocessing=template_preprocessing, init_params=template_params, cost=cost, test_size=test_size, diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index c38bc16..a46c2c6 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -96,8 +96,15 @@ def get_pipelines(pattern='', path=False, unstacked=False): def generate_init_params(template_names, init_params): """Generate init_params dicts. - The output will be a dict that contains one entry for each template - with a dict indicating the init_params to use with that template. + Args: + template_names (list): + List of templates. + init_params (list or dict): + Initialization parameters for the templates. + + Returns: + Dict that contains one entry for each template with a dict indicating + the init_params to use with that template. """ if not init_params: init_params = {} @@ -116,22 +123,29 @@ def generate_init_params(template_names, init_params): } -def generate_preprocessing(templates_names, template, preprocessing): +def generate_preprocessing(templates_names, preprocessing): """Generate preprocessing dict. - The preprocessing dict contains one entry for each template and - an integer indicating the number of preprocessing steps for that - template. + Args: + template_names (list): + List of templates. + preprocessing (int, list or dict): + Number of preprocessing steps to be used. + + Returns: + preprocessing (dict): + Contains one entry for each template and an integer indicating the + number of preprocessing steps for that template. 
""" if isinstance(preprocessing, int): - preprocessing = {template: preprocessing for template in templates_names} + preprocessing = {name: preprocessing for name in templates_names} else: if isinstance(preprocessing, list): preprocessing = dict(zip(templates_names, preprocessing)) preprocessing = { - template: preprocessing.get(template, 0) - for template in templates_names + name: preprocessing.get(name, 0) + for name in templates_names } return preprocessing @@ -310,8 +324,7 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s template_params = template.setdefault('init_params', {}) self._update_params(template_params, init_params) - self._preprocessing = generate_preprocessing( - self._template_names, self.templates, preprocessing) + self._preprocessing = generate_preprocessing(self._template_names, preprocessing) self._set_template(self._template_names[0]) self._hyperparameters = dict() self._build_pipeline() From 386eed3a342a18ff29bffb8dfd2c601636dd9540 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 11 Aug 2020 03:05:35 +0200 Subject: [PATCH 087/171] Update docs. --- greenguard/benchmark.py | 34 +++- tutorials/03_Benchmarking.ipynb | 293 ++++++++++++++++++++++++-------- 2 files changed, 243 insertions(+), 84 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index 7b6fdc5..c0984db 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -39,8 +39,8 @@ def _build_init_params(template, window_size, rule, template_params): def evaluate_template(template, target_times, readings, metric='f1', tuning_iterations=50, - preprocessing=0, init_params=None, cost=False, test_size=0.25, cv_splits=3, - random_state=0): + preprocessing=0, init_params=None, cost=False, test_size=0.25, + cv_splits=3, random_state=0, cache_path=None): """Returns the scores for a given template. Args: @@ -81,6 +81,9 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter Amount of splits to create. random_state (int): Random number of train_test split. + cache_path (str): + If given, cache the generated cross validation splits in this folder. + Defatuls to ``None``. Returns: scores (dict): @@ -93,8 +96,15 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter if isinstance(metric, str): metric, cost = METRICS[metric] - pipeline = GreenGuardPipeline(template, metric, cost=cost, cv_splits=cv_splits, - init_params=init_params, preprocessing=preprocessing) + pipeline = GreenGuardPipeline( + template, + metric, + cost=cost, + cv_splits=cv_splits, + init_params=init_params, + preprocessing=preprocessing, + cache_path=cache_path + ) # Computing the default test score pipeline.fit(train, readings) @@ -122,9 +132,10 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter return scores -def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iterations=50, - init_params=None, target_times=None, readings=None, preprocessing=0, - cost=False, test_size=0.25, cv_splits=3, random_state=0, output_path=None): +def evaluate_templates(templates, window_size_rule, metric='f1', + tuning_iterations=50, init_params=None, target_times=None, + readings=None, preprocessing=0, cost=False, test_size=0.25, + cv_splits=3, random_state=0, cache_path=None, output_path=None): """Execute the benchmark process and optionally store the result as a ``CSV``. 
From 386eed3a342a18ff29bffb8dfd2c601636dd9540 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 11 Aug 2020 03:05:35 +0200
Subject: [PATCH 087/171] Update docs.

---
 greenguard/benchmark.py         |  34 +++-
 tutorials/03_Benchmarking.ipynb | 293 ++++++++++++++++++++++++--------
 2 files changed, 243 insertions(+), 84 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index 7b6fdc5..c0984db 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -39,8 +39,8 @@ def _build_init_params(template, window_size, rule, template_params):
 
 
 def evaluate_template(template, target_times, readings, metric='f1', tuning_iterations=50,
-                      preprocessing=0, init_params=None, cost=False, test_size=0.25, cv_splits=3,
-                      random_state=0):
+                      preprocessing=0, init_params=None, cost=False, test_size=0.25,
+                      cv_splits=3, random_state=0, cache_path=None):
     """Returns the scores for a given template.
 
     Args:
@@ -81,6 +81,9 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
             Amount of splits to create.
         random_state (int):
            Random state for the train_test split.
+        cache_path (str):
+            If given, cache the generated cross validation splits in this folder.
+            Defaults to ``None``.
 
     Returns:
         scores (dict):
@@ -93,8 +96,15 @@
     if isinstance(metric, str):
         metric, cost = METRICS[metric]
 
-    pipeline = GreenGuardPipeline(template, metric, cost=cost, cv_splits=cv_splits,
-                                  init_params=init_params, preprocessing=preprocessing)
+    pipeline = GreenGuardPipeline(
+        template,
+        metric,
+        cost=cost,
+        cv_splits=cv_splits,
+        init_params=init_params,
+        preprocessing=preprocessing,
+        cache_path=cache_path
+    )
 
     # Computing the default test score
     pipeline.fit(train, readings)
@@ -122,9 +132,10 @@
     return scores
 
 
-def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iterations=50,
-                       init_params=None, target_times=None, readings=None, preprocessing=0,
-                       cost=False, test_size=0.25, cv_splits=3, random_state=0, output_path=None):
+def evaluate_templates(templates, window_size_rule, metric='f1',
+                       tuning_iterations=50, init_params=None, target_times=None,
+                       readings=None, preprocessing=0, cost=False, test_size=0.25,
+                       cv_splits=3, random_state=0, cache_path=None, output_path=None):
     """Execute the benchmark process and optionally store the result as a ``CSV``.
 
     Args:
@@ -132,7 +143,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
         List of templates to try.
     window_size_rule (list):
        List of tuples (int, str or Timedelta object).
-    metric (function or str).
+    metric (function or str):
         Metric to use. If a ``str`` is given it must be one of the metrics
         defined in the ``greenguard.metrics.METRICS`` dictionary.
     tuning_iterations (int):
@@ -169,6 +180,9 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
         Random state for the train_test split.
     output_path (str):
         Path where to save the benchmark report.
+    cache_path (str):
+        If given, cache the generated cross validation splits in this folder.
+        Defaults to ``None``.
 
     Returns:
         pandas.DataFrame or None:
@@ -237,7 +251,9 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
                     cost=cost,
                     test_size=test_size,
                     cv_splits=cv_splits,
-                    random_state=random_state)
+                    random_state=random_state,
+                    cache_path=cache_path
+                )
 
                 scores.update(result)
                 scores['status'] = 'OK'
diff --git a/tutorials/03_Benchmarking.ipynb b/tutorials/03_Benchmarking.ipynb
index ad3c41d..ee765a5 100644
--- a/tutorials/03_Benchmarking.ipynb
+++ b/tutorials/03_Benchmarking.ipynb
@@ -1,5 +1,22 @@
 {
  "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Benchmarking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 0. Set up the logging\n",
+    "\n",
+    "This step sets up logging in our environment to increase our visibility over\n",
+    "the steps that GreenGuard performs."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 1,
@@ -17,47 +34,61 @@
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 2,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Using TensorFlow backend.\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "['unstack_double_lstm_timeseries_classifier',\n",
-       " 'unstack_lstm_timeseries_classifier']"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
   "source": [
-    "from greenguard import get_pipelines\n",
    "\n",
-    "get_pipelines('lstm')"
+    "## Running the Benchmarking\n",
+    "\n",
+    "The user API for the GreenGuard Benchmarking is the `greenguard.benchmark.evaluate_templates` function.\n",
+    "\n",
+    "The `evaluate_templates` function accepts the following arguments:\n",
+    "* `templates (list)`: List of templates to try.\n",
+    "* `window_size_rule (list)`: List of tuples (int, str or Timedelta object).\n",
+    "* `metric (function or str)`: Metric to use. If a ``str`` is given it must be one of the metrics defined in the `greenguard.metrics.METRICS` dictionary.\n",
+    "* `tuning_iterations (int)`: Number of iterations to be used.\n",
+    "* `init_params (dict)`: Initialization parameters for the pipelines.\n",
+    "* `target_times (DataFrame)`: Contains the specification of the problem that we are solving, which has three columns:\n",
+    "  * `turbine_id`: Unique identifier of the turbine which this label corresponds to.\n",
+    "  * `cutoff_time`: Time associated with this target.\n",
+    "  * `target`: The value that we want to predict. This can either be a numerical value\n",
+    "    or a categorical label.
This column can also be skipped when preparing\n",
+    "    data that will be used only to make predictions and not to fit any\n",
+    "    pipeline.\n",
+    "* `readings (DataFrame)`: Contains the signal data from different sensors, with the following columns:\n",
+    "  * `turbine_id`: Unique identifier of the turbine which this reading comes from.\n",
+    "  * `signal_id`: Unique identifier of the signal which this reading comes from.\n",
+    "  * `timestamp (datetime)`: Time where the reading took place, as a datetime.\n",
+    "  * `value (float)`: Numeric value of this reading.\n",
+    "* `preprocessing (int, list or dict)`: Number of preprocessing steps to be used.\n",
+    "* `cost (bool)`: Whether the metric is a cost function (the lower the better) or not.\n",
+    "* `test_size (float)`: Percentage of the data set to be used for the test.\n",
+    "* `cv_splits (int)`: Amount of splits to create.\n",
+    "* `random_state (int)`: Random state for the train_test split.\n",
+    "* `output_path (str)`: Path where to save the benchmark report.\n",
+    "* `cache_path (str)`: If given, cache the generated cross validation splits in this folder. Defaults to ``None``."
   ]
  },
 {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "templates = ['unstack_lstm_timeseries_classifier', 'normalize_dfs_xgb_classifier']\n",
    "window_size_rule = [('1d', '1h'), ('3d', '4h')]\n",
    "init_params = {\n",
    "    'unstack_lstm_timeseries_classifier': {\n",
    "        'keras.Sequential.LSTMTimeSeriesClassifier#1': {\n",
    "            'epochs': 1,\n",
    "        }\n",
    "    }\n",
    "}\n"
   ]
  },
 {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "2020-08-05 17:14:08,860 - INFO - greenguard.pipeline - New configuration found:\n",
+      "Using TensorFlow backend.\n",
+      "INFO:greenguard.pipeline:New configuration found:\n",
       "    Template: unstack_lstm_timeseries_classifier \n",
       "    Hyperparameters: \n",
('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", - "2020-08-05 17:15:14,558 - INFO - greenguard.pipeline - New configuration found:\n", + "INFO:greenguard.pipeline:New configuration found:\n", " Template: unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): most_frequent\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 245\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.23326913705083852\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 425\n" + " ('sklearn.impute.SimpleImputer#1', 'strategy'): median\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 353\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.4718077136146996\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 151\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 99 features\n", + "Elapsed: 02:58 | Progress: 100%|██████████\n", + "Elapsed: 00:58 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 01:54 | Progress: 100%|██████████\n", + "Elapsed: 01:08 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 02:20 | Progress: 100%|██████████\n", + "Elapsed: 01:09 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 02:16 | Progress: 100%|██████████\n", + "Elapsed: 01:07 | Progress: 100%|██████████\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: normalize_dfs_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: normalize_dfs_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 18\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 920\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.02731362750079913\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.46258174821600884\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 99 features\n", + "Elapsed: 03:18 | Progress: 100%|██████████\n", + "Elapsed: 01:03 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 03:15 | Progress: 100%|██████████\n", + "Elapsed: 01:06 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 02:05 | Progress: 100%|██████████\n", + "Elapsed: 01:10 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 01:51 | Progress: 100%|██████████\n", + "Elapsed: 00:54 | Progress: 100%|██████████\n", + "Built 99 features\n", + "Elapsed: 01:51 | Progress: 100%|██████████\n", + "Elapsed: 00:58 | Progress: 100%|██████████\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: normalize_dfs_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: normalize_dfs_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 7\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 348\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.5272082810065426\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.04014402178038856\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2\n" ] }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Built 99 features\n", + "Elapsed: 02:42 | Progress: 100%|██████████\n", + "Elapsed: 01:00 | Progress: 100%|██████████\n" + ] + } + ], + "source": [ + "from greenguard.benchmark import evaluate_templates\n", + "\n", + "results = evaluate_templates(\n", + " templates=templates,\n", + " window_size_rule=window_size_rule,\n", + " init_params=init_params,\n", + " tuning_iterations=3,\n", + " cv_splits=3,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ { "data": { "text/html": [ @@ -140,9 +272,9 @@ " unstack_lstm_timeseries_classifier\n", " 1d\n", " 1h\n", - " 0.603774\n", - " 0.624928\n", - " 0.638871\n", + " 0.711864\n", + " 0.646437\n", + " 0.646437\n", " 0.666667\n", " OK\n", " \n", @@ -151,24 +283,50 @@ " unstack_lstm_timeseries_classifier\n", " 3d\n", " 4h\n", - " 0.708333\n", - " 0.607978\n", - " 0.640048\n", + " 0.703704\n", + " 0.577295\n", + " 0.616052\n", " 0.709677\n", " OK\n", " \n", + " \n", + " 2\n", + " normalize_dfs_xgb_classifier\n", + " 1d\n", + " 1h\n", + " 0.581818\n", + " 0.619698\n", + " 0.646750\n", + " 0.631579\n", + " OK\n", + " \n", + " \n", + " 3\n", + " normalize_dfs_xgb_classifier\n", + " 3d\n", + " 4h\n", + " 0.581818\n", + " 0.619698\n", + " 0.637957\n", + " 0.642857\n", + " OK\n", + " \n", " \n", "\n", "" ], "text/plain": [ " template window_size resample_rule default_test \\\n", - "0 unstack_lstm_timeseries_classifier 1d 1h 0.603774 \n", - "1 unstack_lstm_timeseries_classifier 3d 4h 0.708333 \n", + "0 unstack_lstm_timeseries_classifier 1d 1h 0.711864 \n", + "1 unstack_lstm_timeseries_classifier 3d 4h 0.703704 \n", + "2 normalize_dfs_xgb_classifier 1d 1h 0.581818 \n", + "3 normalize_dfs_xgb_classifier 3d 4h 0.581818 \n", "\n", " default_cv tuned_cv tuned_test status \n", - "0 0.624928 0.638871 0.666667 OK \n", - "1 0.607978 0.640048 0.709677 OK " + "0 0.646437 0.646437 0.666667 OK \n", + "1 0.577295 0.616052 0.709677 OK \n", + "2 0.619698 0.646750 0.631579 OK \n", + "3 0.619698 0.637957 0.642857 OK " ] }, "execution_count": 4, @@ -177,22 +335,7 @@ } ], "source": [ - "evaluate_templates(\n", - " templates=[\n", - " 'unstack_lstm_timeseries_classifier',\n", - " ],\n", - " window_size_rule=[\n", - " ('1d', '1h'),\n", - " ('3d', '4h'),\n", - " ],\n", - " init_params={\n", - " 'keras.Sequential.LSTMTimeSeriesClassifier#1': {\n", - " 'epochs': 1,\n", - " }\n", - " },\n", - " tuning_iterations=3,\n", - " cv_splits=3,\n", - 
")" + "results" ] } ], @@ -212,7 +355,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.6.11" } }, "nbformat": 4, From 5e92532bd044b79f74c369ad8e31f3c8dda00f42 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 11 Aug 2020 03:16:24 +0200 Subject: [PATCH 088/171] Add release notes for v0.2.3 --- HISTORY.md | 4 ++++ README.md | 1 + docker/Dockerfile | 2 +- 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index d9c599b..0607be3 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,9 @@ # History +## 0.2.3 - 2020-08-10 + +* Added benchmarking module. + ## 0.2.2 - 2020-07-10 ### Internal Imrpovements diff --git a/README.md b/README.md index 9c37cb4..e585050 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ AutoML for Renewable Energy Industries. [![PyPI Shield](https://img.shields.io/pypi/v/greenguard.svg)](https://pypi.python.org/pypi/greenguard) [![Travis CI Shield](https://travis-ci.org/signals-dev/GreenGuard.svg?branch=master)](https://travis-ci.org/signals-dev/GreenGuard) [![Downloads](https://pepy.tech/badge/greenguard)](https://pepy.tech/project/greenguard) +[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/signals-dev/GreenGuard/master?filepath=tutorials) diff --git a/docker/Dockerfile b/docker/Dockerfile index 0917f4c..a086f2b 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,7 +6,7 @@ EXPOSE 8888 RUN mkdir /app COPY setup.py /app COPY greenguard /app/greenguard -COPY notebooks /app/notebooks +COPY tutorials /app/tutorials RUN pip install -e /app jupyter WORKDIR /app From 942fa67b3775e9b597d263030d1d6cdc1b644337 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 11 Aug 2020 03:19:45 +0200 Subject: [PATCH 089/171] =?UTF-8?q?Bump=20version:=200.2.3.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.3?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 3 ++- setup.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 17dc390..0247a00 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.3.dev0' +__version__ = '0.2.3' import os diff --git a/setup.cfg b/setup.cfg index 2c808a1..040280a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.3.dev0 +current_version = 0.2.3 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
@@ -45,3 +45,4 @@ test = pytest
 
 [tool:pytest]
 collect_ignore = ['setup.py']
+
diff --git a/setup.py b/setup.py
index 6cb3298..183a4ef 100644
--- a/setup.py
+++ b/setup.py
@@ -103,6 +103,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.3.dev0',
+    version='0.2.3',
     zip_safe=False,
 )

From 79287059d3fb785bea1b5508415db0ecb9385f9d Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 11 Aug 2020 09:51:40 +0200
Subject: [PATCH 090/171] =?UTF-8?q?Bump=20version:=200.2.3=20=E2=86=92=200?=
 =?UTF-8?q?.2.4.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 greenguard/__init__.py | 2 +-
 setup.cfg              | 2 +-
 setup.py               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/greenguard/__init__.py b/greenguard/__init__.py
index 0247a00..20eb561 100644
--- a/greenguard/__init__.py
+++ b/greenguard/__init__.py
@@ -4,7 +4,7 @@
 
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.3'
+__version__ = '0.2.4.dev0'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index 040280a..44d3a12 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.3
+current_version = 0.2.4.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
 
diff --git a/setup.py b/setup.py
index 183a4ef..8ffd589 100644
--- a/setup.py
+++ b/setup.py
@@ -103,6 +103,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.3',
+    version='0.2.4.dev0',
     zip_safe=False,
 )

From ae334569c1061a6284431abd8b46594106d38020 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 15 Sep 2020 15:03:36 +0200
Subject: [PATCH 091/171] Created make_problems function to automatically
 generate problems into pickle files.

---
 greenguard/benchmark.py | 82 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index c0984db..0d8ea30 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -1,10 +1,15 @@
 import logging
+import os
+import pickle
 from itertools import product
 
 import pandas as pd
+from dask.distributed import Client, LocalCluster
 from sklearn.model_selection import train_test_split
+from tqdm import tqdm
 
 from greenguard.demo import load_demo
+from greenguard.loaders import CSVLoader
 from greenguard.metrics import METRICS
 from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing
 
@@ -273,3 +278,80 @@ def evaluate_templates(templates, window_size_rule, metric='f1',
         results.to_csv(output_path)
     else:
         return results
+
+
+def make_problem(target_times_paths, readings_path, window_size_resample_rule, output_path,
+                 signals=None, aggregation='mean', datetime_fmt='%m/%d/%y %H:%M:%S',
+                 filename_fmt='%Y-%m.csv', unstack=False, parse_dates=['cutoff_time']):
+    """
+    Args:
+        target_times_paths (list):
+            List of paths to CSVs that contain target times.
+        readings_path (str):
+            Path to the folder where readings in raw CSV format can be found.
+        window_size_resample_rule (list):
+            List of tuples (int, str or Timedelta object).
+        output_path (str):
+            Path to save the generated problems.
+        signals (str):
+            List of signal names or csv file that has a `signal_id` column to use as the signal
+            names list.
+        aggregation (str):
+            Aggregation to apply to the readings.
+        datetime_fmt (str):
+            Date format used by the column timestamp for the readings.
Defaults
+            to `%m/%d/%y %H:%M:%S`.
+        filename_fmt (str):
+            Filename format. Defaults to `%Y-%m.csv`.
+        unstack (bool):
+            Whether to unstack the resampled data, generating one column per signal. Only used
+            when resampling. Defaults to `False`.
+    """
+    cluster = LocalCluster(n_workers=16, dashboard_address=':9792')
+    client = Client(cluster)
+
+    for tt_path in tqdm(target_times_paths):
+        if parse_dates:
+            parse_dates = [parse_dates] if not isinstance(parse_dates, list) else parse_dates
+            target_times = pd.read_csv(tt_path, parse_dates=parse_dates)
+        else:
+            target_times = pd.read_csv(tt_path)
+
+        for window_size, rule in window_size_resample_rule:
+            csv_loader = CSVLoader(
+                readings_path,
+                rule=rule,
+                aggregation=aggregation,
+                datetime_fmt=datetime_fmt,
+            )
+
+            new_target_times, readings = csv_loader.load(
+                target_times,
+                window_size=window_size,
+                signals=signals,
+            )
+
+            problem_name = 'problem_{}_{}.pkl'.format(window_size, rule)
+            output_pickle = os.path.join(output_path, problem_name)
+
+            with open(output_pickle, 'wb') as pickle_file:
+                pickle.dump((new_target_times, readings), pickle_file)
+
+    client.shutdown()
+
+
+def benchmark():
+    """
+    Args:
+        pipelines (list):
+        problem_paths (list):
+        target_times_paths (list):
+        readings_path (list):
+        window_size_resample_rule (list):
+            List of tuples (int, str or Timedelta object).
+        tuning_iterations (int):
+        cv_splits (int):
+        cache_path (str):
+        output_path (str):
+    """
+    pass

From a1244ba992c8767de8c5c507ff38a4dc17eb4973 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 15 Sep 2020 15:59:30 +0200
Subject: [PATCH 092/171] Return generated problems paths

---
 greenguard/benchmark.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index 0d8ea30..35c9117 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -310,6 +310,8 @@ def make_problem(target_times_paths, readings_path, window_size_resample_rule, o
     cluster = LocalCluster(n_workers=16, dashboard_address=':9792')
     client = Client(cluster)
 
+    generated_problems = list()
+
     for tt_path in tqdm(target_times_paths):
         if parse_dates:
@@ -322,6 +324,7 @@ def make_problem(target_times_paths, readings_path, window_size_resample_rule, o
                 readings_path,
                 rule=rule,
                 aggregation=aggregation,
+                unstack=unstack,
                 datetime_fmt=datetime_fmt,
             )
 
@@ -332,13 +335,16 @@ def make_problem(target_times_paths, readings_path, window_size_resample_rule, o
             )
 
             problem_name = 'problem_{}_{}.pkl'.format(window_size, rule)
-            output_pickle = os.path.join(output_path, problem_name)
-
-            with open(output_pickle, 'wb') as pickle_file:
+            output_pickle_path = os.path.join(output_path, problem_name)
+            with open(output_pickle_path, 'wb') as pickle_file:
                 pickle.dump((new_target_times, readings), pickle_file)
 
+            generated_problems.append(output_pickle_path)
+
     client.shutdown()
 
+    return generated_problems
+
 
 def benchmark():
     """
     Args:
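
With the paths now returned, the generated problems can be consumed directly.
A minimal sketch of the round trip at this point in the series, assuming
placeholder file locations and an already existing `problems` output folder
(the function joins paths into it but does not create it):

```python
import pickle

from greenguard.benchmark import make_problem

paths = make_problem(
    target_times_paths=['target_times.csv'],   # placeholder CSV with a cutoff_time column
    readings_path='readings',                  # placeholder raw readings folder
    window_size_resample_rule=[('3d', '4h')],
    output_path='problems',
)

# At this stage each pickle stores the (target_times, readings) pair
# for one window size and resample rule combination.
with open(paths[0], 'rb') as pickle_file:
    target_times, readings = pickle.load(pickle_file)
```
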
From b05aa355008facada3796c4534e4cab647cfc11b Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 15 Sep 2020 18:50:38 +0200
Subject: [PATCH 093/171] Benchmark draft

---
 greenguard/benchmark.py | 169 +++++++++++++++++++++++++++++++++-------
 greenguard/utils.py     |   8 ++
 2 files changed, 148 insertions(+), 29 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index 35c9117..281039a 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -12,10 +12,17 @@ from greenguard.loaders import CSVLoader
 from greenguard.metrics import METRICS
 from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing
+from greenguard.utils import as_list
 
 LOGGER = logging.getLogger(__name__)
 
 
+def _create_client(n_workers=16, dashboard_address=':9792'):
+    cluster = LocalCluster(n_workers=n_workers, dashboard_address=dashboard_address)
+    client = Client(cluster)
+    return client
+
+
 def _build_init_params(template, window_size, rule, template_params):
     if 'dfs' in template:
         window_size_rule_params = {
@@ -287,9 +287,29 @@ def evaluate_templates(templates, window_size_rule, metric='f1',
     return results
 
 
+def _generate_target_times_readings(target_times, readings_path, window_size, rule,
+                                    signals, aggregation, datetime_fmt='%m/%d/%y %H:%M:%S',
+                                    filename_fmt='%Y-%m.csv'):
+    """
+    Returns:
+        pandas.DataFrame:
+            Table of readings for the target times, including the columns ``turbine_id``,
+            ``signal_id``, ``timestamp`` and ``value``.
+    """
+    csv_loader = CSVLoader(
+        readings_path,
+        rule=rule,
+        aggregation=aggregation,
+        datetime_fmt=datetime_fmt,
+        filename_fmt=filename_fmt,
+    )
+
+    return csv_loader.load(target_times, window_size=window_size, signals=signals)
+
+
 def make_problem(target_times_paths, readings_path, window_size_resample_rule, output_path,
                  signals=None, aggregation='mean', datetime_fmt='%m/%d/%y %H:%M:%S',
-                 filename_fmt='%Y-%m.csv', unstack=False, parse_dates=['cutoff_time']):
+                 filename_fmt='%Y-%m.csv', n_workers=16, dashboard_address=':9792'):
     """
     Args:
         target_times_paths (list):
@@ -303,30 +313,19 @@ def make_problem(target_times_paths, readings_path, window_size_resample_rule, o
             to `%m/%d/%y %H:%M:%S`.
         filename_fmt (str):
             Filename format. Defaults to `%Y-%m.csv`.
-        unstack (bool):
-            Whether to unstack the resampled data, generating one column per signal. Only used
-            when resampling. Defaults to `False`.
+        n_workers (int):
+        dashboard_address (str):
     """
-    cluster = LocalCluster(n_workers=16, dashboard_address=':9792')
-    client = Client(cluster)
-
+    client = _create_client(n_workers, dashboard_address)
     generated_problems = list()
+    target_times_paths = as_list(target_times_paths)
 
-    for tt_path in tqdm(target_times_paths):
-        if parse_dates:
-            parse_dates = [parse_dates] if not isinstance(parse_dates, list) else parse_dates
-            target_times = pd.read_csv(tt_path, parse_dates=parse_dates)
-        else:
-            target_times = pd.read_csv(tt_path)
-
+    for tt_path in tqdm(target_times_paths):
         for window_size, rule in window_size_resample_rule:
-            csv_loader = CSVLoader(
+            new_target_times, readings = _generate_target_times_readings(
+                tt_path,
                 readings_path,
-                rule=rule,
-                aggregation=aggregation,
-                unstack=unstack,
-                datetime_fmt=datetime_fmt,
-            )
-
-            new_target_times, readings = csv_loader.load(
-                target_times,
-                window_size=window_size,
-                signals=signals,
+                rule,
+                aggregation,
+                signals,
+                datetime_fmt,
+                filename_fmt
             )
 
             problem_name = 'problem_{}_{}.pkl'.format(window_size, rule)
             output_pickle_path = os.path.join(output_path, problem_name)
             with open(output_pickle_path, 'wb') as pickle_file:
-                pickle.dump((new_target_times, readings), pickle_file)
+                pickle.dump((new_target_times, readings, window_size, rule), pickle_file)
 
             generated_problems.append(output_pickle_path)
 
-    client.shutdown()
-
     return generated_problems
 
 
-def benchmark():
+def benchmark(templates, problem_paths=None, target_times_paths=None, readings_path=None,
+              window_size_resample_rule=None, signals=None, tuning_iterations=100, preprocessing=0,
+              init_params=None, aggregation='mean', cost=False, cv_splits=5, metric='f1',
+              test_size=0.33, random_state=0, cache_path=None, n_workers=16,
+              dashboard_address=':9792', output_path=None, datetime_fmt='%m/%d/%y %H:%M:%S',
+              filename_fmt='%Y-%m.csv'):
     """
     Args:
-        pipelines (list):
+        templates (list):
         problem_paths (list):
         target_times_paths (list):
-        readings_path (list):
+        readings_path (str):
         window_size_resample_rule (list):
             List of tuples (int, str or Timedelta object).
+        signals (str):
+            List of signal names or csv file that has a `signal_id` column to use as the signal
+            names list.
+        aggregation (str):
+            Aggregation to apply to the readings.
+        datetime_fmt (str):
+            Date format used by the column timestamp for the readings. Defaults
+            to `%m/%d/%y %H:%M:%S`.
+        n_workers
+        dashboard_address
+        filename_fmt (str):
+            Filename format. Defaults to `%Y-%m.csv`.
tuning_iterations (int):
+        preprocessing :
+        init_params :
+        cost :
+        test_size :
         cv_splits (int):
         cache_path (str):
         output_path (str):
     """
+    templates = as_list(templates)
+    results = list()
+
+    if target_times_paths:
+        target_times_paths = as_list(target_times_paths)
+        if not readings_path:
+            raise ValueError('Missing readings path.')
+
+        client = _create_client(n_workers, dashboard_address)
+
+        for tt_path in tqdm(target_times_paths):
+            for window_size, rule in window_size_resample_rule:
+                target_times, readings = _generate_target_times_readings(
+                    tt_path,
+                    readings_path,
+                    rule,
+                    aggregation,
+                    signals,
+                    datetime_fmt,
+                    filename_fmt,
+                )
+
+                df = evaluate_templates(
+                    templates,
+                    [(window_size, rule)],
+                    metric=metric,
+                    tuning_iterations=tuning_iterations,
+                    init_params=init_params,
+                    target_times=target_times,
+                    readings=readings,
+                    preprocessing=preprocessing,
+                    cost=cost,
+                    test_size=test_size,
+                    cv_splits=cv_splits,
+                    random_state=random_state,
+                    cache_path=cache_path,
+                    output_path=None
+                )
+
+                results.append(df)
+
+        client.shutdown()
+
+    else:
+        problem_paths = as_list(problem_paths)
+
+        for problem_path in tqdm(problem_paths):
+            with open(problem_path, 'rb') as pickle_file:
+                target_times, readings, window_size, rule = pickle.load(pickle_file)
+
+            df = evaluate_templates(
+                templates,
+                [(window_size, rule)],
+                metric=metric,
+                tuning_iterations=tuning_iterations,
+                init_params=init_params,
+                target_times=target_times,
+                readings=readings,
+                preprocessing=preprocessing,
+                cost=cost,
+                test_size=test_size,
+                cv_splits=cv_splits,
+                random_state=random_state,
+                cache_path=cache_path,
+                output_path=None
+            )
+
+            results.append(df)
+
+    results = pd.concat(results, ignore_index=True)
+
+    if output_path:
+        results.to_csv(output_path)
+
+    return results
diff --git a/greenguard/utils.py b/greenguard/utils.py
index a803f97..15d46e1 100644
--- a/greenguard/utils.py
+++ b/greenguard/utils.py
@@ -53,3 +53,11 @@ def logging_setup(verbosity=1, logfile=None, logger_name=None):
     console_handler.setLevel(log_level)
     console_handler.setFormatter(formatter)
     logger.addHandler(console_handler)
+
+
+def as_list(param):
+    """Make sure that param is a ``list``."""
+    if isinstance(param, (list, tuple)):
+        return param
+
+    return [param]
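
A minimal sketch of how this draft entry point could be invoked on a problem
previously pickled by `make_problem`; the template name and pickle path are
placeholders, and the returned DataFrame follows the `evaluate_templates`
report layout:

```python
from greenguard.benchmark import benchmark

# 'problem_3d_4h.pkl' matches the 'problem_{window_size}_{rule}.pkl' naming
# used above; each pickle already carries its own window size and rule.
results = benchmark(
    templates=['normalize_dfs_xgb_classifier'],
    problem_paths=['problems/problem_3d_4h.pkl'],
    tuning_iterations=10,
    cv_splits=3,
)
```
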
From 55dcb1905b4a6bcf4f30c1d216ef439f2c44e075 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Mon, 21 Sep 2020 20:01:09 +0200
Subject: [PATCH 094/171] Review Data Format and Structure

---
 DATA_FORMAT.md | 119 +++++++++++++++++++++++++++++++++++++++++++++++++
 README.md      |   7 ++-
 2 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 DATA_FORMAT.md

diff --git a/DATA_FORMAT.md b/DATA_FORMAT.md
new file mode 100644
index 0000000..b0fc5a3
--- /dev/null
+++ b/DATA_FORMAT.md
@@ -0,0 +1,119 @@
+# GreenGuard Data Format
+
+## Input
+
+The minimum input expected by the **GreenGuard** system consists of the following two elements,
+which need to be passed as `pandas.DataFrame` objects:
+
+### Target Times
+
+A table containing the specification of the problem that we are solving, which has three
+columns:
+
+* `turbine_id`: Unique identifier of the turbine which this label corresponds to.
+* `cutoff_time`: Time associated with this target.
+* `target`: The value that we want to predict. This can either be a numerical value or a
+  categorical label. This column can also be skipped when preparing data that will be used
+  only to make predictions and not to fit any pipeline.
+
+|    | turbine_id   | cutoff_time         | target   |
+|----|--------------|---------------------|----------|
+|  0 | T1           | 2001-01-02 00:00:00 | 0        |
+|  1 | T1           | 2001-01-03 00:00:00 | 1        |
+|  2 | T2           | 2001-01-04 00:00:00 | 0        |
+
+### Readings
+
+A table containing the signal data from the different sensors, with the following columns:
+
+ * `turbine_id`: Unique identifier of the turbine which this reading comes from.
+ * `signal_id`: Unique identifier of the signal which this reading comes from.
+ * `timestamp (datetime)`: Time where the reading took place, as a datetime.
+ * `value (float)`: Numeric value of this reading.
+
+|    | turbine_id   | signal_id   | timestamp           |   value |
+|----|--------------|-------------|---------------------|---------|
+|  0 | T1           | S1          | 2001-01-01 00:00:00 |       1 |
+|  1 | T1           | S1          | 2001-01-01 12:00:00 |       2 |
+|  2 | T1           | S1          | 2001-01-02 00:00:00 |       3 |
+|  3 | T1           | S1          | 2001-01-02 12:00:00 |       4 |
+|  4 | T1           | S1          | 2001-01-03 00:00:00 |       5 |
+|  5 | T1           | S1          | 2001-01-03 12:00:00 |       6 |
+|  6 | T1           | S2          | 2001-01-01 00:00:00 |       7 |
+|  7 | T1           | S2          | 2001-01-01 12:00:00 |       8 |
+|  8 | T1           | S2          | 2001-01-02 00:00:00 |       9 |
+|  9 | T1           | S2          | 2001-01-02 12:00:00 |      10 |
+| 10 | T1           | S2          | 2001-01-03 00:00:00 |      11 |
+| 11 | T1           | S2          | 2001-01-03 12:00:00 |      12 |
+
+### Turbines
+
+Optionally, a third table can be added containing metadata about the turbines.
+The only requirement for this table is to have a `turbine_id` field, and it can have
+an arbitrary number of additional fields.
+
+|    | turbine_id   | manufacturer   | ...   | ...   | ...   |
+|----|--------------|----------------|-------|-------|-------|
+|  0 | T1           | Siemens        | ...   | ...   | ...   |
+|  1 | T2           | Siemens        | ...   | ...   | ...   |
+
+
+## CSV Format
+
+As explained in a previous section, the input expected by the **GreenGuard** system consists of
+two tables which need to be passed as `pandas.DataFrame` objects:
+
+* The `target_times` table, which contains the specification of the problem that we are solving
+  in the form of training examples with a `turbine_id`, a `cutoff_time` and a `target` value.
+* The `readings` table, which contains the signal readings from the different sensors, with
+  `turbine_id`, `signal_id`, `timestamp` and `value` fields.
+
+However, in most scenarios the size of the available data will far exceed the memory limitations
+of the system on which **GreenGuard** is being run, so loading all the data in a single
+`pandas.DataFrame` will not be possible.
+
+In order to solve this situation, **GreenGuard** provides a [CSVLoader](
+https://d3-ai.github.io/GreenGuard/api/greenguard.loaders.csv.html#greenguard.loaders.csv.CSVLoader)
+class which can be used to load data from what we call the **Raw Data Format**.
+
+### Raw Data Format
+
+The **Raw Data Format** consists of a collection of CSV files stored in a single folder with the
+following structure:
+
+#### Folder Structure
+
+* All the data from all the turbines is inside a single folder, which here we will call `readings`.
+* Inside the `readings` folder, one folder exists for each turbine, named exactly like the turbine:
+    * `readings/T001`
+    * `readings/T002`
+    * ...
+* Inside each turbine folder one CSV file exists for each month, named `%Y-%m.csv`.
+    * `readings/T001/2010-01.csv`
+    * `readings/T001/2010-02.csv`
+    * `readings/T001/2010-03.csv`
+    * ...
+
+#### CSV Contents
+
+* Each CSV file contains three columns:
+    * `signal_id`: name or id of the signal.
+    * ``timestamp``: timestamp of the reading formatted as ``%m/%d/%y %H:%M:%S``.
+    * `value`: value of the reading.
+
+This is an example of what a CSV's contents look like:
+
+|    | signal_id   | timestamp         |   value |
+|----|-------------|-------------------|---------|
+|  0 | S1          | 01/01/01 00:00:00 |       1 |
+|  1 | S1          | 01/01/01 12:00:00 |       2 |
+|  2 | S1          | 01/02/01 00:00:00 |       3 |
+|  3 | S1          | 01/02/01 12:00:00 |       4 |
+|  4 | S1          | 01/03/01 00:00:00 |       5 |
+|  5 | S1          | 01/03/01 12:00:00 |       6 |
+|  6 | S2          | 01/01/01 00:00:00 |       7 |
+|  7 | S2          | 01/01/01 12:00:00 |       8 |
+|  8 | S2          | 01/02/01 00:00:00 |       9 |
+|  9 | S2          | 01/02/01 12:00:00 |      10 |
+| 10 | S2          | 01/03/01 00:00:00 |      11 |
+| 11 | S2          | 01/03/01 12:00:00 |      12 |
diff --git a/README.md b/README.md
index e585050..4e4b086 100644
--- a/README.md
+++ b/README.md
@@ -26,7 +26,7 @@ AutoML for Renewable Energy Industries.
 - Documentation: https://signals-dev.github.io/GreenGuard
 - Homepage: https://github.com/signals-dev/GreenGuard
 
-# Overview
+## Overview
 
 The GreenGuard project is a collection of end-to-end solutions for machine learning problems
 commonly found in monitoring wind energy production systems. Most tasks utilize sensor data
@@ -44,6 +44,11 @@ The salient aspects of this customized project are:
 * A robust continuous integration and testing infrastructure.
 * A ``learning database`` recording all past outcomes --> tasks, pipelines, outcomes.
 
+## Resources
+
+* [Data Format](DATA_FORMAT.md).
+* [GreenGuard folder structure](DATA_FORMAT.md#folder-structure).
+
 # Install
 
 ## Requirements

From c887f4e7a908e8e80178cb5a64632a49de8f141f Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Mon, 21 Sep 2020 20:09:07 +0200
Subject: [PATCH 095/171] Update links

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4e4b086..adbc11e 100644
--- a/README.md
+++ b/README.md
@@ -141,7 +141,7 @@
 Apart from the in-memory data format explained above, which is limited by the memory
 allocation capabilities of the system where it is run, **GreenGuard** is also prepared to
 load and work with data stored as a collection of CSV files, drastically increasing the amount
 of data which it can work with. Further details about this format can be found in the
-[project documentation site](https://signals-dev.github.io/GreenGuard/advanced_usage/csv.html).
+[project documentation site](DATA_FORMAT.md#csv-format).
 
 # Quickstart

From d176a3527e7e2541c959464b041c527e70df5a7e Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 22 Sep 2020 01:56:50 +0200
Subject: [PATCH 096/171] Curate dependencies

---
 setup.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 8ffd589..7872309 100644
--- a/setup.py
+++ b/setup.py
@@ -16,11 +16,13 @@
 history = ''
 
 install_requires = [
+    'boto3==1.14.44',
+    'botocore==1.17.44',
+    'baytune>=0.3.9,<0.4',
     'Keras>=2.1.6,<2.4',
     'mlblocks>=0.3.4,<0.4',
     'mlprimitives>=0.2.5,<0.3',
     'scipy>=1.0.1,<1.4.0',
-    'baytune>=0.3.9,<0.4',
     'numpy>=1.15.4,<1.17',
     'pymongo>=3.7.2,<4',
     'scikit-learn>=0.20.1,<0.21',

From 0526bb5f8cf31f393dfdaacd075bbcd561f33733 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Fri, 25 Sep 2020 15:22:32 +0200
Subject: [PATCH 097/171] Add release notes for v0.2.4

---
 HISTORY.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/HISTORY.md b/HISTORY.md
index 0607be3..abc8226 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,5 +1,9 @@
 # History
 
+## 0.2.4 - 2020-09-25
+
+* Fix dependency errors
+
 ## 0.2.3 - 2020-08-10
 
 * Added benchmarking module.
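
Following the Raw Data Format documented in the patch above, a target times CSV
plus a readings folder can be loaded through the `CSVLoader` used throughout
this series. A minimal sketch, with placeholder paths and illustrative `rule`
and `window_size` values:

```python
import pandas as pd

from greenguard.loaders import CSVLoader

# 'target_times.csv' and 'readings' are placeholder paths; 'readings' is
# expected to follow the folder structure described in DATA_FORMAT.md.
target_times = pd.read_csv('target_times.csv', parse_dates=['cutoff_time'])

csv_loader = CSVLoader('readings', rule='4h', aggregation='mean')
target_times, readings = csv_loader.load(target_times, window_size='3d')
```

Only the readings that fall inside each training example's window are kept in
memory, which is what makes this path viable for datasets that do not fit in a
single DataFrame.
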
From e5a4c389193e108348202b138988925c6a89c4b3 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 25 Sep 2020 15:22:40 +0200 Subject: [PATCH 098/171] =?UTF-8?q?Bump=20version:=200.2.4.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 20eb561..80c8ff7 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.4.dev0' +__version__ = '0.2.4' import os diff --git a/setup.cfg b/setup.cfg index 44d3a12..3d700c2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.4.dev0 +current_version = 0.2.4 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 7872309..c1ee678 100644 --- a/setup.py +++ b/setup.py @@ -105,6 +105,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.4.dev0', + version='0.2.4', zip_safe=False, ) From f8649547f6e6659d103acd862a994d7a6a040955 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 25 Sep 2020 16:41:00 +0200 Subject: [PATCH 099/171] =?UTF-8?q?Bump=20version:=200.2.4=20=E2=86=92=200?= =?UTF-8?q?.2.5.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 80c8ff7..4b2b765 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.4' +__version__ = '0.2.5.dev0' import os diff --git a/setup.cfg b/setup.cfg index 3d700c2..5fdd02b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.4 +current_version = 0.2.5.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py
index c1ee678..a7cd421 100644
--- a/setup.py
+++ b/setup.py
@@ -105,6 +105,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.4',
+    version='0.2.5.dev0',
     zip_safe=False,
 )

From dc30fb3ab4754787f9a0ade7429c56091c74859d Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Tue, 29 Sep 2020 20:12:02 +0200
Subject: [PATCH 100/171] Add command line for benchmark

---
 greenguard/benchmark.py | 167 +++++++++++++++++++++++++++++-----------
 setup.py                |   1 +
 2 files changed, 88 insertions(+), 80 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index 281039a..d59f370 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -1,10 +1,12 @@
+import argparse
 import logging
 import os
 import pickle
+import sys
+import warnings
 from itertools import product
 
 import pandas as pd
-from dask.distributed import Client, LocalCluster
 from sklearn.model_selection import train_test_split
 from tqdm import tqdm
 
@@ -17,12 +19,6 @@
 LOGGER = logging.getLogger(__name__)
 
 
-def _create_client(n_workers=16, dashboard_address=':9792'):
-    cluster = LocalCluster(n_workers=n_workers, dashboard_address=dashboard_address)
-    client = Client(cluster)
-    return client
-
-
 def _build_init_params(template, window_size, rule, template_params):
@@ -287,9 +283,7 @@
-def _generate_target_times_readings(target_times, readings_path, window_size, rule,
-                                    signals, aggregation, datetime_fmt='%m/%d/%y %H:%M:%S',
-                                    filename_fmt='%Y-%m.csv'):
+def _generate_target_times_readings(target_times, readings_path, window_size, rule, signals):
     """
     Returns:
         pandas.DataFrame:
             Table of readings for the target times, including the columns ``turbine_id``,
             ``signal_id``, ``timestamp`` and ``value``.
     """
     csv_loader = CSVLoader(
         readings_path,
         rule=rule,
-        aggregation=aggregation,
-        datetime_fmt=datetime_fmt,
-        filename_fmt=filename_fmt,
     )
 
     return csv_loader.load(target_times, window_size=window_size, signals=signals)
 
 
-def make_problem(target_times_paths, readings_path, window_size_resample_rule, output_path,
-                 signals=None, aggregation='mean', datetime_fmt='%m/%d/%y %H:%M:%S',
-                 filename_fmt='%Y-%m.csv', n_workers=16, dashboard_address=':9792'):
+def make_problem(target_times_paths, readings_path, window_size_resample_rule,
+                 output_path, signals=None):
     """
     Args:
         target_times_paths (list):
             List of paths to CSVs that contain target times.
         readings_path (str):
             Path to the folder where readings in raw CSV format can be found.
         window_size_resample_rule (list):
             List of tuples (int, str or Timedelta object).
         output_path (str):
             Path to save the generated problems.
         signals (str):
             List of signal names or csv file that has a `signal_id` column to use as the signal
             names list.
-        aggregation (str):
-            Aggregation to apply to the readings.
-        datetime_fmt (str):
-            Date format used by the column timestamp for the readings. Defaults
-            to `%m/%d/%y %H:%M:%S`.
-        filename_fmt (str):
-            Filename format. Defaults to `%Y-%m.csv`.
-        n_workers (int):
-        dashboard_address (str):
     """
-    client = _create_client(n_workers, dashboard_address)
     generated_problems = list()
     target_times_paths = as_list(target_times_paths)
 
-    for tt_path in tqdm(target_times_paths):
+    for target_time_path in tqdm(target_times_paths):
         for window_size, rule in window_size_resample_rule:
+            target_times = pd.read_csv(target_time_path, parse_dates=['cutoff_time'])
             new_target_times, readings = _generate_target_times_readings(
-                tt_path,
+                target_times,
                 readings_path,
+                window_size,
                 rule,
-                aggregation,
-                signals,
-                datetime_fmt,
-                filename_fmt
+                signals=signals,
             )
 
             problem_name = 'problem_{}_{}.pkl'.format(window_size, rule)
@@ -328,12 +335,17 @@
             generated_problems.append(output_pickle_path)
 
-    client.shutdown()
-
     return generated_problems
 
 
-def benchmark(templates, problem_paths=None, target_times_paths=None, readings_path=None,
-              window_size_resample_rule=None, signals=None, tuning_iterations=100, preprocessing=0,
-              init_params=None, aggregation='mean', cost=False, cv_splits=5, metric='f1',
-              test_size=0.33, random_state=0, cache_path=None, n_workers=16,
-              dashboard_address=':9792', output_path=None, datetime_fmt='%m/%d/%y %H:%M:%S',
-              filename_fmt='%Y-%m.csv'):
+def benchmark(templates, problem_paths=None, target_times_paths=None, readings_path=None,
+              window_size_resample_rule=None, signals=None, tuning_iterations=100,
+              preprocessing=None, init_params=None, cost=False, cv_splits=5, metric='f1',
+              test_size=0.33, random_state=0, cache_path=None, output_path=None):
     """
     Args:
         templates (list):
         problem_paths (list):
         target_times_paths (list):
         readings_path (str):
         window_size_resample_rule (list):
             List of tuples (int, str or Timedelta object).
         signals (str):
             List of signal names or csv file that has a `signal_id` column to use as the signal
             names list.
-        aggregation (str):
-            Aggregation to apply to the readings.
-        datetime_fmt (str):
-            Date format used by the column timestamp for the readings. Defaults
-            to `%m/%d/%y %H:%M:%S`.
-        n_workers
-        dashboard_address
-        filename_fmt (str):
-            Filename format. Defaults to `%Y-%m.csv`.
tuning_iterations (int): preprocessing : init_params : @@ -394,7 +360,6 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p test_size : cv_splits (int): cache_path (str): - output_path (str): """ templates = as_list(templates) @@ -402,21 +367,17 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p if target_times_paths: target_times_paths = as_list(target_times_paths) - if not readings_path: + if readings_path is None: raise ValueError('Missing readings path.') - client = _create_client(n_workers, dashboard_adress) - for tt_path in tqdm(target_times_paths): for window_size, rule in window_size_resample_rule: + target_times = pd.read_csv(tt_path, parse_dates=['cutoff_time']) target_times, readings = _generate_target_times_readings( tt_path, readings_path, rule, - aggregation, signals, - datetime_fmt, - filename_fmt, ) df = evaluate_templates( @@ -438,37 +399,83 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p results.append(df) - client.shutdown() - else: - problem_paths = as_list(problem_paths) + problem_paths = as_list(problem_paths) for problem_path in tqdm(problem_paths): with open(problem_path, 'rb') as pickle_file: - target_times, readings, window_size, rule = pickle.load(pickle_file) - - df = evaluate_templates( - templates, - [(window_size, rule)], - metric=metric, - tuning_iterations=tuning_iterations, - init_params=init_params, - target_times=target_times, - readings=readings, - preprocessing=preprocessing, - cost=cost, - test_size=test_size, - cv_splits=cv_splits, - random_state=random_state, - cache_path=cache_path, - output_path=None - ) - - results.append(df) + target_times, readings, pickle_window_size, pickle_rule = pickle.load(pickle_file) + + if window_size_resample_rule is None: + window_size_resample_rule = [(pickle_window_size, pickle_rule)] + + for window_size, resample_rule in window_size_resample_rule: + + # window_size can be only smaller than pickle window size + # resample rule can be only bigger than picke rule + if (pd.to_timedelta(pickle_window_size) >= pd.to_timedelta(window_size) + and pd.to_timedelta(pickle_rule) <= pd.to_timedelta(resample_rule)): # noqa W503 + + df = evaluate_templates( + templates, + [(window_size, rule)], + metric=metric, + tuning_iterations=tuning_iterations, + init_params=init_params, + target_times=target_times, + readings=readings, + preprocessing=preprocessing, + cost=cost, + test_size=test_size, + cv_splits=cv_splits, + random_state=random_state, + cache_path=cache_path, + output_path=None + ) + + results.append(df) + + else: + msg = (f'Invalid window size of {window_size} for {pickle_window_size}' + f' or invalid resample rule {resample_rule} for {pickle_rule}.') + LOGGER.info(msg) results = pd.concat(results, ignore_index=True) if output_path: - results.to_csv(output_path) + results.to_csv(output_path, index=False) + + else: + return results + + +def _get_parser(): + parser = argparse.ArgumentParser(description='GreenGuard Benchmark Command Line Interface.') + parser.set_defaults(action=benchmark) + + # Add arguments + parser.add_argument('-t', '--templates', nargs='+', help='List of templates to try.') + parser.add_argument('-p', '--problems', nargs='+', help='Paths to problems to be benchmarked.') + parser.add_argument('-w', '--window-size-resample-rule', nargs='+', + help='List of tuples with window size and resample rule to benchmark.') + parser.add_argument('-i', '--tuning-iterations', type=int, default=100, + help='Number of 
tuning iterations to perform per problem per pipeline.') + + return parser + + +def main(): + warnings.filterwarnings("ignore") + + # Parse args + parser = _get_parser() + args = parser.parse_args() + if args.templates is None: + parser.print_help() + sys.exit(0) + + args.action(**args) + - return results +if __name__ == '__main__': + main() diff --git a/setup.py b/setup.py index 8ffd589..098b67a 100644 --- a/setup.py +++ b/setup.py @@ -16,6 +16,7 @@ history = '' install_requires = [ + 'matplotlib<3.2.2', 'Keras>=2.1.6,<2.4', 'mlblocks>=0.3.4,<0.4', 'mlprimitives>=0.2.5,<0.3', From d66d52cf1b3394a0eed85fc11a9124a52643a3a2 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Mon, 5 Oct 2020 19:41:11 +0200 Subject: [PATCH 101/171] Rename make_problems, export them with the given problem name and update return values if no output_path is given --- greenguard/benchmark.py | 40 ++++++++++++++++++++++++++-------------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index d59f370..a441a94 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -298,8 +298,8 @@ def _generate_target_times_readings(target_times, readings_path, window_size, ru return csv_loader.load(target_times, window_size=window_size, signals=signals) -def make_problem(target_times_paths, readings_path, window_size_resample_rule, - output_path, signals=None): +def make_problems(target_times_paths, readings_path, window_size_resample_rule, + output_path=None, signals=None): """ Args: target_times_paths (list): @@ -314,10 +314,17 @@ def make_problem(target_times_paths, readings_path, window_size_resample_rule, List of signal names or csv file that has a `signal_id` column to use as the signal names list. """ - generated_problems = list() - target_times_paths = as_list(target_times_paths) + if isinstance(target_times_paths, str): + target_times_paths = [target_times_paths] + if isinstance(target_times_paths, list): + target_times_paths = {os.path.basename(path)[:-4]: path for path in target_times_paths} - for target_time_path in tqdm(target_times_paths): + if output_path: + generated_problems = list() + else: + generated_problems = {} + + for name, target_time_path in tqdm(target_times_paths.values()): for window_size, rule in window_size_resample_rule: target_times = pd.read_csv(target_time_path, parse_dates=['cutoff_time']) new_target_times, readings = _generate_target_times_readings( @@ -328,12 +335,17 @@ def make_problem(target_times_paths, readings_path, window_size_resample_rule, signals=signals, ) - problem_name = 'problem_{}_{}.pkl'.format(window_size, rule) - output_pickle_path = os.path.join(output_path, problem_name) - with open(output_pickle_path, 'wb') as pickle_file: - pickle.dump((new_target_times, readings, window_size, rule), pickle_file) + problem_name = '{}_{}_{}.pkl'.format(name, window_size, rule) - generated_problems.append(output_pickle_path) + if output_path: + output_pickle_path = os.path.join(output_path, problem_name) + with open(output_pickle_path, 'wb') as pickle_file: + pickle.dump((new_target_times, readings, window_size, rule), pickle_file) + + generated_problems.append(output_pickle_path) + + else: + generated_problems[problem_name] = (new_target_times, readings, window_size, rule) return generated_problems @@ -371,18 +383,18 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p raise ValueError('Missing readings path.') for tt_path in tqdm(target_times_paths): - for window_size, rule 
in window_size_resample_rule: + for window_size, resample_rule in window_size_resample_rule: target_times = pd.read_csv(tt_path, parse_dates=['cutoff_time']) target_times, readings = _generate_target_times_readings( tt_path, readings_path, - rule, + resample_rule, signals, ) df = evaluate_templates( templates, - [(window_size, rule)], + [(window_size, resample_rule)], metric=metric, tuning_iterations=tuning_iterations, init_params=init_params, @@ -418,7 +430,7 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p df = evaluate_templates( templates, - [(window_size, rule)], + [(window_size, resample_rule)], metric=metric, tuning_iterations=tuning_iterations, init_params=init_params, From 321932fe9a3a6865d46fdf26b467ed4ea6fb8009 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Wed, 7 Oct 2020 13:17:05 +0200 Subject: [PATCH 102/171] Benchmarking and command line implementation --- greenguard/benchmark.py | 276 ++++++++++++++++++++++++++++------------ 1 file changed, 193 insertions(+), 83 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index a441a94..3747653 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -4,16 +4,20 @@ import pickle import sys import warnings +from datetime import datetime from itertools import product import pandas as pd +import tabulate from sklearn.model_selection import train_test_split from tqdm import tqdm +from greenguard import get_pipelines from greenguard.demo import load_demo from greenguard.loaders import CSVLoader from greenguard.metrics import METRICS from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing +from greenguard.results import summarize_results from greenguard.utils import as_list LOGGER = logging.getLogger(__name__) @@ -97,7 +101,10 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter scores (dict): Stores the four types of scores that are being evaluate. 
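+
+    Example:
+        A minimal sketch of a call, using the bundled demo data. The template
+        name refers to one of the pipelines shipped with the project, and the
+        low iteration count is only for illustration:
+
+        >>> from greenguard.demo import load_demo
+        >>> target_times, readings = load_demo()
+        >>> scores = evaluate_template(
+        ...     'unstack_normalize_dfs_xgb_classifier',
+        ...     target_times,
+        ...     readings,
+        ...     tuning_iterations=5,
+        ... )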
""" + start_time = datetime.utcnow() + scores = dict() + scores['metric'] = metric train, test = train_test_split(target_times, test_size=test_size, random_state=random_state) @@ -115,14 +122,18 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter ) # Computing the default test score + fit_predict_time = datetime.utcnow() pipeline.fit(train, readings) predictions = pipeline.predict(test, readings) + fit_predict_time = datetime.utcnow() - fit_predict_time scores['default_test'] = metric(test['target'], predictions) # Computing the default cross validation score + cv_time = datetime.utcnow() session = pipeline.tune(train, readings) session.run(1) + cv_time = datetime.utcnow() - cv_time scores['default_cv'] = pipeline.cv_score @@ -136,14 +147,17 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter predictions = pipeline.predict(test, readings) scores['tuned_test'] = metric(test['target'], predictions) + scores['fit_predict_time'] = fit_predict_time + scores['cv_time'] = cv_time + scores['total_time'] = datetime.utcnow() - start_time return scores -def evaluate_templates(templates, window_size_rule, metric='f1', - tuning_iterations=50, init_params=None, target_times=None, - readings=None, preprocessing=0, cost=False, test_size=0.25, - cv_splits=3, random_state=0, cache_path=None, output_path=None): +def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iterations=50, + init_params=None, target_times=None, readings=None, preprocessing=0, + cost=False, test_size=0.25, cv_splits=3, random_state=0, cache_path=None, + cache_results=None, problem_name=None, output_path=None): """Execute the benchmark process and optionally store the result as a ``CSV``. Args: @@ -239,6 +253,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', window_size, rule = window_rule scores = dict() + scores['problem_name'] = problem_name scores['template'] = template scores['window_size'] = window_size scores['resample_rule'] = rule @@ -270,11 +285,34 @@ def evaluate_templates(templates, window_size_rule, metric='f1', scores['status'] = 'ERRORED' LOGGER.exception('Could not score template %s ', template) + if cache_results: + os.makedirs(cache_results, exist_ok=True) + template_name = template + if os.path.isfile(template_name): + template_name = os.path.basename(template_name).replace('.json', '') + + file_name = '{}_{}_{}_{}.csv'.format(problem_name, template_name, window_size, rule) + pd.DataFrame([scores]).to_csv(os.path.join(cache_results, file_name), index=False) + scores_list.append(scores) results = pd.DataFrame.from_records(scores_list) - results = results.reindex(['template', 'window_size', 'resample_rule', 'default_test', - 'default_cv', 'tuned_cv', 'tuned_test', 'status'], axis=1) + columns = [ + 'problem_name', + 'window_size', + 'resample_rule', + 'template', + 'default_test', + 'default_cv', + 'tuned_cv', + 'tuned_test', + 'fit_predict_time', + 'cv_time', + 'total_time', + 'status', + ] + + results = results.reindex(columns, axis=1) if output_path: LOGGER.info('Saving benchmark report to %s', output_path) @@ -317,14 +355,17 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, if isinstance(target_times_paths, str): target_times_paths = [target_times_paths] if isinstance(target_times_paths, list): - target_times_paths = {os.path.basename(path)[:-4]: path for path in target_times_paths} + target_times_paths = { + os.path.basename(path).replace('.csv', ''): path + for path in 
target_times_paths + } if output_path: generated_problems = list() else: generated_problems = {} - for name, target_time_path in tqdm(target_times_paths.values()): + for problem_name, target_time_path in tqdm(target_times_paths.values()): for window_size, rule in window_size_resample_rule: target_times = pd.read_csv(target_time_path, parse_dates=['cutoff_time']) new_target_times, readings = _generate_target_times_readings( @@ -335,7 +376,7 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, signals=signals, ) - problem_name = '{}_{}_{}.pkl'.format(name, window_size, rule) + problem_name = '{}_{}_{}.pkl'.format(problem_name, window_size, rule) if output_path: output_pickle_path = os.path.join(output_path, problem_name) @@ -350,21 +391,16 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, return generated_problems -def benchmark(templates, problem_paths=None, target_times_paths=None, readings_path=None, - window_size_resample_rule=None, signals=None, tuning_iterations=100, - preprocessing=None, init_params=None, cost=False, cv_splits=5, metric='f1', - test_size=0.33, random_state=0, cache_path=None, output_path=None): +def run_benchmark(templates, problem_paths=None, window_size_resample_rule=None, + tuning_iterations=100, preprocessing=0, init_params=None, cost=False, + cv_splits=5, metric='f1', test_size=0.33, random_state=0, cache_path=None, + output_path=None, cache_results=None): """ Args: templates (list): problem_paths (list): - target_times_paths (list): - readings_path (str): window_size_resample_rule (list): List of tupples (int, str or Timedelta object). - signals (str): - List of signal names or csv file that has a `signal_id` column to use as the signal - names list. tuning_iterations (int): preprocessing : init_params : @@ -376,21 +412,23 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p """ templates = as_list(templates) results = list() + problem_paths = as_list(problem_paths) - if target_times_paths: - target_times_paths = as_list(target_times_paths) - if readings_path is None: - raise ValueError('Missing readings path.') - - for tt_path in tqdm(target_times_paths): - for window_size, resample_rule in window_size_resample_rule: - target_times = pd.read_csv(tt_path, parse_dates=['cutoff_time']) - target_times, readings = _generate_target_times_readings( - tt_path, - readings_path, - resample_rule, - signals, - ) + for problem_path in tqdm(problem_paths): + with open(problem_path, 'rb') as pickle_file: + target_times, readings, pickle_window_size, pickle_rule = pickle.load(pickle_file) + + problem_name = '_'.join(os.path.basename(problem_path).split('_')[:-2]) + + if window_size_resample_rule is None: + window_size_resample_rule = [(pickle_window_size, pickle_rule)] + + for window_size, resample_rule in window_size_resample_rule: + + # window_size can be only smaller than pickle window size + # resample rule can be only bigger than picke rule + if (pd.to_timedelta(pickle_window_size) >= pd.to_timedelta(window_size) + and pd.to_timedelta(pickle_rule) <= pd.to_timedelta(resample_rule)): # noqa W503 df = evaluate_templates( templates, @@ -406,51 +444,22 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p cv_splits=cv_splits, random_state=random_state, cache_path=cache_path, + cache_results=cache_results, + problem_name=problem_name, output_path=None ) results.append(df) - else: + if cache_results: + file_name = '{}_{}_{}.csv'.format(problem_name, 
window_size, resample_rule) + df.to_csv(os.path.join(cache_results, file_name), index=False) + + else: + msg = 'Invalid window size or resample rule {}.'.format( + (window_size, pickle_window_size, resample_rule, pickle_rule)) - problem_paths = as_list(problem_paths) - for problem_path in tqdm(problem_paths): - with open(problem_path, 'rb') as pickle_file: - target_times, readings, pickle_window_size, pickle_rule = pickle.load(pickle_file) - - if window_size_resample_rule is None: - window_size_resample_rule = [(pickle_window_size, pickle_rule)] - - for window_size, resample_rule in window_size_resample_rule: - - # window_size can be only smaller than pickle window size - # resample rule can be only bigger than picke rule - if (pd.to_timedelta(pickle_window_size) >= pd.to_timedelta(window_size) - and pd.to_timedelta(pickle_rule) <= pd.to_timedelta(resample_rule)): # noqa W503 - - df = evaluate_templates( - templates, - [(window_size, resample_rule)], - metric=metric, - tuning_iterations=tuning_iterations, - init_params=init_params, - target_times=target_times, - readings=readings, - preprocessing=preprocessing, - cost=cost, - test_size=test_size, - cv_splits=cv_splits, - random_state=random_state, - cache_path=cache_path, - output_path=None - ) - - results.append(df) - - else: - msg = (f'Invalid window size of {window_size} for {pickle_window_size}' - f' or invalid resample rule {resample_rule} for {pickle_rule}.') - LOGGER.info(msg) + LOGGER.info(msg) results = pd.concat(results, ignore_index=True) @@ -461,17 +470,118 @@ def benchmark(templates, problem_paths=None, target_times_paths=None, readings_p return results +def _run(args): + # Logger setup + log_level = (3 - args.verbose) * 10 + fmt = '%(asctime)s - %(process)d - %(levelname)s - %(name)s - %(module)s - %(message)s' + logging.basicConfig(level=log_level, format=fmt) + logging.getLogger("botocore").setLevel(logging.ERROR) + logging.getLogger("hyperopt").setLevel(logging.ERROR) + logging.getLogger("ax").setLevel(logging.ERROR) + logging.getLogger("urllib3").setLevel(logging.CRITICAL) + + if args.templates is None: + args.templates = get_pipelines() + + window_size_resample_rule = None + if args.window_size: + window_size_resample_rule = list(product(args.window_size, args.resample_rule)) + + # run + results = run_benchmark( + templates=args.templates, + problem_paths=args.problem_paths, + window_size_resample_rule=window_size_resample_rule, + cv_splits=args.cv_splits, + metric=args.metric, + test_size=args.test_size, + random_state=args.random_state, + cache_path=args.cache_path, + cache_results=args.cache_results, + tuning_iterations=args.iterations, + output_path=args.output_path, + ) + + if not args.output_path: + print(tabulate.tabulate( + results, + tablefmt='github', + headers=results.columns + )) + + +def _summary(args): + summarize_results(args.input, args.output) + + +def _create(args): + window_size_resample_rule = list(product(args.window_size, args.resample_rule)) + make_problems( + args.target_times_paths, + args.readings_path, + window_size_resample_rule, + output_path=args.output_path, + signals=args.signals + ) + + def _get_parser(): parser = argparse.ArgumentParser(description='GreenGuard Benchmark Command Line Interface.') - parser.set_defaults(action=benchmark) - - # Add arguments - parser.add_argument('-t', '--templates', nargs='+', help='List of templates to try.') - parser.add_argument('-p', '--problems', nargs='+', help='Paths to problems to be benchmarked.') - parser.add_argument('-w', 
'--window-size-resample-rule', nargs='+', - help='List of tuples with window size and resample rule to benchmark.') - parser.add_argument('-i', '--tuning-iterations', type=int, default=100, - help='Number of tuning iterations to perform per problem per pipeline.') + parser.set_defaults(action=None) + action = parser.add_subparsers(title='action') + action.required = True + + # Run action + run = action.add_parser('run', help='Run the GreenGuard Benchmark') + run.set_defaults(action=_run) + run.set_defaults(user=None) + + run.add_argument('-v', '--verbose', action='http://github.com/count', default=0, + help='Be verbose. Use -vv for increased verbosity.') + run.add_argument('-t', '--templates', nargs='+', + help='Perform benchmarking over the given list of templates.') + run.add_argument('-p', '--problem-paths', nargs='+', required=False, + help='Perform benchmarking over a list of pkl problems.') + run.add_argument('-w', '--window-size', nargs='+', required=False, + help='List of window sizes values to benchmark.') + run.add_argument('-r', '--resample-rule', nargs='+', required=False, + help='List of resample rule to benchmark.') + run.add_argument('-o', '--output_path', type=str, + help='Output path where to store the results.') + run.add_argument('-s', '--cv-splits', type=int, default=5, + help='Amount of cross validation splits to use.') + run.add_argument('-m', '--metric', type=str, default='f1', + help='Name of metric function to be used during benchmarking.') + run.add_argument('-n', '--random-state', type=int, default=0, + help='Random state for the cv splits.') + run.add_argument('-e', '--test-size', type=float, default=0.33, + help='Percentage of the data set to be used for the test.') + run.add_argument('-c', '--cache-path', type=str, + help='Path to cache the generated cross validation splits in.') + run.add_argument('-R', '--cache-results', type=str, + help='Path to store the csv files for each problem and template.') + run.add_argument('-i', '--iterations', type=int, default=100, + help='Number of iterations to perform per challenge with each candidate.') + + # Summarize action + summary = action.add_parser('summary', help='Summarize the GreenGuard Benchmark results') + summary.set_defaults(action=_summary) + summary.add_argument('input', nargs='+', help='Input path with results.') + summary.add_argument('output', help='Output file.') + + # Create action + create = action.add_parser('create', help='Create GreenGuard problems') + create.set_defaults(action=_create) + create.add_argument('target-times-paths', nargs='+', help='List of target times paths.') + create.add_argument('readings-path', type=str, help='Path to the readings folder.') + create.add_argument('-w', '--window-size', nargs='+', required=False, + help='List of window sizes values to benchmark.') + create.add_argument('-r', '--resample-rule', nargs='+', required=False, + help='List of resample rule to benchmark.') + create.add_argument('-o', '--output', type=str, + help='Output path where to save the generated problems.') + create.add_argument('-s', '--signals', type=str, + help='Path to csv file that has signal_id column to use as the signal') return parser @@ -481,12 +591,12 @@ def main(): # Parse args parser = _get_parser() - args = parser.parse_args() - if args.templates is None: + if len(sys.argv) < 2: parser.print_help() sys.exit(0) - args.action(**args) + args = parser.parse_args() + args.action(args) if __name__ == '__main__': From 043009662196a666741a36ecb645365cae69d16e Mon Sep 17 00:00:00 2001 
From: Plamen Valentinov Kolev Date: Wed, 7 Oct 2020 17:44:20 +0200 Subject: [PATCH 103/171] Allow dictionary usage as problems --- greenguard/benchmark.py | 95 +++++++++++++++++++++++------------------ setup.py | 2 + 2 files changed, 56 insertions(+), 41 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index 3747653..c29af4b 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -2,6 +2,7 @@ import logging import os import pickle +import re import sys import warnings from datetime import datetime @@ -17,11 +18,25 @@ from greenguard.loaders import CSVLoader from greenguard.metrics import METRICS from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing -from greenguard.results import summarize_results from greenguard.utils import as_list LOGGER = logging.getLogger(__name__) +LEADERBOARD_COLUMNS = [ + 'problem_name', + 'window_size', + 'resample_rule', + 'template', + 'default_test', + 'default_cv', + 'tuned_cv', + 'tuned_test', + 'fit_predict_time', + 'cv_time', + 'total_time', + 'status', +] + def _build_init_params(template, window_size, rule, template_params): if 'dfs' in template: @@ -241,7 +256,6 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio 3 normalize_dfs_xgb_classifier 7d 4h 0.581818 0.619698 0.650367 0.603774 OK """ # noqa - if readings is None and target_times is None: target_times, readings = load_demo() @@ -292,27 +306,13 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio template_name = os.path.basename(template_name).replace('.json', '') file_name = '{}_{}_{}_{}.csv'.format(problem_name, template_name, window_size, rule) - pd.DataFrame([scores]).to_csv(os.path.join(cache_results, file_name), index=False) + df = pd.DataFrame([scores]).reindex(LEADERBOARD_COLUMNS, axis=1) + df.to_csv(os.path.join(cache_results, file_name), index=False) scores_list.append(scores) results = pd.DataFrame.from_records(scores_list) - columns = [ - 'problem_name', - 'window_size', - 'resample_rule', - 'template', - 'default_test', - 'default_cv', - 'tuned_cv', - 'tuned_test', - 'fit_predict_time', - 'cv_time', - 'total_time', - 'status', - ] - - results = results.reindex(columns, axis=1) + results = results.reindex(LEADERBOARD_COLUMNS, axis=1) if output_path: LOGGER.info('Saving benchmark report to %s', output_path) @@ -365,7 +365,7 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, else: generated_problems = {} - for problem_name, target_time_path in tqdm(target_times_paths.values()): + for problem_name, target_time_path in tqdm(target_times_paths.items()): for window_size, rule in window_size_resample_rule: target_times = pd.read_csv(target_time_path, parse_dates=['cutoff_time']) new_target_times, readings = _generate_target_times_readings( @@ -376,22 +376,22 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, signals=signals, ) - problem_name = '{}_{}_{}.pkl'.format(problem_name, window_size, rule) + pickle_name = '{}_{}_{}'.format(problem_name, window_size, rule) if output_path: - output_pickle_path = os.path.join(output_path, problem_name) + output_pickle_path = os.path.join(output_path, pickle_name + '.pkl') with open(output_pickle_path, 'wb') as pickle_file: pickle.dump((new_target_times, readings, window_size, rule), pickle_file) generated_problems.append(output_pickle_path) else: - generated_problems[problem_name] = (new_target_times, readings, window_size, rule) + 
generated_problems[pickle_name] = (new_target_times, readings, window_size, rule) return generated_problems -def run_benchmark(templates, problem_paths=None, window_size_resample_rule=None, +def run_benchmark(templates, problems=None, window_size_resample_rule=None, tuning_iterations=100, preprocessing=0, init_params=None, cost=False, cv_splits=5, metric='f1', test_size=0.33, random_state=0, cache_path=None, output_path=None, cache_results=None): @@ -412,23 +412,30 @@ def run_benchmark(templates, problem_paths=None, window_size_resample_rule=None, """ templates = as_list(templates) results = list() - problem_paths = as_list(problem_paths) - - for problem_path in tqdm(problem_paths): - with open(problem_path, 'rb') as pickle_file: - target_times, readings, pickle_window_size, pickle_rule = pickle.load(pickle_file) + if isinstance(problems, str): + problems = [problems] + if isinstance(problems, list): + problems = { + '_'.join(os.path.basename(problem).split('_')[:-2]): problem + for problem in problems + } - problem_name = '_'.join(os.path.basename(problem_path).split('_')[:-2]) + for problem_name, problem in tqdm(problems.items()): + if isinstance(problem, str): + with open(problem, 'rb') as pickle_file: + target_times, readings, orig_window_size, orig_rule = pickle.load(pickle_file) + else: + target_times, readings, orig_window_size, orig_rule = problem if window_size_resample_rule is None: - window_size_resample_rule = [(pickle_window_size, pickle_rule)] + window_size_resample_rule = [(orig_window_size, orig_rule)] for window_size, resample_rule in window_size_resample_rule: # window_size can be only smaller than pickle window size # resample rule can be only bigger than picke rule - if (pd.to_timedelta(pickle_window_size) >= pd.to_timedelta(window_size) - and pd.to_timedelta(pickle_rule) <= pd.to_timedelta(resample_rule)): # noqa W503 + if (pd.to_timedelta(orig_window_size) >= pd.to_timedelta(window_size) + and pd.to_timedelta(orig_rule) <= pd.to_timedelta(resample_rule)): # noqa W503 df = evaluate_templates( templates, @@ -457,7 +464,7 @@ def run_benchmark(templates, problem_paths=None, window_size_resample_rule=None, else: msg = 'Invalid window size or resample rule {}.'.format( - (window_size, pickle_window_size, resample_rule, pickle_rule)) + (window_size, orig_window_size, resample_rule, orig_rule)) LOGGER.info(msg) @@ -484,13 +491,17 @@ def _run(args): args.templates = get_pipelines() window_size_resample_rule = None - if args.window_size: - window_size_resample_rule = list(product(args.window_size, args.resample_rule)) + if args.window_size_resample_rule: + pattern = re.compile(r'\d+[DdHhMmSs]') + window_size_resample_rule = [ + tuple(pattern.findall(item)) + for item in args.window_size_resample_rule + ] # run results = run_benchmark( templates=args.templates, - problem_paths=args.problem_paths, + problems=args.problems, window_size_resample_rule=window_size_resample_rule, cv_splits=args.cv_splits, metric=args.metric, @@ -510,6 +521,10 @@ def _run(args): )) +def summarize_results(input_path, output_path): + pass + + def _summary(args): summarize_results(args.input, args.output) @@ -540,12 +555,10 @@ def _get_parser(): help='Be verbose. 
Use -vv for increased verbosity.') run.add_argument('-t', '--templates', nargs='+', help='Perform benchmarking over the given list of templates.') - run.add_argument('-p', '--problem-paths', nargs='+', required=False, + run.add_argument('-p', '--problems', nargs='+', required=False, help='Perform benchmarking over a list of pkl problems.') - run.add_argument('-w', '--window-size', nargs='+', required=False, + run.add_argument('-w', '--window-size-resample-rule', nargs='+', required=False, help='List of window sizes values to benchmark.') - run.add_argument('-r', '--resample-rule', nargs='+', required=False, - help='List of resample rule to benchmark.') run.add_argument('-o', '--output_path', type=str, help='Output path where to store the results.') run.add_argument('-s', '--cv-splits', type=int, default=5, diff --git a/setup.py b/setup.py index f78ff68..b671c9c 100644 --- a/setup.py +++ b/setup.py @@ -16,10 +16,12 @@ history = '' install_requires = [ + 'xlsxwriter>=1.3.6<1.4', 'matplotlib<3.2.2', 'boto3==1.14.44', 'botocore==1.17.44', 'baytune>=0.3.9,<0.4', + 'tabulate>=0.8.3,<0.9', 'Keras>=2.1.6,<2.4', 'mlblocks>=0.3.4,<0.4', 'mlprimitives>=0.2.5,<0.3', From 0b0d3bb592c71e1fbbdc39b1e77de2a6c3e81190 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Wed, 7 Oct 2020 19:47:46 +0200 Subject: [PATCH 104/171] WIP --- greenguard/benchmark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index c29af4b..4d6e397 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -416,11 +416,14 @@ def run_benchmark(templates, problems=None, window_size_resample_rule=None, problems = [problems] if isinstance(problems, list): problems = { - '_'.join(os.path.basename(problem).split('_')[:-2]): problem + os.path.basename(problem).replace('.pkl', ''): problem for problem in problems } for problem_name, problem in tqdm(problems.items()): + # remove window_size resample_rule nomenclature from the problem's name + problem_name = re.sub(r'\_\d+[DdHhMmSs]', r'', problem_name) + if isinstance(problem, str): with open(problem, 'rb') as pickle_file: target_times, readings, orig_window_size, orig_rule = pickle.load(pickle_file) From be89b4c5355a4db252f49540e38a5fb7698fcba7 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 8 Oct 2020 10:19:49 +0200 Subject: [PATCH 105/171] Update documentation and add logging. 
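
The -v/--verbose count from the command line is now translated into standard
logging levels by the new _setup_logging helper. As a rough sketch of the
mapping implied by the `(3 - args.verbose) * 10` expression in this patch:

    # -v count -> logging level
    #    0     -> 30 (WARNING)
    #    1     -> 20 (INFO)
    #    2     -> 10 (DEBUG)
    log_level = (3 - args.verbose) * 10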
--- greenguard/benchmark.py | 214 ++++++++++++++++++++++++++++++---------- 1 file changed, 164 insertions(+), 50 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index 4d6e397..9bbfea7 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -18,7 +18,6 @@ from greenguard.loaders import CSVLoader from greenguard.metrics import METRICS from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing -from greenguard.utils import as_list LOGGER = logging.getLogger(__name__) @@ -31,6 +30,7 @@ 'default_cv', 'tuned_cv', 'tuned_test', + 'metric', 'fit_predict_time', 'cv_time', 'total_time', @@ -172,14 +172,14 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iterations=50, init_params=None, target_times=None, readings=None, preprocessing=0, cost=False, test_size=0.25, cv_splits=3, random_state=0, cache_path=None, - cache_results=None, problem_name=None, output_path=None): + cache_results=None, problem_name=None, output_path=None, progress_bar=None): """Execute the benchmark process and optionally store the result as a ``CSV``. Args: templates (list): List of templates to try. window_size_rule (list): - List of tupples (int, str or Timedelta object). + List of tuples (int, str or Timedelta object). metric (function or str): Metric to use. If an ``str`` is give it must be one of the metrics defined in the ``greenguard.metrics.METRICS`` dictionary. @@ -273,10 +273,14 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio scores['resample_rule'] = rule try: + LOGGER.info('Evaluating template %s on problem %s (%s, %s)', + template, problem_name, window_size, rule) + template_params = init_params[template] template_params = _build_init_params(template, window_size, rule, template_params) template_preprocessing = preprocessing[template] + result = evaluate_template( template=template, target_times=target_times, @@ -310,6 +314,7 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio df.to_csv(os.path.join(cache_results, file_name), index=False) scores_list.append(scores) + progress_bar.update(1) results = pd.DataFrame.from_records(scores_list) results = results.reindex(LEADERBOARD_COLUMNS, axis=1) @@ -338,14 +343,25 @@ def _generate_target_times_readings(target_times, readings_path, window_size, ru def make_problems(target_times_paths, readings_path, window_size_resample_rule, output_path=None, signals=None): - """ + """Make problems with the target times and readings for each window size and resample rule. + + Create problems in the accepted format by ``run_benchmark`` as pickle files containing: + + * ``target_times``: ``pandas.DataFrame`` containing the target times. + * ``readings``: ``pandas.DataFrame`` containing the readings for the target times. + * ``window_size``: window size value used. + * ``resample_rule``: resample rule value used. + + Or return a ``dict`` containing as keys the names of the problems generated and tuples with + the previously specified fields of target times, readings, window size and resample rule. + Args: target_times_paths (list): List of paths to CSVs that contain target times. readings_path (str): Path to the folder where readings in raw CSV format can be found. window_size_resample_rule (list): - List of tupples (int, str or Timedelta object). + List of tuples (int, str or Timedelta object). 
output_path (str):
            Path to save the generated problems.
        signals (str):
@@ -365,6 +381,9 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule,
     else:
         generated_problems = {}

+    if isinstance(signals, str) and os.path.exists(signals):
+        signals = pd.read_csv(signals).signal_id
+
     for problem_name, target_time_path in tqdm(target_times_paths.items()):
         for window_size, rule in window_size_resample_rule:
             target_times = pd.read_csv(target_time_path, parse_dates=['cutoff_time'])
@@ -379,6 +398,7 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule,
             pickle_name = '{}_{}_{}'.format(problem_name, window_size, rule)

             if output_path:
+                os.makedirs(output_path, exist_ok=True)
                 output_pickle_path = os.path.join(output_path, pickle_name + '.pkl')
                 with open(output_pickle_path, 'wb') as pickle_file:
                     pickle.dump((new_target_times, readings, window_size, rule), pickle_file)
@@ -391,26 +411,95 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule,
     return generated_problems


-def run_benchmark(templates, problems=None, window_size_resample_rule=None,
-                  tuning_iterations=100, preprocessing=0, init_params=None, cost=False,
-                  cv_splits=5, metric='f1', test_size=0.33, random_state=0, cache_path=None,
-                  output_path=None, cache_results=None):
-    """
+def run_benchmark(templates, problems, window_size_resample_rule=None,
+                  tuning_iterations=50, signals=None, preprocessing=0, init_params=None,
+                  metric='f1', cost=False, cv_splits=5, test_size=0.33, random_state=0,
+                  cache_path=None,cache_results=None, output_path=None):
+    """Execute the benchmark function and optionally store the result as a ``CSV``.
+
+    This function provides a user-friendly interface to interact with the
+    ``evaluate_templates`` function. It allows the user to specify an ``output_path``
+    where the results can be stored. If this path is not provided, a
+    ``pandas.DataFrame`` will be returned.
+
+    This function evaluates each template against each problem for each possible
+    window size and resample rule, and will tune each template for the given amount
+    of tuning iterations.
+
+    The problems can be a pickle file that contains the following values:
+
+        * ``target_times``: ``pandas.DataFrame`` containing the target times.
+        * ``readings``: ``pandas.DataFrame`` containing the readings for the target times.
+        * ``window_size``: window size value used.
+        * ``resample_rule``: resample rule value used.
+
+    Or it can be a dictionary containing the problem's name and as values either a path
+    to a pickle file or a tuple containing the previously specified fields.
+
     Args:
-        templates (list):
-        problem_paths (list):
+        templates (str or list):
+            Name of the json pipelines that will be evaluated against the problems.
+        problems (str, list or dict):
+            There are three possible values for problems:
+
+                * ``str``: Path to a given problem stored as a pickle file (pkl).
+                * ``list``: List of paths to given problems stored as pickle files (pkl).
+                * ``dict``: A dict containing as keys the name of the problem and as value
+                  the path to a pickle file or a tuple with target times and readings data
+                  frames and the window size and resample rule used to generate this
+                  problem.
+
+            The pickle files have to contain a tuple with target times and readings data
+            frames and the window size and resample rule used to generate that problem.
+            We recommend using the function ``make_problems`` to generate those files.
+
        window_size_resample_rule (list):
+            List of tuples (int, str or Timedelta object).
        tuning_iterations (int):
-        preprocessing :
-        init_params :
-        cost :
-        test_size :
+            Amount of tuning iterations to perform over each template.
+        signals (str or list):
+            Path to a csv file containing a ``signal_id`` column that we would like to
+            use, or a ``list`` of signals that we would like to use. If ``None``, use
+            all the signals from the readings.
+        preprocessing (int, dict or list):
+            There are three possible values for preprocessing:
+
+                * ``int``: the value will be used for all templates.
+                * ``dict`` with the template name as a key and a number as a value, which
+                  will be used for that template.
+                * ``list``: each value will be assigned to the corresponding position of
+                  self.templates.
+
+            Defaults to ``0``.
+        init_params (dict or list):
+            There are three possible values for init_params:
+
+                * Init params ``dict``: It will be used for all templates.
+                * ``dict`` with the name of the template as a key and a dictionary with
+                  its init params.
+                * ``list``: each value will be assigned to the corresponding position of
+                  self.templates.
+
+            Defaults to ``None``.
+        metric (function or str):
+            Metric to use. If an ``str`` is given, it must be one of the metrics
+            defined in the ``greenguard.metrics.METRICS`` dictionary.
+        cost (bool):
+            Whether the metric is a cost function (the lower the better) or not.
+            Defaults to ``False``.
        cv_splits (int):
+            Number of cross validation folds to use. Defaults to ``5``.
+        test_size (float):
+            Amount of data that will be saved for test, represented as a percentage
+            between 0 and 1.
+        random_state (int or RandomState):
+            Random state to use for the cross validation partitioning. Defaults to ``0``.
        cache_path (str):
+            If given, cache the generated cross validation splits in this folder.
+            Defaults to ``None``.
+        cache_results (str):
+            If provided, store the progress of each pipeline and each problem while
+            running.
        output_path (str):
+            If provided, store the results to the given filename. Defaults to ``None``.
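+
+    Example:
+        A minimal usage sketch. The paths below are illustrative placeholders, and
+        the template name refers to one of the pipelines bundled with the project:
+
+        >>> from greenguard.benchmark import make_problems, run_benchmark
+        >>> problems = make_problems(
+        ...     target_times_paths=['path/to/target_times.csv'],
+        ...     readings_path='path/to/readings',
+        ...     window_size_resample_rule=[('1d', '1h')],
+        ... )
+        >>> results = run_benchmark(
+        ...     templates='unstack_normalize_dfs_xgb_classifier',
+        ...     problems=problems,
+        ...     tuning_iterations=10,
+        ... )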
""" - templates = as_list(templates) + templates = templates if isinstance(templates, (list, tuple)) else [templates] results = list() if isinstance(problems, str): problems = [problems] @@ -420,7 +509,14 @@ def run_benchmark(templates, problems=None, window_size_resample_rule=None, for problem in problems } - for problem_name, problem in tqdm(problems.items()): + if signals is not None: + if isinstance(signals, str) and os.path.exists(signals): + signals = pd.read_csv(signals).signal_id + + total_runs = len(templates) * len(problems) * len(window_size_resample_rule or [1]) + pbar = tqdm(total=total_runs) + + for problem_name, problem in problems.items(): # remove window_size resample_rule nomenclature from the problem's name problem_name = re.sub(r'\_\d+[DdHhMmSs]', r'', problem_name) @@ -430,15 +526,20 @@ def run_benchmark(templates, problems=None, window_size_resample_rule=None, else: target_times, readings, orig_window_size, orig_rule = problem - if window_size_resample_rule is None: - window_size_resample_rule = [(orig_window_size, orig_rule)] + if signals is not None: + readings = readings[readings.signal_id.isin(signals)] + + wsrr = window_size_resample_rule or [(orig_window_size, orig_rule)] + + orig_window_size = pd.to_timedelta(orig_window_size) + orig_rule = pd.to_timedelta(orig_rule) - for window_size, resample_rule in window_size_resample_rule: + for window_size, resample_rule in wsrr: # window_size can be only smaller than pickle window size # resample rule can be only bigger than picke rule - if (pd.to_timedelta(orig_window_size) >= pd.to_timedelta(window_size) - and pd.to_timedelta(orig_rule) <= pd.to_timedelta(resample_rule)): # noqa W503 + if (orig_window_size >= pd.to_timedelta(window_size) + and orig_rule <= pd.to_timedelta(resample_rule)): # noqa W503 df = evaluate_templates( templates, @@ -456,7 +557,8 @@ def run_benchmark(templates, problems=None, window_size_resample_rule=None, cache_path=cache_path, cache_results=cache_results, problem_name=problem_name, - output_path=None + output_path=None, + progress_bar=pbar ) results.append(df) @@ -466,30 +568,36 @@ def run_benchmark(templates, problems=None, window_size_resample_rule=None, df.to_csv(os.path.join(cache_results, file_name), index=False) else: + pbar.update(1) + msg = 'Invalid window size or resample rule {}.'.format( (window_size, orig_window_size, resample_rule, orig_rule)) - LOGGER.info(msg) + LOGGER.warn(msg) - results = pd.concat(results, ignore_index=True) + pbar.close() + results = pd.concat(results, ignore_index=True) if output_path: + os.makedirs(output_path, exist_ok=True) results.to_csv(output_path, index=False) else: return results -def _run(args): +def _setup_logging(args): # Logger setup log_level = (3 - args.verbose) * 10 fmt = '%(asctime)s - %(process)d - %(levelname)s - %(name)s - %(module)s - %(message)s' - logging.basicConfig(level=log_level, format=fmt) + logging.basicConfig(filename=args.logfile, level=log_level, format=fmt) logging.getLogger("botocore").setLevel(logging.ERROR) - logging.getLogger("hyperopt").setLevel(logging.ERROR) - logging.getLogger("ax").setLevel(logging.ERROR) + logging.getLogger("tensorflow").setLevel(logging.ERROR) logging.getLogger("urllib3").setLevel(logging.CRITICAL) + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # FATAL + +def _run(args): if args.templates is None: args.templates = get_pipelines() @@ -514,6 +622,7 @@ def _run(args): cache_results=args.cache_results, tuning_iterations=args.iterations, output_path=args.output_path, + signals=args.signals, ) if not 
args.output_path: @@ -528,11 +637,11 @@ def summarize_results(input_path, output_path): pass -def _summary(args): +def _summarize_results(args): summarize_results(args.input, args.output) -def _create(args): +def _make_problems(args): window_size_resample_rule = list(product(args.window_size, args.resample_rule)) make_problems( args.target_times_paths, @@ -556,6 +665,8 @@ def _get_parser(): run.add_argument('-v', '--verbose', action='http://github.com/count', default=0, help='Be verbose. Use -vv for increased verbosity.') + run.add_argument('-l', '--logfile', + help='Log file.') run.add_argument('-t', '--templates', nargs='+', help='Perform benchmarking over the given list of templates.') run.add_argument('-p', '--problems', nargs='+', required=False, @@ -578,33 +689,34 @@ def _get_parser(): help='Path to store the csv files for each problem and template.') run.add_argument('-i', '--iterations', type=int, default=100, help='Number of iterations to perform per challenge with each candidate.') + run.add_argument('-S', '--signals', type=str, + help='Path to csv file that has signal_id column to use as the signal') # Summarize action - summary = action.add_parser('summary', help='Summarize the GreenGuard Benchmark results') - summary.set_defaults(action=_summary) + summary = action.add_parser('summarize-results', + help='Summarize the GreenGuard Benchmark results') + summary.set_defaults(action=_summarize_results) summary.add_argument('input', nargs='+', help='Input path with results.') summary.add_argument('output', help='Output file.') - # Create action - create = action.add_parser('create', help='Create GreenGuard problems') - create.set_defaults(action=_create) - create.add_argument('target-times-paths', nargs='+', help='List of target times paths.') - create.add_argument('readings-path', type=str, help='Path to the readings folder.') - create.add_argument('-w', '--window-size', nargs='+', required=False, - help='List of window sizes values to benchmark.') - create.add_argument('-r', '--resample-rule', nargs='+', required=False, - help='List of resample rule to benchmark.') - create.add_argument('-o', '--output', type=str, - help='Output path where to save the generated problems.') - create.add_argument('-s', '--signals', type=str, - help='Path to csv file that has signal_id column to use as the signal') + # Make problems action + problems = action.add_parser('make-problems', help='Create GreenGuard problems') + problems.set_defaults(action=_make_problems) + problems.add_argument('target-times-paths', nargs='+', help='List of target times paths.') + problems.add_argument('readings-path', type=str, help='Path to the readings folder.') + problems.add_argument('-w', '--window-size', nargs='+', required=False, + help='List of window sizes values to benchmark.') + problems.add_argument('-r', '--resample-rule', nargs='+', required=False, + help='List of resample rule to benchmark.') + problems.add_argument('-o', '--output', type=str, + help='Output path where to save the generated problems.') + problems.add_argument('-s', '--signals', type=str, + help='Path to csv file that has signal_id column to use as the signal') return parser def main(): - warnings.filterwarnings("ignore") - # Parse args parser = _get_parser() if len(sys.argv) < 2: @@ -612,8 +724,10 @@ def main(): sys.exit(0) args = parser.parse_args() + _setup_logging(args) args.action(args) if __name__ == '__main__': + warnings.filterwarnings("ignore") main() From 9884001ee822f9aab629e8a172a3b1ad34bf309a Mon Sep 17 00:00:00 2001 
From: Plamen Valentinov Kolev Date: Thu, 8 Oct 2020 10:20:13 +0200 Subject: [PATCH 106/171] Fix lint --- greenguard/benchmark.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index 9bbfea7..1731c58 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -280,7 +280,6 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio template_params = _build_init_params(template, window_size, rule, template_params) template_preprocessing = preprocessing[template] - result = evaluate_template( template=template, target_times=target_times, @@ -414,7 +413,7 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, def run_benchmark(templates, problems, window_size_resample_rule=None, tuning_iterations=50, signals=None, preprocessing=0, init_params=None, metric='f1', cost=False, cv_splits=5, test_size=0.33, random_state=0, - cache_path=None,cache_results=None, output_path=None): + cache_path=None, cache_results=None, output_path=None): """Execute the benchmark function and optionally store the result as a ``CSV``. This function provides a user-friendly interface to interact with the ``evaluate_templates`` From 28780f8e725cb8f41308e45d7d9d3cfe833028ea Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 8 Oct 2020 10:21:39 +0200 Subject: [PATCH 107/171] Update dependencies --- setup.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 3fe60b7..f29bf8f 100644 --- a/setup.py +++ b/setup.py @@ -16,20 +16,23 @@ history = '' install_requires = [ - 'xlsxwriter>=1.3.6<1.4', - 'matplotlib<3.2.2', - 'boto3==1.14.44', - 'botocore==1.17.44', 'baytune>=0.3.9,<0.4', - 'tabulate>=0.8.3,<0.9', - 'Keras>=2.1.6,<2.4', 'mlblocks>=0.3.4,<0.4', 'mlprimitives>=0.2.5,<0.3', + 'pymongo>=3.7.2,<4', + 'scikit-learn>=0.20.0,<0.21', + 'tqdm<4.50.0,>=4.36.1', + 'cloudpickle>=1.6,<2', 'scipy>=1.0.1,<1.4.0', 'numpy>=1.15.4,<1.17', - 'pymongo>=3.7.2,<4', - 'scikit-learn>=0.20.1,<0.21', - 'dask>=2.6.0,<3' + 'pandas>=0.23.4,<0.25', + 'dask>=2.6.0,<3', + 'Keras>=2.1.6,<2.4', + 'tabulate>=0.8.3,<0.9', + 'xlsxwriter>=1.3.6<1.4', + #'matplotlib<3.2.2', + 'boto3==1.14.44', + 'botocore==1.17.44', ] setup_requires = [ @@ -71,6 +74,7 @@ # Advanced testing 'coverage>=4.5.1,<6', 'tox>=2.9.1,<4', + 'importlib-metadata<2,>=0.12', ] setup( From 6aa8ed69d5bf88e4aa94c7dd8938e1fb7ed897ad Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 8 Oct 2020 10:21:59 +0200 Subject: [PATCH 108/171] Turn off verbosity --- greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json b/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json index b0550ee..5c82d77 100644 --- a/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json +++ b/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json @@ -42,7 +42,7 @@ "encode": false, "max_depth": -1, "copy": true, - "verbose": true, + "verbose": false, "n_jobs": 1, "training_window": "1d" } From 254b264512cb4adcc319c5db269f26faa371c1fe Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Thu, 8 Oct 2020 10:29:44 +0200 Subject: [PATCH 109/171] Load pipeline from a file --- greenguard/pipeline.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index a46c2c6..4d5fb86 100644 
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -283,7 +283,11 @@ def _get_templates(self, templates):
         for template in templates:
             if isinstance(template, str):
                 template_name = template
-                template = deepcopy(load_pipeline(template_name))
+                if os.path.isfile(template):
+                    with open(template, 'r') as json_file:
+                        template = json.load(json_file)
+                else:
+                    template = deepcopy(load_pipeline(template_name))
             else:
                 template_name = md5(json.dumps(template)).digest()

From 83e2c6217709dac1510b5337970d2df802e5598c Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Thu, 8 Oct 2020 14:24:11 +0200
Subject: [PATCH 110/171] Add summarize

---
 greenguard/benchmark.py |  20 ++++++-
 greenguard/results.py   | 119 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 136 insertions(+), 3 deletions(-)
 create mode 100644 greenguard/results.py

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index 1731c58..4d5dd81 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -18,6 +18,7 @@
 from greenguard.loaders import CSVLoader
 from greenguard.metrics import METRICS
 from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing
+from greenguard.results import load_results, write_results

 LOGGER = logging.getLogger(__name__)

@@ -597,6 +598,7 @@ def _setup_logging(args):

 def _run(args):
+    _setup_logging(args)
     if args.templates is None:
         args.templates = get_pipelines()

@@ -632,8 +634,21 @@ def _run(args):
         ))


-def summarize_results(input_path, output_path):
-    pass
+def summarize_results(input_paths, output_path):
+    """Load multiple benchmark results CSV files and compile a summary.
+
+    The result is an Excel file with one tab for each results CSV file
+    and an additional Number of Wins tab with a summary.
+
+    Args:
+        input_paths (list):
+            List of paths to CSV files where the benchmark results are stored.
+            These files must contain the columns produced by ``run_benchmark``.
+        output_path (str):
+            Path, including the filename, where the Excel file will be created.
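+
+    Example:
+        A minimal usage sketch; the file names below are illustrative placeholders:
+
+        >>> summarize_results(
+        ...     ['results/problem_a.csv', 'results/problem_b.csv'],
+        ...     'benchmark_summary.xlsx',
+        ... )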
+ """ + results = load_results(input_paths) + write_results(results, output_path) def _summarize_results(args): @@ -723,7 +738,6 @@ def main(): sys.exit(0) args = parser.parse_args() - _setup_logging(args) args.action(args) diff --git a/greenguard/results.py b/greenguard/results.py new file mode 100644 index 0000000..869c26d --- /dev/null +++ b/greenguard/results.py @@ -0,0 +1,119 @@ +import os + +import pandas as pd + + +def load_results(files): + problems_results = dict() + for filename in files: + problem = os.path.basename(filename).replace('.csv', '') + problems_results[problem] = pd.read_csv(filename, index_col=0).round(6) + + return problems_results + + +def get_wins_by_problems(results): + df = results.groupby('problem_name')['template', 'window_size', 'resample_rule', 'tuned_test'] + df = df.apply(max) + df = df.rename(columns={'tuned_test': 'score'}) + + return df + + +def get_exclusive_wins(scores, column, pivot_columns=['window_size', 'resample_rule']): + summary = {} + for problem in scores.problem_name.unique(): + df = scores[scores['problem_name'] == problem] + df['wr'] = df.apply( + lambda row: '{}_{}'.format(row[pivot_columns[0]], row[pivot_columns[1]]), axis=1) + df = df.pivot(index='wr', columns=column, values='tuned_test') + + is_winner = df.T.rank(method='min', ascending=False) == 1 + num_winners = is_winner.sum() + is_exclusive = num_winners == 1 + is_exclusive_winner = is_winner & is_exclusive + summary[problem] = is_exclusive_winner.sum(axis=1) + + summary_df = pd.DataFrame(summary) + summary_df.index.name = 'template' + columns = summary_df.columns.sort_values(ascending=False) + return summary_df[columns] + + +def add_sheet(dfs, name, writer, cell_fmt, index_fmt, header_fmt): + startrow = 0 + widths = [0] + if not isinstance(dfs, dict): + dfs = {None: dfs} + + for df_name, df in dfs.items(): + df = df.reset_index() + startrow += bool(df_name) + df.to_excel(writer, sheet_name=name, startrow=startrow + 1, index=False, header=False) + + worksheet = writer.sheets[name] + + if df_name: + worksheet.write(startrow - 1, 0, df_name, index_fmt) + widths[0] = max(widths[0], len(df_name)) + + for idx, column in enumerate(df.columns): + worksheet.write(startrow, idx, column, header_fmt) + width = max(len(column), *df[column].astype(str).str.len()) + 1 + if len(widths) > idx: + widths[idx] = max(widths[idx], width) + else: + widths.append(width) + + startrow += len(df) + 2 + + for idx, width in enumerate(widths): + fmt = cell_fmt if idx else index_fmt + worksheet.set_column(idx, idx, width + 1, fmt) + + +def write_results(results, output): + writer = pd.ExcelWriter(output, engine='xlsxwriter') + cell_fmt = writer.book.add_format({ + "font_name": "Arial", + "font_size": "10" + }) + index_fmt = writer.book.add_format({ + "font_name": "Arial", + "font_size": "10", + "bold": True, + }) + header_fmt = writer.book.add_format({ + "font_name": "Arial", + "font_size": "10", + "bold": True, + "bottom": 1 + }) + + if isinstance(results, dict): + results = pd.concat(list(results.values()), ignore_index=True) + + window = get_exclusive_wins(results, 'window_size', ['window_size', 'tuned_test']) + + resample_pivots = ['resample_rule', ['problem_name', 'tuned_test']] + resample = get_exclusive_wins(results, 'resample_rule', resample_pivots) + + summary = { + 'Best pipeline by Problem': get_wins_by_problems(results), + 'Rankings - Number of wins': get_exclusive_wins(results, 'template'), + 'Resample Rule': resample, + 'Window Size': window + } + add_sheet(summary, 'Summary', writer, cell_fmt, 
From d529174f7f6b14d0f59dfb75ae548003b24274d5 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Thu, 8 Oct 2020 15:04:27 +0200
Subject: [PATCH 111/171] Fix tests

---
 greenguard/benchmark.py |  4 +++-
 setup.py                |  1 -
 tests/test_benchmark.py | 14 ++++++++++++--
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index 4d5dd81..d4807b8 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -314,7 +314,9 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
             df.to_csv(os.path.join(cache_results, file_name), index=False)

             scores_list.append(scores)
-            progress_bar.update(1)
+
+            if progress_bar:
+                progress_bar.update(1)

     results = pd.DataFrame.from_records(scores_list)
     results = results.reindex(LEADERBOARD_COLUMNS, axis=1)
diff --git a/setup.py b/setup.py
index f29bf8f..91ac49f 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,6 @@
     'Keras>=2.1.6,<2.4',
     'tabulate>=0.8.3,<0.9',
     'xlsxwriter>=1.3.6<1.4',
-    #'matplotlib<3.2.2',
     'boto3==1.14.44',
     'botocore==1.17.44',
 ]
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index 4dfe576..c18a565 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -32,13 +32,18 @@ def test_predict():

     # assert
     expected_columns = [
-        'template',
+        'problem_name',
         'window_size',
         'resample_rule',
+        'template',
         'default_test',
         'default_cv',
         'tuned_cv',
         'tuned_test',
+        'metric',
+        'fit_predict_time',
+        'cv_time',
+        'total_time',
         'status'
     ]

@@ -46,11 +51,16 @@ def test_predict():
         'object',
         'object',
         'object',
+        'object',
+        'float64',
         'float64',
         'float64',
         'float64',
         'float64',
+        'float64',
+        'float64',
+        'float64',
         'object',
     ]

     assert (scores_df.columns.to_list() == expected_columns)

From 676dcc514bc168c387df061b70a47a6842a572ea Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Thu, 8 Oct 2020 17:16:17 +0200
Subject: [PATCH 112/171] Add default cv time and average cv time

---
 greenguard/benchmark.py | 12 ++++++++----
 tests/test_benchmark.py |  4 +++-
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py
index d4807b8..1585b75 100644
--- a/greenguard/benchmark.py
+++ b/greenguard/benchmark.py
@@ -33,7 +33,8 @@
     'tuned_test',
     'metric',
     'fit_predict_time',
-    'cv_time',
+    'default_cv_time',
+    'average_cv_time',
     'total_time',
     'status',
 ]
@@ -146,15 +147,17 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
     scores['default_test'] = metric(test['target'], predictions)

     # Computing the default cross validation score
-    cv_time = datetime.utcnow()
+    default_cv_time = datetime.utcnow()
     session = pipeline.tune(train, readings)
     session.run(1)
-    cv_time = datetime.utcnow() - cv_time
+    default_cv_time = datetime.utcnow() - default_cv_time

     scores['default_cv'] = pipeline.cv_score

     # Computing the cross validation score with tuned hyperparameters
+    average_cv_time = datetime.utcnow()
     session.run(tuning_iterations)
+    average_cv_time = (datetime.utcnow() - average_cv_time) / tuning_iterations

     scores['tuned_cv'] = pipeline.cv_score

@@ -164,7 +167,8 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
     scores['tuned_test'] = metric(test['target'], predictions)

     scores['fit_predict_time'] = fit_predict_time
-    scores['cv_time'] = cv_time
+    scores['default_cv_time'] = default_cv_time
+    scores['default_cv_time'] = default_cv_time
     scores['total_time'] = datetime.utcnow() - start_time

     return scores
diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py
index c18a565..a1a2d6f 100644
--- a/tests/test_benchmark.py
+++ b/tests/test_benchmark.py
@@ -42,7 +42,8 @@ def test_predict():
         'tuned_test',
         'metric',
         'fit_predict_time',
-        'cv_time',
+        'default_cv_time',
+        'average_cv_time',
         'total_time',
         'status'
     ]
@@ -60,6 +61,7 @@ def test_predict():
         'float64',
         'float64',
         'float64',
+        'float64',
         'object',
     ]
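The timing added in this patch is plain wall-clock measurement with `datetime.utcnow()`. A self-contained sketch of the same pattern, with `run_one_iteration` as a hypothetical stand-in for `session.run`:

```python
import time
from datetime import datetime

tuning_iterations = 5

def run_one_iteration():
    # Hypothetical stand-in for a single tuning iteration.
    time.sleep(0.1)

start = datetime.utcnow()
for _ in range(tuning_iterations):
    run_one_iteration()

# End minus start yields a positive timedelta; dividing by the number
# of iterations gives the average time spent per tuning iteration.
average_cv_time = (datetime.utcnow() - start) / tuning_iterations
print(average_cv_time)
```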
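The `parse` expression in the `setup.cfg` hunks above and below controls how bumpversion splits a version string into named parts. A quick sanity check of that pattern using only the standard library, with the two version strings these commits move between:

```python
import re

# Same pattern as the setup.cfg `parse` option.
PARSE = (
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
)

print(re.match(PARSE, '0.2.5.dev0').groupdict())
# {'major': '0', 'minor': '2', 'patch': '5', 'release': 'dev', 'candidate': '0'}
print(re.match(PARSE, '0.2.5').groupdict())
# {'major': '0', 'minor': '2', 'patch': '5', 'release': None, 'candidate': None}
```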
From 39674e3fa8d8a685413162e8133aefaa8c2163eb Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Fri, 9 Oct 2020 11:25:09 +0200
Subject: [PATCH 116/171] =?UTF-8?q?Bump=20version:=200.2.5=20=E2=86=92=200?=
 =?UTF-8?q?.2.6.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 greenguard/__init__.py | 2 +-
 setup.cfg              | 2 +-
 setup.py               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/greenguard/__init__.py b/greenguard/__init__.py
index cc7e309..662545d 100644
--- a/greenguard/__init__.py
+++ b/greenguard/__init__.py
@@ -4,7 +4,7 @@

 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.5'
+__version__ = '0.2.6.dev0'

 import os

diff --git a/setup.cfg b/setup.cfg
index 6614256..919f5d6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.5
+current_version = 0.2.6.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index c3f7366..3bce783 100644
--- a/setup.py
+++ b/setup.py
@@ -111,6 +111,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/D3-AI/GreenGuard',
-    version='0.2.5',
+    version='0.2.6.dev0',
     zip_safe=False,
 )

From e0ec043e1d0b4124705e2b22110caf8d8c2ea140 Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com>
Date: Fri, 23 Oct 2020 00:26:16 +0200
Subject: [PATCH 117/171] Intermediate outputs (#52)

* Step by step execution and pipeline inspection notebook

* Add step by step pipeline notebooks.

* Update default value for output_ when predicting.

* Set verbose to false.

* Rerun notebooks.

Co-authored-by: Carles Sala
---
 greenguard/pipeline.py                        |   27 +-
 .../normalize_dfs_xgb_classifier.json         |    2 +-
 .../normalize_dfs_xgb_classifier.ipynb        | 1697 +++++++++++
 ...ck_double_lstm_timeseries_classifier.ipynb | 2501 +++++++++++++++++
 .../unstack_lstm_timeseries_classifier.ipynb  | 2375 ++++++++++++++++
 ...unstack_normalize_dfs_xgb_classifier.ipynb | 1785 ++++++++++++
 6 files changed, 8379 insertions(+), 8 deletions(-)
 create mode 100644 tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb
 create mode 100644 tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb
 create mode 100644 tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb
 create mode 100644 tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb

diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index 4d5fb86..34504e3 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -527,7 +527,8 @@ def tune(self, target_times, readings, turbines=None):
         tunables = self._get_tunables(self._template_dicts)
         return BTBSession(tunables, scoring_function, maximize=not self._cost)

-    def fit(self, target_times, readings, turbines=None):
+    def fit(self, target_times=None, readings=None, turbines=None,
+            start_=None, output_=None, **kwargs):
         """Fit this pipeline to the given data.

         Args:
@@ -539,12 +540,23 @@ def fit(self, target_times, readings, turbines=None):
             turbines (pandas.DataFrame):
                 ``turbines`` table.
""" - X = target_times[['turbine_id', 'cutoff_time']] - y = target_times['target'] - self._pipeline.fit(X, y, readings=readings, turbines=turbines) - self.fitted = True + if target_times is None: + X = kwargs.pop('X') + y = kwargs.pop('y') + else: + X = target_times[['turbine_id', 'cutoff_time']] + y = target_times['target'] + + out = self._pipeline.fit(X, y, readings=readings, turbines=turbines, + start_=start_, output_=output_, **kwargs) + + if output_ is None: + self.fitted = True + + return out - def predict(self, target_times, readings, turbines=None): + def predict(self, target_times=None, readings=None, turbines=None, + start_=None, output_='default', **kwargs): """Make predictions using this pipeline. Args: @@ -564,7 +576,8 @@ def predict(self, target_times, readings, turbines=None): raise NotFittedError() X = target_times[['turbine_id', 'cutoff_time']] - return self._pipeline.predict(X, readings=readings, turbines=turbines) + return self._pipeline.predict(X, readings=readings, turbines=turbines, + start_=start_, output_=output_, **kwargs) def save(self, path): """Serialize and save this pipeline using cloudpickle. diff --git a/greenguard/pipelines/normalize_dfs_xgb_classifier.json b/greenguard/pipelines/normalize_dfs_xgb_classifier.json index 3d7d4d2..8039d12 100644 --- a/greenguard/pipelines/normalize_dfs_xgb_classifier.json +++ b/greenguard/pipelines/normalize_dfs_xgb_classifier.json @@ -44,7 +44,7 @@ "encode": false, "max_depth": -1, "copy": true, - "verbose": true, + "verbose": false, "n_jobs": 1, "training_window": "1d" } diff --git a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb new file mode 100644 index 0000000..5bcb1ea --- /dev/null +++ b/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb @@ -0,0 +1,1697 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# normalize_dfs_xgb_classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from greenguard.demo import load_demo\n", + "\n", + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'normalize_dfs_xgb_classifier'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from greenguard.pipeline import GreenGuardPipeline\n", + "\n", + "pipeline = GreenGuardPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['pandas.DataFrame.resample',\n", + " 'featuretools.EntitySet.entity_from_dataframe',\n", + " 'featuretools.EntitySet.normalize_entity',\n", + " 'featuretools.EntitySet.normalize_entity',\n", + " 'featuretools.dfs',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'xgboost.XGBClassifier']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation (part of GreenGuard Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "step = 0\n", + "context = pipeline.fit(target_times, readings, output_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10 00:00:00323.0
1T001S012013-01-10 00:10:00346.0
2T001S012013-01-10 00:20:00407.0
3T001S012013-01-10 00:30:00257.0
4T001S012013-01-10 00:40:00267.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 00:00:00 323.0\n", + "1 T001 S01 2013-01-10 00:10:00 346.0\n", + "2 T001 S01 2013-01-10 00:20:00 407.0\n", + "3 T001 S01 2013-01-10 00:30:00 257.0\n", + "4 T001 S01 2013-01-10 00:40:00 267.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.EntitySet.entity_from_dataframe\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: entityset\n", + "* Effect: Entityset has been generated from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entityset: entityset\n", + " Entities:\n", + " readings [Rows: 1329146, Columns: 5]\n", + " Relationships:\n", + " No relationships" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['entityset']" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10 00:00:00323.0
1T001S012013-01-10 00:10:00346.0
2T001S012013-01-10 00:20:00407.0
3T001S012013-01-10 00:30:00257.0
4T001S012013-01-10 00:40:00267.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 00:00:00 323.0\n", + "1 T001 S01 2013-01-10 00:10:00 346.0\n", + "2 T001 S01 2013-01-10 00:20:00 407.0\n", + "3 T001 S01 2013-01-10 00:30:00 257.0\n", + "4 T001 S01 2013-01-10 00:40:00 267.0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.EntitySet.normalize_entity\n", + "\n", + "* Input: entityset\n", + "* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)\n", + "* Effect: establish relation between readings and turbines" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entityset: entityset\n", + " Entities:\n", + " readings [Rows: 1329146, Columns: 5]\n", + " turbines [Rows: 1, Columns: 1]\n", + " Relationships:\n", + " readings.turbine_id -> turbines.turbine_id" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['entityset']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.EntitySet.normalize_entity\n", + "\n", + "* Input: entityset\n", + "* Output: entityset with relationship (readings.signal_id with signals.signal_id)\n", + "* Effect: establish relationship between readings and signals" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entityset: entityset\n", + " Entities:\n", + " readings [Rows: 1329146, Columns: 5]\n", + " turbines [Rows: 1, Columns: 1]\n", + " signals [Rows: 26, Columns: 1]\n", + " Relationships:\n", + " readings.turbine_id -> turbines.turbine_id\n", + " readings.signal_id -> signals.signal_id" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['entityset']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.dfs\n", + "\n", + "* Input: entityset (unstacked, no turbine_id, no timestamp)\n", + "* Output: X (has additional features)\n", + "* Effect: build features for relational dataset using DFS" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUM(readings.value)STD(readings.value)MAX(readings.value)SKEW(readings.value)MIN(readings.value)MEAN(readings.value)COUNT(readings)NUM_UNIQUE(readings.signal_id)MODE(readings.signal_id)NUM_UNIQUE(readings.DAY(timestamp))...MEAN(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))MODE(readings.signals.MODE(readings.MONTH(timestamp)))MODE(readings.signals.MODE(readings.DAY(timestamp)))MODE(readings.signals.MODE(readings.YEAR(timestamp)))MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))
turbine_id
T0013.457475e+091.456852e+063448719.01.0192120.0917102.224456377026S012...12111111120134
T0013.465358e+091.459852e+063453777.01.0187600.0919193.186021377026S012...12111111220135
T0013.479406e+091.465252e+063463880.01.0181922.7922919.430027377026S012...12111111320136
T0013.499427e+091.473308e+063474703.01.017664-1.0928229.883899377026S012...12111111420130
T0012.912289e+091.477955e+063485019.01.0318790.0924242.895144377026S012...12111111520131
\n", + "

5 rows × 99 columns

\n", + "
" + ], + "text/plain": [ + " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", + "turbine_id \n", + "T001 3.457475e+09 1.456852e+06 3448719.0 \n", + "T001 3.465358e+09 1.459852e+06 3453777.0 \n", + "T001 3.479406e+09 1.465252e+06 3463880.0 \n", + "T001 3.499427e+09 1.473308e+06 3474703.0 \n", + "T001 2.912289e+09 1.477955e+06 3485019.0 \n", + "\n", + " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", + "turbine_id \n", + "T001 1.019212 0.0 917102.224456 \n", + "T001 1.018760 0.0 919193.186021 \n", + "T001 1.018192 2.7 922919.430027 \n", + "T001 1.017664 -1.0 928229.883899 \n", + "T001 1.031879 0.0 924242.895144 \n", + "\n", + " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", + "turbine_id \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "\n", + " MODE(readings.signal_id) NUM_UNIQUE(readings.DAY(timestamp)) ... \\\n", + "turbine_id ... \n", + "T001 S01 2 ... \n", + "T001 S01 2 ... \n", + "T001 S01 2 ... \n", + "T001 S01 2 ... \n", + "T001 S01 2 ... \n", + "\n", + " MEAN(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 11 \n", + "T001 12 \n", + "T001 13 \n", + "T001 14 \n", + "T001 15 \n", + "\n", + " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "\n", + " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \n", + "turbine_id \n", + "T001 4 \n", + "T001 5 \n", + "T001 6 \n", + "T001 0 \n", + "T001 1 \n", + "\n", + "[5 rows x 99 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "99" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# features generated (the turbine_id is set as index).\n", + "len(context['X'].columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10 00:00:00323.0
1T001S012013-01-10 00:10:00346.0
2T001S012013-01-10 00:20:00407.0
3T001S012013-01-10 00:30:00257.0
4T001S012013-01-10 00:40:00267.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 00:00:00 323.0\n", + "1 T001 S01 2013-01-10 00:10:00 346.0\n", + "2 T001 S01 2013-01-10 00:20:00 407.0\n", + "3 T001 S01 2013-01-10 00:30:00 257.0\n", + "4 T001 S01 2013-01-10 00:40:00 267.0" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlprimitives.custom.feature_extraction.CategoricalEncoder\n", + "\n", + "* Input: X\n", + "* Output: X (label encoded)\n", + "* Effect: encodes categorical features using OneHotLabelEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUM(readings.value)STD(readings.value)MAX(readings.value)SKEW(readings.value)MIN(readings.value)MEAN(readings.value)COUNT(readings)NUM_UNIQUE(readings.signal_id)NUM_UNIQUE(readings.DAY(timestamp))NUM_UNIQUE(readings.MONTH(timestamp))...NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))MODE(readings.signals.MODE(readings.MONTH(timestamp)))MODE(readings.signals.MODE(readings.DAY(timestamp)))MODE(readings.signals.MODE(readings.YEAR(timestamp)))MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))MODE(readings.signal_id)=S01MODE(readings.signals.MODE(readings.turbine_id))=T001
turbine_id
T0013.457475e+091.456852e+063448719.01.0192120.0917102.22445637702621...11111112013411
T0013.465358e+091.459852e+063453777.01.0187600.0919193.18602137702621...11111122013511
T0013.479406e+091.465252e+063463880.01.0181922.7922919.43002737702621...11111132013611
T0013.499427e+091.473308e+063474703.01.017664-1.0928229.88389937702621...11111142013011
T0012.912289e+091.477955e+063485019.01.0318790.0924242.89514437702621...11111152013111
\n", + "

5 rows × 99 columns

\n", + "
" + ], + "text/plain": [ + " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", + "turbine_id \n", + "T001 3.457475e+09 1.456852e+06 3448719.0 \n", + "T001 3.465358e+09 1.459852e+06 3453777.0 \n", + "T001 3.479406e+09 1.465252e+06 3463880.0 \n", + "T001 3.499427e+09 1.473308e+06 3474703.0 \n", + "T001 2.912289e+09 1.477955e+06 3485019.0 \n", + "\n", + " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", + "turbine_id \n", + "T001 1.019212 0.0 917102.224456 \n", + "T001 1.018760 0.0 919193.186021 \n", + "T001 1.018192 2.7 922919.430027 \n", + "T001 1.017664 -1.0 928229.883899 \n", + "T001 1.031879 0.0 924242.895144 \n", + "\n", + " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", + "turbine_id \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "T001 3770 26 \n", + "\n", + " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.MONTH(timestamp)) ... \\\n", + "turbine_id ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 11 \n", + "T001 12 \n", + "T001 13 \n", + "T001 14 \n", + "T001 15 \n", + "\n", + " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "\n", + " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 4 \n", + "T001 5 \n", + "T001 6 \n", + "T001 0 \n", + "T001 1 \n", + "\n", + " MODE(readings.signal_id)=S01 \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.signals.MODE(readings.turbine_id))=T001 \n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + "[5 rows x 99 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10 00:00:00323.0
1T001S012013-01-10 00:10:00346.0
2T001S012013-01-10 00:20:00407.0
3T001S012013-01-10 00:30:00257.0
4T001S012013-01-10 00:40:00267.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 00:00:00 323.0\n", + "1 T001 S01 2013-01-10 00:10:00 346.0\n", + "2 T001 S01 2013-01-10 00:20:00 407.0\n", + "3 T001 S01 2013-01-10 00:30:00 257.0\n", + "4 T001 S01 2013-01-10 00:40:00 267.0" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## xgboost.XGBClassifier\n", + "\n", + "* Input: X (label encoded and featurized)\n", + "* Output: None\n", + "* Effect: trained model" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb new file mode 100644 index 0000000..5c7b442 --- /dev/null +++ b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb @@ -0,0 +1,2501 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# unstack_double_lstm_timeseries_classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from greenguard.demo import load_demo\n", + "\n", + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'unstack_double_lstm_timeseries_classifier'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from greenguard.pipeline import GreenGuardPipeline\n", + "\n", + "pipeline = GreenGuardPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['pandas.DataFrame.resample',\n", + " 'pandas.DataFrame.unstack',\n", + " 'pandas.DataFrame.pop',\n", + " 'pandas.DataFrame.pop',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'sklearn.preprocessing.MinMaxScaler',\n", + " 'pandas.DataFrame',\n", + " 'pandas.DataFrame.set',\n", + " 'pandas.DataFrame.set',\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'keras.Sequential.DoubleLSTMTimeSeriesClassifier']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation (part of GreenGuard Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline.fit(target_times, readings, output_=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00313.333333
2013-01-10 01:00:00197.500000
2013-01-10 02:00:00248.166667
2013-01-10 03:00:00253.166667
2013-01-10 04:00:00305.000000
\n", + "
" + ], + "text/plain": [ + " value\n", + "turbine_id signal_id timestamp \n", + "T001 S01 2013-01-10 00:00:00 313.333333\n", + " 2013-01-10 01:00:00 197.500000\n", + " 2013-01-10 02:00:00 248.166667\n", + " 2013-01-10 03:00:00 253.166667\n", + " 2013-01-10 04:00:00 305.000000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1T0012013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2T0012013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3T0012013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4T0012013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", + "0 T001 2013-01-10 00:00:00 313.333333 323.833333 336.000000 \n", + "1 T001 2013-01-10 01:00:00 197.500000 221.333333 216.000000 \n", + "2 T001 2013-01-10 02:00:00 248.166667 271.666667 277.500000 \n", + "3 T001 2013-01-10 03:00:00 253.166667 256.166667 242.666667 \n", + "4 T001 2013-01-10 04:00:00 305.000000 312.333333 346.166667 \n", + "\n", + " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", + "0 364.666667 286.500000 314.000000 243.166667 3.197980e+06 ... \n", + "1 260.666667 206.833333 235.833333 186.666667 3.198221e+06 ... \n", + "2 298.000000 233.666667 271.166667 216.333333 3.198448e+06 ... \n", + "3 265.333333 211.666667 226.666667 181.000000 3.198691e+06 ... \n", + "4 329.833333 280.666667 308.833333 271.833333 3.198978e+06 ... \n", + "\n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", + "\n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (unstacked)\n", + "* Output: readings (without turbine_id), turbine_id\n", + "* Effect: turbine_id has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 T001\n", + "1 T001\n", + "2 T001\n", + "3 T001\n", + "4 T001\n", + "Name: turbine_id, dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['turbine_id'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
02013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.166667...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
12013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.666667...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
22013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.500000...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
32013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.333333...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
42013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.833333...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 2013-01-10 00:00:00 313.333333 323.833333 336.000000 364.666667 \n", + "1 2013-01-10 01:00:00 197.500000 221.333333 216.000000 260.666667 \n", + "2 2013-01-10 02:00:00 248.166667 271.666667 277.500000 298.000000 \n", + "3 2013-01-10 03:00:00 253.166667 256.166667 242.666667 265.333333 \n", + "4 2013-01-10 04:00:00 305.000000 312.333333 346.166667 329.833333 \n", + "\n", + " value_S05 value_S06 value_S07 value_S08 value_S09 ... \\\n", + "0 286.500000 314.000000 243.166667 3.197980e+06 695143.166667 ... \n", + "1 206.833333 235.833333 186.666667 3.198221e+06 695403.666667 ... \n", + "2 233.666667 271.166667 216.333333 3.198448e+06 695656.500000 ... \n", + "3 211.666667 226.666667 181.000000 3.198691e+06 695911.333333 ... \n", + "4 280.666667 308.833333 271.833333 3.198978e+06 696195.833333 ... \n", + "\n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", + "\n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (without turbine_id)\n", + "* Output: readings (without timestamp), timestamp\n", + "* Effect: timestamp has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2013-01-10 00:00:00\n", + "1 2013-01-10 01:00:00\n", + "2 2013-01-10 02:00:00\n", + "3 2013-01-10 03:00:00\n", + "4 2013-01-10 04:00:00\n", + "Name: timestamp, dtype: datetime64[ns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['timestamp'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09value_S10...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.1666673.348384e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.6666673.348651e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.5000003.348910e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.3333333.349157e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.8333333.349452e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 313.333333 323.833333 336.000000 364.666667 286.500000 314.000000 \n", + "1 197.500000 221.333333 216.000000 260.666667 206.833333 235.833333 \n", + "2 248.166667 271.666667 277.500000 298.000000 233.666667 271.166667 \n", + "3 253.166667 256.166667 242.666667 265.333333 211.666667 226.666667 \n", + "4 305.000000 312.333333 346.166667 329.833333 280.666667 308.833333 \n", + "\n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 \\\n", + "0 243.166667 3.197980e+06 695143.166667 3.348384e+06 ... 10.383333 \n", + "1 186.666667 3.198221e+06 695403.666667 3.348651e+06 ... 8.666667 \n", + "2 216.333333 3.198448e+06 695656.500000 3.348910e+06 ... 8.833333 \n", + "3 181.000000 3.198691e+06 695911.333333 3.349157e+06 ... 8.433333 \n", + "4 271.833333 3.198978e+06 696195.833333 3.349452e+06 ... 9.083333 \n", + "\n", + " value_S18 value_S19 value_S20 value_S21 value_S22 value_S23 \\\n", + "0 3.131958e+06 52.666667 54.333333 56.166667 61.000000 47.666667 \n", + "1 3.133668e+06 33.166667 37.000000 36.166667 43.666667 34.500000 \n", + "2 3.135413e+06 41.500000 45.666667 46.500000 49.666667 39.333333 \n", + "3 3.137001e+06 42.333333 42.833333 40.500000 44.166667 35.333333 \n", + "4 3.138843e+06 50.500000 51.166667 55.500000 53.666667 46.166667 \n", + "\n", + " value_S24 value_S25 value_S26 \n", + "0 52.666667 40.833333 357.333333 \n", + "1 39.333333 31.166667 249.666667 \n", + "2 45.500000 36.166667 297.666667 \n", + "3 37.833333 30.333333 268.000000 \n", + "4 49.666667 41.166667 341.833333 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.impute.SimpleImputer\n", + "\n", + "* Input: readings (unstacked, no turbine_id, no timestamp)\n", + "* Output: readings (imputed, numpy array)\n", + "* Effect: readings have been imputed and converted to numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[3.13333333e+02, 3.23833333e+02, 3.36000000e+02, 3.64666667e+02,\n", + " 2.86500000e+02, 3.14000000e+02, 2.43166667e+02, 3.19798000e+06,\n", + " 6.95143167e+05, 3.34838383e+06, 3.43692150e+06, 3.32248667e+06,\n", + " 3.35809000e+06, 3.22390150e+06, 7.95000000e+00, 5.85000000e+00,\n", + " 1.03833333e+01, 3.13195833e+06, 5.26666667e+01, 5.43333333e+01,\n", + " 5.61666667e+01, 6.10000000e+01, 4.76666667e+01, 5.26666667e+01,\n", + " 4.08333333e+01, 3.57333333e+02],\n", + " [1.97500000e+02, 2.21333333e+02, 2.16000000e+02, 2.60666667e+02,\n", + " 2.06833333e+02, 2.35833333e+02, 1.86666667e+02, 3.19822067e+06,\n", + " 6.95403667e+05, 3.34865117e+06, 3.43722283e+06, 3.32272200e+06,\n", + " 3.35834000e+06, 3.22409567e+06, 6.83333333e+00, 5.15000000e+00,\n", + " 8.66666667e+00, 3.13366817e+06, 
3.31666667e+01, 3.70000000e+01,\n", + " 3.61666667e+01, 4.36666667e+01, 3.45000000e+01, 3.93333333e+01,\n", + " 3.11666667e+01, 2.49666667e+02],\n", + " [2.48166667e+02, 2.71666667e+02, 2.77500000e+02, 2.98000000e+02,\n", + " 2.33666667e+02, 2.71166667e+02, 2.16333333e+02, 3.19844767e+06,\n", + " 6.95656500e+05, 3.34890967e+06, 3.43751900e+06, 3.32295950e+06,\n", + " 3.35862067e+06, 3.22432333e+06, 7.11666667e+00, 5.56666667e+00,\n", + " 8.83333333e+00, 3.13541283e+06, 4.15000000e+01, 4.56666667e+01,\n", + " 4.65000000e+01, 4.96666667e+01, 3.93333333e+01, 4.55000000e+01,\n", + " 3.61666667e+01, 2.97666667e+02],\n", + " [2.53166667e+02, 2.56166667e+02, 2.42666667e+02, 2.65333333e+02,\n", + " 2.11666667e+02, 2.26666667e+02, 1.81000000e+02, 3.19869117e+06,\n", + " 6.95911333e+05, 3.34915717e+06, 3.43778050e+06, 3.32316850e+06,\n", + " 3.35884883e+06, 3.22450217e+06, 6.71666667e+00, 5.16666667e+00,\n", + " 8.43333333e+00, 3.13700133e+06, 4.23333333e+01, 4.28333333e+01,\n", + " 4.05000000e+01, 4.41666667e+01, 3.53333333e+01, 3.78333333e+01,\n", + " 3.03333333e+01, 2.68000000e+02],\n", + " [3.05000000e+02, 3.12333333e+02, 3.46166667e+02, 3.29833333e+02,\n", + " 2.80666667e+02, 3.08833333e+02, 2.71833333e+02, 3.19897850e+06,\n", + " 6.96195833e+05, 3.34945200e+06, 3.43807767e+06, 3.32340933e+06,\n", + " 3.35910983e+06, 3.22471400e+06, 7.20000000e+00, 5.28333333e+00,\n", + " 9.08333333e+00, 3.13884333e+06, 5.05000000e+01, 5.11666667e+01,\n", + " 5.55000000e+01, 5.36666667e+01, 4.61666667e+01, 4.96666667e+01,\n", + " 4.11666667e+01, 3.41833333e+02]])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.preprocessing.MinMaxScaler\n", + "\n", + "* Input: (imputed, array)\n", + "* Output: readings (scaled, array)\n", + "* Effect: readings have been scaled to [-1, 1] range" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.26126126, -0.23706897, -0.20870076, -0.14106583, -0.32328767,\n", + " -0.25969448, -0.42198789, -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. 
, -0.11007463,\n", + " -0.16824645, -0.10424155, -0.37397741, -0.25233645, -0.22716628,\n", + " -0.20140515, -0.13481829, -0.32239156, -0.25380117, -0.4182243 ,\n", + " -0.25697453],\n", + " [-0.53349001, -0.47805643, -0.49088771, -0.38557994, -0.51037182,\n", + " -0.44339992, -0.55438391, -0.99983031, -0.99982547, -0.99982499,\n", + " -0.99980741, -0.9998428 , -0.99983779, -0.99986887, -0.23507463,\n", + " -0.26777251, -0.25233645, -0.37363511, -0.52570093, -0.470726 ,\n", + " -0.4824356 , -0.37866354, -0.50762016, -0.44093567, -0.55373832,\n", + " -0.48085254],\n", + " [-0.41441441, -0.35971787, -0.3462669 , -0.29780564, -0.44735812,\n", + " -0.36036036, -0.48486624, -0.99967026, -0.99965608, -0.99965576,\n", + " -0.99961813, -0.99968416, -0.99965569, -0.99971512, -0.20335821,\n", + " -0.20853081, -0.2379583 , -0.37328583, -0.4088785 , -0.34894614,\n", + " -0.33723653, -0.29425557, -0.43962485, -0.35438596, -0.48364486,\n", + " -0.38104315],\n", + " [-0.40266353, -0.39615987, -0.4281795 , -0.37460815, -0.49902153,\n", + " -0.4649432 , -0.56766257, -0.99949857, -0.99948535, -0.99949373,\n", + " -0.999451 , -0.99954455, -0.99950765, -0.99959435, -0.24813433,\n", + " -0.26540284, -0.27246585, -0.37296782, -0.39719626, -0.38875878,\n", + " -0.42154567, -0.37162954, -0.49589683, -0.4619883 , -0.56542056,\n", + " -0.4427309 ],\n", + " [-0.28084606, -0.26410658, -0.18479326, -0.22296238, -0.3369863 ,\n", + " -0.27183705, -0.35481351, -0.99929598, -0.99929474, -0.99930071,\n", + " -0.99926107, -0.99938368, -0.99933831, -0.9994513 , -0.19402985,\n", + " -0.24881517, -0.21639109, -0.37259906, -0.28271028, -0.27166276,\n", + " -0.21077283, -0.23798359, -0.34349355, -0.29590643, -0.4135514 ,\n", + " -0.28920464]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame\n", + "\n", + "* Input: readings (scaled, array)\n", + "* Output: readings (dataframe)\n", + "* Effect: readings have been converted into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
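Steps 4 through 6 amount to a plain scikit-learn transform chain followed by a conversion back to pandas: impute missing values, scale everything into the [-1, 1] range, and wrap the resulting numpy array in a new DataFrame. A hedged sketch of the equivalent standalone code (the default `SimpleImputer` strategy and the `feature_range` are assumptions read off the outputs; the pipeline hyperparameters may differ):

```python
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

readings = pd.DataFrame({'value_S01': [313.3, np.nan, 248.2],
                         'value_S02': [323.8, 221.3, np.nan]})

imputed = SimpleImputer().fit_transform(readings)  # NaNs -> column means; output is a numpy array
scaled = MinMaxScaler(feature_range=(-1, 1)).fit_transform(imputed)  # map each column into [-1, 1]
readings = pd.DataFrame(scaled)  # back to a DataFrame; columns become integers 0..N-1
```

The integer column names produced by that last call are exactly what shows up in the next output.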
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...16171819202122232425
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.104242-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.252336-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.237958-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.272466-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.216391-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", + "\n", + " 7 8 9 ... 16 17 18 19 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.104242 -0.373977 -0.252336 -0.227166 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.252336 -0.373635 -0.525701 -0.470726 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.237958 -0.373286 -0.408879 -0.348946 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.272466 -0.372968 -0.397196 -0.388759 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.216391 -0.372599 -0.282710 -0.271663 \n", + "\n", + " 20 21 22 23 24 25 \n", + "0 -0.201405 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 \n", + "1 -0.482436 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 \n", + "2 -0.337237 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 \n", + "3 -0.421546 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 \n", + "4 -0.210773 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe)\n", + "* Output: readings (dataframe with turbine_id)\n", + "* Effect: turbine_id has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "step = 7\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...171819202122232425turbine_id
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T001
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T001
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T001
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T001
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T001
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", + "\n", + " 7 8 9 ... 17 18 19 20 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373977 -0.252336 -0.227166 -0.201405 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.373635 -0.525701 -0.470726 -0.482436 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.373286 -0.408879 -0.348946 -0.337237 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.372968 -0.397196 -0.388759 -0.421546 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.372599 -0.282710 -0.271663 -0.210773 \n", + "\n", + " 21 22 23 24 25 turbine_id \n", + "0 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 T001 \n", + "1 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 T001 \n", + "2 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 T001 \n", + "3 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 T001 \n", + "4 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 T001 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe with turbine_id)\n", + "* Output: readings (dataframe with turbine_id and timestamp)\n", + "* Effect: timestamp has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "step = 8\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...1819202122232425turbine_idtimestamp
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T0012013-01-10 00:00:00
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T0012013-01-10 01:00:00
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T0012013-01-10 02:00:00
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T0012013-01-10 03:00:00
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T0012013-01-10 04:00:00
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", + "\n", + " 7 8 9 ... 18 19 20 21 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.252336 -0.227166 -0.201405 -0.134818 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.525701 -0.470726 -0.482436 -0.378664 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.408879 -0.348946 -0.337237 -0.294256 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.397196 -0.388759 -0.421546 -0.371630 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.282710 -0.271663 -0.210773 -0.237984 \n", + "\n", + " 22 23 24 25 turbine_id timestamp \n", + "0 -0.322392 -0.253801 -0.418224 -0.256975 T001 2013-01-10 00:00:00 \n", + "1 -0.507620 -0.440936 -0.553738 -0.480853 T001 2013-01-10 01:00:00 \n", + "2 -0.439625 -0.354386 -0.483645 -0.381043 T001 2013-01-10 02:00:00 \n", + "3 -0.495897 -0.461988 -0.565421 -0.442731 T001 2013-01-10 03:00:00 \n", + "4 -0.343494 -0.295906 -0.413551 -0.289205 T001 2013-01-10 04:00:00 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", + "\n", + "* Input: X, readings (dataframe with turbine_id and timestamp)\n", + "* Output: X\n", + "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", + " (window_size x num_signals) for each one of the target times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline._pipeline.get_hyperparameters()[\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "step = 9\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8521, 28)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353,)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['y'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353, 24, 26)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.58793576, -0.60305643, -0.63981971, -0.61481191, -0.69823875,\n", + " -0.65021543, -0.68912322, -0.99436914, -0.99439755, -0.99454249,\n", + " -0.99446788, -0.99476185, -0.99490997, -0.99529511, -0.34701493,\n", + " -0.33886256, -0.33860532, -0.36301186, -0.57943925, -0.59250585,\n", + " -0.6323185 , -0.60609613, -0.69284877, -0.64444444, -0.68691589,\n", + " -0.63853752],\n", + " [-0.56600078, -0.5846395 , -0.63002156, -0.61559561, -0.70880626,\n", + " -0.66392479, -0.69732474, -0.9942427 , -0.99427986, -0.9944408 ,\n", + " -0.99436498, -0.99468147, -0.99482011, -0.99521249, -0.33955224,\n", + " -0.31516588, -0.38892883, -0.36280656, -0.55841121, -0.57611241,\n", + " -0.62295082, -0.61078546, -0.70222743, -0.65847953, -0.69392523,\n", + " -0.63645815],\n", + " [-0.64081473, -0.64184953, -0.67038997, -0.63597179, -0.71350294,\n", + " -0.65844105, -0.66764304, -0.99412236, -0.99416864, -0.99434228,\n", + " -0.99426059, -0.99459663, -0.99472365, -0.99511795, -0.34328358,\n", + " -0.30094787, -0.36304817, -0.36259859, -0.63317757, -0.6323185 ,\n", + " -0.66042155, -0.62954279, -0.70926143, -0.65380117, -0.66588785,\n", + " -0.66002426],\n", + " [-0.73678026, -0.72139498, -0.72800314, -0.69239812, -0.71350294,\n", + " -0.68233451, -0.69732474, -0.99403811, -0.99408512, -0.9942623 ,\n", + " -0.99417111, -0.99451525, -0.99463206, -0.9950315 , -0.40671642,\n", + " -0.36018957, -0.44644141, -0.36242395, -0.72897196, -0.71194379,\n", + " -0.71896956, -0.68347011, -0.70926143, -0.6748538 , -0.69392523,\n", + " -0.71027552],\n", + " [-0.75401488, -0.74333856, -0.75112679, 
-0.71590909, -0.76555773,\n", + " -0.73599687, -0.75278266, -0.99395808, -0.99400684, -0.99419094,\n", + " -0.99409367, -0.99444556, -0.99455517, -0.99495418, -0.43656716,\n", + " -0.3957346 , -0.465133 , -0.36226933, -0.7453271 , -0.735363 ,\n", + " -0.74004684, -0.70926143, -0.76084408, -0.73099415, -0.75 ,\n", + " -0.7463178 ],\n", + " [-0.79866823, -0.76684953, -0.7558299 , -0.72688088, -0.76125245,\n", + " -0.75714845, -0.78363601, -0.99389098, -0.99393583, -0.99411958,\n", + " -0.99401538, -0.99437709, -0.99448423, -0.99489036, -0.43843284,\n", + " -0.37914692, -0.49388929, -0.36212623, -0.78971963, -0.75644028,\n", + " -0.7470726 , -0.72098476, -0.75615475, -0.7497076 , -0.78037383,\n", + " -0.76572518],\n", + " [-0.84919702, -0.83855799, -0.82245738, -0.78134796, -0.75225049,\n", + " -0.70661966, -0.65787932, -0.99384186, -0.99388279, -0.9940635 ,\n", + " -0.99395157, -0.9943113 , -0.99441264, -0.99481202, -0.51679104,\n", + " -0.50473934, -0.53414809, -0.36199904, -0.8411215 , -0.83138173,\n", + " -0.81264637, -0.77256741, -0.74677608, -0.70292398, -0.65654206,\n", + " -0.77438919],\n", + " [-0.69134352, -0.705721 , -0.73584166, -0.70297806, -0.75225049,\n", + " -0.72659616, -0.71724273, -0.99377229, -0.99381646, -0.99400032,\n", + " -0.99387925, -0.99423682, -0.99433003, -0.99471624, -0.43843284,\n", + " -0.40521327, -0.48094896, -0.36184615, -0.68457944, -0.69555035,\n", + " -0.72599532, -0.6975381 , -0.74677608, -0.71929825, -0.71261682,\n", + " -0.71893953],\n", + " [-0.84488837, -0.82915361, -0.83578287, -0.81896552, -0.86105675,\n", + " -0.8613396 , -0.86330795, -0.99369779, -0.99374656, -0.99393715,\n", + " -0.99381182, -0.99418494, -0.99427639, -0.99466379, -0.49253731,\n", + " -0.48104265, -0.51545651, -0.36172116, -0.8364486 , -0.81967213,\n", + " -0.82435597, -0.81008206, -0.85463072, -0.85497076, -0.86214953,\n", + " -0.84889967],\n", + " [-0.77908343, -0.78761755, -0.78757594, -0.78918495, -0.82348337,\n", + " -0.82491187, -0.85276313, -0.99365725, -0.99370625, -0.99389819,\n", + " -0.99377113, -0.99415254, -0.99424222, -0.99463329, -0.52798507,\n", + " -0.6042654 , -0.51545651, -0.36164779, -0.77336449, -0.77985948,\n", + " -0.78220141, -0.78429074, -0.86635404, -0.82222222, -0.85046729,\n", + " -0.81562987],\n", + " [-0.70544458, -0.64733542, -0.64844209, -0.61833856, -0.6481409 ,\n", + " -0.66392479, -0.71646163, -0.99356747, -0.99360832, -0.99380327,\n", + " -0.99367558, -0.99407272, -0.99415647, -0.99456035, -0.36567164,\n", + " -0.4549763 , -0.34291876, -0.36146698, -0.70560748, -0.63934426,\n", + " -0.63934426, -0.62016413, -0.64830012, -0.65847953, -0.72663551,\n", + " -0.66868827],\n", + " [-0.70387779, -0.67202194, -0.69508132, -0.72413793, -0.73228963,\n", + " -0.72816295, -0.72310096, -0.99348204, -0.99351955, -0.99372023,\n", + " -0.99359367, -0.99399256, -0.99407882, -0.99449203, -0.38432836,\n", + " -0.58530806, -0.33141625, -0.36130226, -0.69392523, -0.66042155,\n", + " -0.68384075, -0.71629543, -0.72801876, -0.72163743, -0.72196262,\n", + " -0.7113152 ],\n", + " [-0.8515472 , -0.81073668, -0.776602 , -0.76724138, -0.78277886,\n", + " -0.75832354, -0.74262839, -0.99341682, -0.99344607, -0.99364669,\n", + " -0.99352762, -0.99392743, -0.99401037, -0.99441763, -0.44029851,\n", + " -0.5521327 , -0.38461538, -0.36116102, -0.84345794, -0.80327869,\n", + " -0.76814988, -0.76084408, -0.77725674, -0.75204678, -0.73831776,\n", + " -0.7865188 ],\n", + " [-0.80258519, -0.83659875, -0.83499902, -0.79741379, -0.80821918,\n", + " -0.81629456, -0.79379028, 
-0.99336347, -0.99339091, -0.99358745,\n", + " -0.99346147, -0.9938642 , -0.99394733, -0.99434605, -0.44962687,\n", + " -0.6563981 , -0.34579439, -0.36103606, -0.79439252, -0.82669789,\n", + " -0.82669789, -0.78898007, -0.80304807, -0.81052632, -0.79205607,\n", + " -0.81632299],\n", + " [-0.83313749, -0.87539185, -0.90241035, -0.88440439, -0.86771037,\n", + " -0.87935762, -0.87580551, -0.99331764, -0.99335898, -0.99355602,\n", + " -0.99342259, -0.99382267, -0.99390959, -0.99430418, -0.54291045,\n", + " -0.72274882, -0.42918763, -0.36096002, -0.82943925, -0.87119438,\n", + " -0.89461358, -0.87573271, -0.86166471, -0.87134503, -0.87383178,\n", + " -0.88078323],\n", + " [-0.56678418, -0.60031348, -0.64295512, -0.78409091, -0.76164384,\n", + " -0.78535057, -0.82464362, -0.99321481, -0.99327557, -0.99349034,\n", + " -0.99337881, -0.9937915 , -0.99387347, -0.99427367, -0.32835821,\n", + " -0.47630332, -0.25808771, -0.36084678, -0.56074766, -0.59250585,\n", + " -0.6323185 , -0.77960141, -0.84759672, -0.78947368, -0.8364486 ,\n", + " -0.72621729],\n", + " [-0.77007442, -0.81230408, -0.83186361, -0.85540752, -0.85870841,\n", + " -0.86486486, -0.847686 , -0.99311634, -0.99319338, -0.99341516,\n", + " -0.99332651, -0.99374196, -0.99381551, -0.99422246, -0.46641791,\n", + " -0.65165877, -0.39324227, -0.36071245, -0.76168224, -0.80093677,\n", + " -0.82201405, -0.84759672, -0.85463072, -0.85730994, -0.84579439,\n", + " -0.83780974],\n", + " [-0.87622405, -0.92163009, -0.91377621, -0.89224138, -0.84540117,\n", + " -0.83431257, -0.82112869, -0.99306816, -0.99315821, -0.99338734,\n", + " -0.99329935, -0.99370611, -0.99377885, -0.9941789 , -0.55783582,\n", + " -0.65402844, -0.50970525, -0.36064058, -0.86682243, -0.91334895,\n", + " -0.90632319, -0.88745604, -0.84056272, -0.82923977, -0.81775701,\n", + " -0.87731762],\n", + " [-0.82843713, -0.83111285, -0.84166177, -0.8322884 , -0.84579256,\n", + " -0.8515472 , -0.86057411, -0.99302656, -0.99312426, -0.99335155,\n", + " -0.99325919, -0.99365991, -0.99373278, -0.99413129, -0.50559701,\n", + " -0.53791469, -0.52120776, -0.36055736, -0.82242991, -0.82201405,\n", + " -0.83138173, -0.82415006, -0.84056272, -0.84327485, -0.85747664,\n", + " -0.84508751],\n", + " [-0.74539757, -0.73824451, -0.76484421, -0.72100313, -0.73228963,\n", + " -0.70975323, -0.739504 , -0.99296569, -0.99306553, -0.99329699,\n", + " -0.9932005 , -0.99360224, -0.99367493, -0.99407862, -0.45149254,\n", + " -0.46208531, -0.48382459, -0.36044105, -0.73598131, -0.73067916,\n", + " -0.75644028, -0.71629543, -0.72801876, -0.70526316, -0.73831776,\n", + " -0.73696067],\n", + " [-0.40814728, -0.4596395 , -0.51087596, -0.46316614, -0.54598826,\n", + " -0.50607129, -0.57039641, -0.99283748, -0.99294147, -0.9931881 ,\n", + " -0.99308418, -0.99349681, -0.99356041, -0.99398047, -0.30597015,\n", + " -0.29383886, -0.34867002, -0.36020709, -0.46728972, -0.470726 ,\n", + " -0.5175644 , -0.48651817, -0.55685815, -0.51812865, -0.59579439,\n", + " -0.5179345 ],\n", + " [-0.47591069, -0.45219436, -0.48579267, -0.48981191, -0.57847358,\n", + " -0.54876616, -0.61882445, -0.99268659, -0.99280044, -0.99306033,\n", + " -0.99295359, -0.99338192, -0.99344287, -0.9938794 , -0.30223881,\n", + " -0.33649289, -0.32278936, -0.35994787, -0.49065421, -0.46370023,\n", + " -0.4941452 , -0.49589683, -0.58264947, -0.55321637, -0.62850467,\n", + " -0.53110379],\n", + " [-0.26792009, -0.27115987, -0.30080345, -0.24412226, -0.34246575,\n", + " -0.30434783, -0.40285101, -0.99250927, -0.99261854, -0.99288914,\n", + " -0.99278188, 
-0.99322495, -0.99327569, -0.9937324 , -0.22947761,\n", + " -0.28909953, -0.26096334, -0.35960139, -0.33878505, -0.29976581,\n", + " -0.32786885, -0.2919109 , -0.38100821, -0.32865497, -0.42523364,\n", + " -0.3394559 ],\n", + " [-0.31374853, -0.26449843, -0.2941407 , -0.23315047, -0.36516634,\n", + " -0.35957697, -0.44112478, -0.9923035 , -0.99241264, -0.99269787,\n", + " -0.99258055, -0.99304482, -0.99309553, -0.99356987, -0.2108209 ,\n", + " -0.21563981, -0.23652049, -0.35921021, -0.30607477, -0.26229508,\n", + " -0.29039813, -0.23563892, -0.35990621, -0.35204678, -0.43925234,\n", + " -0.32004852]])" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## keras.Sequential.DoubleLSTMTimeSeriesClassifier\n", + "\n", + "* Input: X, y\n", + "* Output: \n", + "* Effect: DoubleLSTM has been fitted." + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n", + "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", + "\n" + ] + } + ], + "source": [ + "step = 10\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb new file mode 100644 index 0000000..faec108 --- /dev/null +++ b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb @@ -0,0 +1,2375 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# unstack_lstm_timeseries_classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from greenguard.demo import load_demo\n", + "\n", + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'unstack_lstm_timeseries_classifier'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from greenguard.pipeline import GreenGuardPipeline\n", + "\n", + "pipeline = GreenGuardPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"['pandas.DataFrame.resample',\n", + " 'pandas.DataFrame.unstack',\n", + " 'pandas.DataFrame.pop',\n", + " 'pandas.DataFrame.pop',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'sklearn.preprocessing.MinMaxScaler',\n", + " 'pandas.DataFrame',\n", + " 'pandas.DataFrame.set',\n", + " 'pandas.DataFrame.set',\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'keras.Sequential.LSTMTimeSeriesClassifier']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation (part of GreenGuard Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline.fit(target_times, readings, output_=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00313.333333
2013-01-10 01:00:00197.500000
2013-01-10 02:00:00248.166667
2013-01-10 03:00:00253.166667
2013-01-10 04:00:00305.000000
\n", + "
" + ], + "text/plain": [ + " value\n", + "turbine_id signal_id timestamp \n", + "T001 S01 2013-01-10 00:00:00 313.333333\n", + " 2013-01-10 01:00:00 197.500000\n", + " 2013-01-10 02:00:00 248.166667\n", + " 2013-01-10 03:00:00 253.166667\n", + " 2013-01-10 04:00:00 305.000000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1T0012013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2T0012013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3T0012013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4T0012013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", + "0 T001 2013-01-10 00:00:00 313.333333 323.833333 336.000000 \n", + "1 T001 2013-01-10 01:00:00 197.500000 221.333333 216.000000 \n", + "2 T001 2013-01-10 02:00:00 248.166667 271.666667 277.500000 \n", + "3 T001 2013-01-10 03:00:00 253.166667 256.166667 242.666667 \n", + "4 T001 2013-01-10 04:00:00 305.000000 312.333333 346.166667 \n", + "\n", + " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", + "0 364.666667 286.500000 314.000000 243.166667 3.197980e+06 ... \n", + "1 260.666667 206.833333 235.833333 186.666667 3.198221e+06 ... \n", + "2 298.000000 233.666667 271.166667 216.333333 3.198448e+06 ... \n", + "3 265.333333 211.666667 226.666667 181.000000 3.198691e+06 ... \n", + "4 329.833333 280.666667 308.833333 271.833333 3.198978e+06 ... \n", + "\n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", + "\n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (unstacked)\n", + "* Output: readings (without turbine_id), turbine_id\n", + "* Effect: turbine_id has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 T001\n", + "1 T001\n", + "2 T001\n", + "3 T001\n", + "4 T001\n", + "Name: turbine_id, dtype: object" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['turbine_id'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
02013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.166667...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
12013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.666667...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
22013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.500000...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
32013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.333333...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
42013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.833333...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 2013-01-10 00:00:00 313.333333 323.833333 336.000000 364.666667 \n", + "1 2013-01-10 01:00:00 197.500000 221.333333 216.000000 260.666667 \n", + "2 2013-01-10 02:00:00 248.166667 271.666667 277.500000 298.000000 \n", + "3 2013-01-10 03:00:00 253.166667 256.166667 242.666667 265.333333 \n", + "4 2013-01-10 04:00:00 305.000000 312.333333 346.166667 329.833333 \n", + "\n", + " value_S05 value_S06 value_S07 value_S08 value_S09 ... \\\n", + "0 286.500000 314.000000 243.166667 3.197980e+06 695143.166667 ... \n", + "1 206.833333 235.833333 186.666667 3.198221e+06 695403.666667 ... \n", + "2 233.666667 271.166667 216.333333 3.198448e+06 695656.500000 ... \n", + "3 211.666667 226.666667 181.000000 3.198691e+06 695911.333333 ... \n", + "4 280.666667 308.833333 271.833333 3.198978e+06 696195.833333 ... \n", + "\n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", + "\n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (without turbine_id)\n", + "* Output: readings (without timestamp), timestamp\n", + "* Effect: timestamp has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2013-01-10 00:00:00\n", + "1 2013-01-10 01:00:00\n", + "2 2013-01-10 02:00:00\n", + "3 2013-01-10 03:00:00\n", + "4 2013-01-10 04:00:00\n", + "Name: timestamp, dtype: datetime64[ns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['timestamp'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09value_S10...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.1666673.348384e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.6666673.348651e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.5000003.348910e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.3333333.349157e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.8333333.349452e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 313.333333 323.833333 336.000000 364.666667 286.500000 314.000000 \n", + "1 197.500000 221.333333 216.000000 260.666667 206.833333 235.833333 \n", + "2 248.166667 271.666667 277.500000 298.000000 233.666667 271.166667 \n", + "3 253.166667 256.166667 242.666667 265.333333 211.666667 226.666667 \n", + "4 305.000000 312.333333 346.166667 329.833333 280.666667 308.833333 \n", + "\n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 \\\n", + "0 243.166667 3.197980e+06 695143.166667 3.348384e+06 ... 10.383333 \n", + "1 186.666667 3.198221e+06 695403.666667 3.348651e+06 ... 8.666667 \n", + "2 216.333333 3.198448e+06 695656.500000 3.348910e+06 ... 8.833333 \n", + "3 181.000000 3.198691e+06 695911.333333 3.349157e+06 ... 8.433333 \n", + "4 271.833333 3.198978e+06 696195.833333 3.349452e+06 ... 9.083333 \n", + "\n", + " value_S18 value_S19 value_S20 value_S21 value_S22 value_S23 \\\n", + "0 3.131958e+06 52.666667 54.333333 56.166667 61.000000 47.666667 \n", + "1 3.133668e+06 33.166667 37.000000 36.166667 43.666667 34.500000 \n", + "2 3.135413e+06 41.500000 45.666667 46.500000 49.666667 39.333333 \n", + "3 3.137001e+06 42.333333 42.833333 40.500000 44.166667 35.333333 \n", + "4 3.138843e+06 50.500000 51.166667 55.500000 53.666667 46.166667 \n", + "\n", + " value_S24 value_S25 value_S26 \n", + "0 52.666667 40.833333 357.333333 \n", + "1 39.333333 31.166667 249.666667 \n", + "2 45.500000 36.166667 297.666667 \n", + "3 37.833333 30.333333 268.000000 \n", + "4 49.666667 41.166667 341.833333 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.impute.SimpleImputer\n", + "\n", + "* Input: readings (unstacked, no turbine_id, no timestamp)\n", + "* Output: readings (imputed, numpy array)\n", + "* Effect: readings have been imputed and converted to numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[3.13333333e+02, 3.23833333e+02, 3.36000000e+02, 3.64666667e+02,\n", + " 2.86500000e+02, 3.14000000e+02, 2.43166667e+02, 3.19798000e+06,\n", + " 6.95143167e+05, 3.34838383e+06, 3.43692150e+06, 3.32248667e+06,\n", + " 3.35809000e+06, 3.22390150e+06, 7.95000000e+00, 5.85000000e+00,\n", + " 1.03833333e+01, 3.13195833e+06, 5.26666667e+01, 5.43333333e+01,\n", + " 5.61666667e+01, 6.10000000e+01, 4.76666667e+01, 5.26666667e+01,\n", + " 4.08333333e+01, 3.57333333e+02],\n", + " [1.97500000e+02, 2.21333333e+02, 2.16000000e+02, 2.60666667e+02,\n", + " 2.06833333e+02, 2.35833333e+02, 1.86666667e+02, 3.19822067e+06,\n", + " 6.95403667e+05, 3.34865117e+06, 3.43722283e+06, 3.32272200e+06,\n", + " 3.35834000e+06, 3.22409567e+06, 6.83333333e+00, 5.15000000e+00,\n", + " 8.66666667e+00, 3.13366817e+06, 
3.31666667e+01, 3.70000000e+01,\n", + " 3.61666667e+01, 4.36666667e+01, 3.45000000e+01, 3.93333333e+01,\n", + " 3.11666667e+01, 2.49666667e+02],\n", + " [2.48166667e+02, 2.71666667e+02, 2.77500000e+02, 2.98000000e+02,\n", + " 2.33666667e+02, 2.71166667e+02, 2.16333333e+02, 3.19844767e+06,\n", + " 6.95656500e+05, 3.34890967e+06, 3.43751900e+06, 3.32295950e+06,\n", + " 3.35862067e+06, 3.22432333e+06, 7.11666667e+00, 5.56666667e+00,\n", + " 8.83333333e+00, 3.13541283e+06, 4.15000000e+01, 4.56666667e+01,\n", + " 4.65000000e+01, 4.96666667e+01, 3.93333333e+01, 4.55000000e+01,\n", + " 3.61666667e+01, 2.97666667e+02],\n", + " [2.53166667e+02, 2.56166667e+02, 2.42666667e+02, 2.65333333e+02,\n", + " 2.11666667e+02, 2.26666667e+02, 1.81000000e+02, 3.19869117e+06,\n", + " 6.95911333e+05, 3.34915717e+06, 3.43778050e+06, 3.32316850e+06,\n", + " 3.35884883e+06, 3.22450217e+06, 6.71666667e+00, 5.16666667e+00,\n", + " 8.43333333e+00, 3.13700133e+06, 4.23333333e+01, 4.28333333e+01,\n", + " 4.05000000e+01, 4.41666667e+01, 3.53333333e+01, 3.78333333e+01,\n", + " 3.03333333e+01, 2.68000000e+02],\n", + " [3.05000000e+02, 3.12333333e+02, 3.46166667e+02, 3.29833333e+02,\n", + " 2.80666667e+02, 3.08833333e+02, 2.71833333e+02, 3.19897850e+06,\n", + " 6.96195833e+05, 3.34945200e+06, 3.43807767e+06, 3.32340933e+06,\n", + " 3.35910983e+06, 3.22471400e+06, 7.20000000e+00, 5.28333333e+00,\n", + " 9.08333333e+00, 3.13884333e+06, 5.05000000e+01, 5.11666667e+01,\n", + " 5.55000000e+01, 5.36666667e+01, 4.61666667e+01, 4.96666667e+01,\n", + " 4.11666667e+01, 3.41833333e+02]])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.preprocessing.MinMaxScaler\n", + "\n", + "* Input: (imputed, array)\n", + "* Output: readings (scaled, array)\n", + "* Effect: readings have been scaled to [-1, 1] range" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.26126126, -0.23706897, -0.20870076, -0.14106583, -0.32328767,\n", + " -0.25969448, -0.42198789, -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. 
, -0.11007463,\n", + " -0.16824645, -0.10424155, -0.37397741, -0.25233645, -0.22716628,\n", + " -0.20140515, -0.13481829, -0.32239156, -0.25380117, -0.4182243 ,\n", + " -0.25697453],\n", + " [-0.53349001, -0.47805643, -0.49088771, -0.38557994, -0.51037182,\n", + " -0.44339992, -0.55438391, -0.99983031, -0.99982547, -0.99982499,\n", + " -0.99980741, -0.9998428 , -0.99983779, -0.99986887, -0.23507463,\n", + " -0.26777251, -0.25233645, -0.37363511, -0.52570093, -0.470726 ,\n", + " -0.4824356 , -0.37866354, -0.50762016, -0.44093567, -0.55373832,\n", + " -0.48085254],\n", + " [-0.41441441, -0.35971787, -0.3462669 , -0.29780564, -0.44735812,\n", + " -0.36036036, -0.48486624, -0.99967026, -0.99965608, -0.99965576,\n", + " -0.99961813, -0.99968416, -0.99965569, -0.99971512, -0.20335821,\n", + " -0.20853081, -0.2379583 , -0.37328583, -0.4088785 , -0.34894614,\n", + " -0.33723653, -0.29425557, -0.43962485, -0.35438596, -0.48364486,\n", + " -0.38104315],\n", + " [-0.40266353, -0.39615987, -0.4281795 , -0.37460815, -0.49902153,\n", + " -0.4649432 , -0.56766257, -0.99949857, -0.99948535, -0.99949373,\n", + " -0.999451 , -0.99954455, -0.99950765, -0.99959435, -0.24813433,\n", + " -0.26540284, -0.27246585, -0.37296782, -0.39719626, -0.38875878,\n", + " -0.42154567, -0.37162954, -0.49589683, -0.4619883 , -0.56542056,\n", + " -0.4427309 ],\n", + " [-0.28084606, -0.26410658, -0.18479326, -0.22296238, -0.3369863 ,\n", + " -0.27183705, -0.35481351, -0.99929598, -0.99929474, -0.99930071,\n", + " -0.99926107, -0.99938368, -0.99933831, -0.9994513 , -0.19402985,\n", + " -0.24881517, -0.21639109, -0.37259906, -0.28271028, -0.27166276,\n", + " -0.21077283, -0.23798359, -0.34349355, -0.29590643, -0.4135514 ,\n", + " -0.28920464]])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame\n", + "\n", + "* Input: readings (scaled, array)\n", + "* Output: readings (dataframe)\n", + "* Effect: readings have been converted into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...16171819202122232425
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.104242-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.252336-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.237958-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.272466-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.216391-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", + "\n", + " 7 8 9 ... 16 17 18 19 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.104242 -0.373977 -0.252336 -0.227166 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.252336 -0.373635 -0.525701 -0.470726 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.237958 -0.373286 -0.408879 -0.348946 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.272466 -0.372968 -0.397196 -0.388759 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.216391 -0.372599 -0.282710 -0.271663 \n", + "\n", + " 20 21 22 23 24 25 \n", + "0 -0.201405 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 \n", + "1 -0.482436 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 \n", + "2 -0.337237 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 \n", + "3 -0.421546 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 \n", + "4 -0.210773 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe)\n", + "* Output: readings (dataframe with turbine_id)\n", + "* Effect: turbine_id has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "step = 7\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...171819202122232425turbine_id
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T001
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T001
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T001
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T001
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T001
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", + "\n", + " 7 8 9 ... 17 18 19 20 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373977 -0.252336 -0.227166 -0.201405 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.373635 -0.525701 -0.470726 -0.482436 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.373286 -0.408879 -0.348946 -0.337237 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.372968 -0.397196 -0.388759 -0.421546 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.372599 -0.282710 -0.271663 -0.210773 \n", + "\n", + " 21 22 23 24 25 turbine_id \n", + "0 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 T001 \n", + "1 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 T001 \n", + "2 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 T001 \n", + "3 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 T001 \n", + "4 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 T001 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe with turbine_id)\n", + "* Output: readings (dataframe with turbine_id and timestamp)\n", + "* Effect: timestamp has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "step = 8\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...1819202122232425turbine_idtimestamp
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T0012013-01-10 00:00:00
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T0012013-01-10 01:00:00
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T0012013-01-10 02:00:00
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T0012013-01-10 03:00:00
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T0012013-01-10 04:00:00
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", + "\n", + " 7 8 9 ... 18 19 20 21 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.252336 -0.227166 -0.201405 -0.134818 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.525701 -0.470726 -0.482436 -0.378664 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.408879 -0.348946 -0.337237 -0.294256 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.397196 -0.388759 -0.421546 -0.371630 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.282710 -0.271663 -0.210773 -0.237984 \n", + "\n", + " 22 23 24 25 turbine_id timestamp \n", + "0 -0.322392 -0.253801 -0.418224 -0.256975 T001 2013-01-10 00:00:00 \n", + "1 -0.507620 -0.440936 -0.553738 -0.480853 T001 2013-01-10 01:00:00 \n", + "2 -0.439625 -0.354386 -0.483645 -0.381043 T001 2013-01-10 02:00:00 \n", + "3 -0.495897 -0.461988 -0.565421 -0.442731 T001 2013-01-10 03:00:00 \n", + "4 -0.343494 -0.295906 -0.413551 -0.289205 T001 2013-01-10 04:00:00 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", + "\n", + "* Input: X, readings (dataframe with turbine_id and timestamp)\n", + "* Output: X\n", + "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", + " (window_size x num_signals) for each one of the target times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline._pipeline.get_hyperparameters()[\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "step = 9\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(8521, 28)" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353,)" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['y'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353, 24, 26)" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.58793576, -0.60305643, -0.63981971, -0.61481191, -0.69823875,\n", + " -0.65021543, -0.68912322, -0.99436914, -0.99439755, -0.99454249,\n", + " -0.99446788, -0.99476185, -0.99490997, -0.99529511, -0.34701493,\n", + " -0.33886256, -0.33860532, -0.36301186, -0.57943925, -0.59250585,\n", + " -0.6323185 , -0.60609613, -0.69284877, -0.64444444, -0.68691589,\n", + " -0.63853752],\n", + " [-0.56600078, -0.5846395 , -0.63002156, -0.61559561, -0.70880626,\n", + " -0.66392479, -0.69732474, -0.9942427 , -0.99427986, -0.9944408 ,\n", + " -0.99436498, -0.99468147, -0.99482011, -0.99521249, -0.33955224,\n", + " -0.31516588, -0.38892883, -0.36280656, -0.55841121, -0.57611241,\n", + " -0.62295082, -0.61078546, -0.70222743, -0.65847953, -0.69392523,\n", + " -0.63645815],\n", + " [-0.64081473, -0.64184953, -0.67038997, -0.63597179, -0.71350294,\n", + " -0.65844105, -0.66764304, -0.99412236, -0.99416864, -0.99434228,\n", + " -0.99426059, -0.99459663, -0.99472365, -0.99511795, -0.34328358,\n", + " -0.30094787, -0.36304817, -0.36259859, -0.63317757, -0.6323185 ,\n", + " -0.66042155, -0.62954279, -0.70926143, -0.65380117, -0.66588785,\n", + " -0.66002426]])" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'][0][:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## keras.Sequential.LSTMTimeSeriesClassifier\n", + "\n", + "* Input: X, y\n", + "* Output: \n", + "* Effect: LSTM has been fitted." 
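A conceptual sketch of the windowing step can make the (353, 24, 26) shape above easier to read. This is not the primitive's actual implementation; `target_times` here stands for the (turbine_id, cutoff_time) pairs the pipeline carries in X, and the real primitive also handles targets whose window is incomplete:

```python
import numpy as np

def window_before_cutoff(readings, cutoff_time, window_size=24):
    """Last `window_size` rows of signal values strictly before the cutoff."""
    before = readings[readings['timestamp'] < cutoff_time]
    window = before.tail(window_size)
    return window.drop(columns=['turbine_id', 'timestamp']).values

# Conceptually, X stacks one (window_size x num_signals) matrix per target
# time, which is why the result has shape
# (num_targets, window_size, num_signals) = (353, 24, 26).
X = np.stack([
    window_before_cutoff(readings[readings['turbine_id'] == row.turbine_id],
                         row.cutoff_time)
    for row in target_times.itertuples()
])
```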
+ ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "If using Keras pass *_constraint arguments to layers.\n", + "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", + "\n" + ] + } + ], + "source": [ + "step = 10\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb new file mode 100644 index 0000000..6af0092 --- /dev/null +++ b/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb @@ -0,0 +1,1785 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# unstack_normalize_dfs_xgb_classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], + "source": [ + "from greenguard.demo import load_demo\n", + "\n", + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'unstack_normalize_dfs_xgb_classifier'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from greenguard.pipeline import GreenGuardPipeline\n", + "\n", + "pipeline = GreenGuardPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['pandas.DataFrame.resample',\n", + " 'pandas.DataFrame.unstack',\n", + " 'featuretools.EntitySet.entity_from_dataframe',\n", + " 'featuretools.EntitySet.normalize_entity',\n", + " 'featuretools.dfs',\n", + " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", + " 'xgboost.XGBClassifier']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation (part of GreenGuard Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "step = 0\n", + "context = pipeline.fit(target_times, readings, output_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00323.0
2013-01-10 00:10:00346.0
2013-01-10 00:20:00407.0
2013-01-10 00:30:00257.0
2013-01-10 00:40:00267.0
\n", + "
" + ], + "text/plain": [ + " value\n", + "turbine_id signal_id timestamp \n", + "T001 S01 2013-01-10 00:00:00 323.0\n", + " 2013-01-10 00:10:00 346.0\n", + " 2013-01-10 00:20:00 407.0\n", + " 2013-01-10 00:30:00 257.0\n", + " 2013-01-10 00:40:00 267.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0...11.73131020.055.055.047.058.045.058.047.0356.0
1T0012013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0...10.23131420.058.063.062.067.055.061.042.0400.0
2T0012013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0...9.53131822.068.061.067.066.046.055.045.0402.0
3T0012013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0...10.53132179.043.051.053.062.053.060.045.0357.0
4T0012013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", + "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", + "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", + "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", + "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", + "\n", + " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", + "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", + "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", + "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", + "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", + "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.EntitySet.entity_from_dataframe\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: entityset\n", + "* Effect: Entityset has been generated from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entityset: entityset\n", + " Entities:\n", + " readings [Rows: 51121, Columns: 29]\n", + " Relationships:\n", + " No relationships" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['entityset']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.EntitySet.normalize_entity\n", + "\n", + "* Input: entityset\n", + "* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)\n", + "* Effect: establish relation between readings and turbines" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Entityset: entityset\n", + " Entities:\n", + " readings [Rows: 51121, Columns: 29]\n", 
+ " turbines [Rows: 1, Columns: 1]\n", + " Relationships:\n", + " readings.turbine_id -> turbines.turbine_id" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['entityset']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## featuretools.dfs\n", + "\n", + "* Input: entityset (unstacked, no turbine_id, no timestamp)\n", + "* Output: X (has additional features)\n", + "* Effect: build features for relational dataset using DFS" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys() " + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUM(readings.value_S09)SUM(readings.value_S01)SUM(readings.value_S12)SUM(readings.value_S10)SUM(readings.value_S18)SUM(readings.value_S03)SUM(readings.value_S16)SUM(readings.value_S11)SUM(readings.value_S21)SUM(readings.value_S08)...MEAN(readings.value_S20)COUNT(readings)NUM_UNIQUE(readings.WEEKDAY(timestamp))NUM_UNIQUE(readings.DAY(timestamp))NUM_UNIQUE(readings.YEAR(timestamp))NUM_UNIQUE(readings.MONTH(timestamp))MODE(readings.WEEKDAY(timestamp))MODE(readings.DAY(timestamp))MODE(readings.YEAR(timestamp))MODE(readings.MONTH(timestamp))
turbine_id
T001102204875.019558.0483068250.0486911931.0463347422.018602.0555.2499808026.03090.0465058755.0...22.406897145221141120131
T001102808505.037965.0483585662.0487487610.0467167621.034495.0719.2500401347.04970.0465669184.0...35.282759145221151220131
T001103701788.073948.0484538080.0488531121.0473938223.077804.0921.1501472849.09902.0466675578.0...53.255172145221161320131
T001104917985.087206.0486012792.0490024295.0483808936.081629.0977.2502994331.010720.0468099974.0...61.482759145221101420131
T00184328762.061778.0389879083.0396521849.0492596536.065122.0954.3403671026.08684.0375635231.0...87.315789145221111520131
\n", + "

5 rows × 165 columns

\n", + "
" + ], + "text/plain": [ + " SUM(readings.value_S09) SUM(readings.value_S01) \\\n", + "turbine_id \n", + "T001 102204875.0 19558.0 \n", + "T001 102808505.0 37965.0 \n", + "T001 103701788.0 73948.0 \n", + "T001 104917985.0 87206.0 \n", + "T001 84328762.0 61778.0 \n", + "\n", + " SUM(readings.value_S12) SUM(readings.value_S10) \\\n", + "turbine_id \n", + "T001 483068250.0 486911931.0 \n", + "T001 483585662.0 487487610.0 \n", + "T001 484538080.0 488531121.0 \n", + "T001 486012792.0 490024295.0 \n", + "T001 389879083.0 396521849.0 \n", + "\n", + " SUM(readings.value_S18) SUM(readings.value_S03) \\\n", + "turbine_id \n", + "T001 463347422.0 18602.0 \n", + "T001 467167621.0 34495.0 \n", + "T001 473938223.0 77804.0 \n", + "T001 483808936.0 81629.0 \n", + "T001 492596536.0 65122.0 \n", + "\n", + " SUM(readings.value_S16) SUM(readings.value_S11) \\\n", + "turbine_id \n", + "T001 555.2 499808026.0 \n", + "T001 719.2 500401347.0 \n", + "T001 921.1 501472849.0 \n", + "T001 977.2 502994331.0 \n", + "T001 954.3 403671026.0 \n", + "\n", + " SUM(readings.value_S21) SUM(readings.value_S08) ... \\\n", + "turbine_id ... \n", + "T001 3090.0 465058755.0 ... \n", + "T001 4970.0 465669184.0 ... \n", + "T001 9902.0 466675578.0 ... \n", + "T001 10720.0 468099974.0 ... \n", + "T001 8684.0 375635231.0 ... \n", + "\n", + " MEAN(readings.value_S20) COUNT(readings) \\\n", + "turbine_id \n", + "T001 22.406897 145 \n", + "T001 35.282759 145 \n", + "T001 53.255172 145 \n", + "T001 61.482759 145 \n", + "T001 87.315789 145 \n", + "\n", + " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.WEEKDAY(timestamp)) MODE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 4 11 \n", + "T001 5 12 \n", + "T001 6 13 \n", + "T001 0 14 \n", + "T001 1 15 \n", + "\n", + " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \n", + "turbine_id \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "\n", + "[5 rows x 165 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "165" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# features generated (the turbine_id is set as index).\n", + "len(context['X'].columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlprimitives.custom.feature_extraction.CategoricalEncoder\n", + "\n", + "* Input: X\n", + "* Output: X (label encoded)\n", + "* Effect: encodes categorical features using OneHotLabelEncoder" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUM(readings.value_S09)SUM(readings.value_S01)SUM(readings.value_S12)SUM(readings.value_S10)SUM(readings.value_S18)SUM(readings.value_S03)SUM(readings.value_S16)SUM(readings.value_S11)SUM(readings.value_S21)SUM(readings.value_S08)...MEAN(readings.value_S20)COUNT(readings)NUM_UNIQUE(readings.WEEKDAY(timestamp))NUM_UNIQUE(readings.DAY(timestamp))NUM_UNIQUE(readings.YEAR(timestamp))NUM_UNIQUE(readings.MONTH(timestamp))MODE(readings.WEEKDAY(timestamp))MODE(readings.DAY(timestamp))MODE(readings.YEAR(timestamp))MODE(readings.MONTH(timestamp))
turbine_id
T001102204875.019558.0483068250.0486911931.0463347422.018602.0555.2499808026.03090.0465058755.0...22.406897145221141120131
T001102808505.037965.0483585662.0487487610.0467167621.034495.0719.2500401347.04970.0465669184.0...35.282759145221151220131
T001103701788.073948.0484538080.0488531121.0473938223.077804.0921.1501472849.09902.0466675578.0...53.255172145221161320131
T001104917985.087206.0486012792.0490024295.0483808936.081629.0977.2502994331.010720.0468099974.0...61.482759145221101420131
T00184328762.061778.0389879083.0396521849.0492596536.065122.0954.3403671026.08684.0375635231.0...87.315789145221111520131
\n", + "

5 rows × 165 columns

\n", + "
" + ], + "text/plain": [ + " SUM(readings.value_S09) SUM(readings.value_S01) \\\n", + "turbine_id \n", + "T001 102204875.0 19558.0 \n", + "T001 102808505.0 37965.0 \n", + "T001 103701788.0 73948.0 \n", + "T001 104917985.0 87206.0 \n", + "T001 84328762.0 61778.0 \n", + "\n", + " SUM(readings.value_S12) SUM(readings.value_S10) \\\n", + "turbine_id \n", + "T001 483068250.0 486911931.0 \n", + "T001 483585662.0 487487610.0 \n", + "T001 484538080.0 488531121.0 \n", + "T001 486012792.0 490024295.0 \n", + "T001 389879083.0 396521849.0 \n", + "\n", + " SUM(readings.value_S18) SUM(readings.value_S03) \\\n", + "turbine_id \n", + "T001 463347422.0 18602.0 \n", + "T001 467167621.0 34495.0 \n", + "T001 473938223.0 77804.0 \n", + "T001 483808936.0 81629.0 \n", + "T001 492596536.0 65122.0 \n", + "\n", + " SUM(readings.value_S16) SUM(readings.value_S11) \\\n", + "turbine_id \n", + "T001 555.2 499808026.0 \n", + "T001 719.2 500401347.0 \n", + "T001 921.1 501472849.0 \n", + "T001 977.2 502994331.0 \n", + "T001 954.3 403671026.0 \n", + "\n", + " SUM(readings.value_S21) SUM(readings.value_S08) ... \\\n", + "turbine_id ... \n", + "T001 3090.0 465058755.0 ... \n", + "T001 4970.0 465669184.0 ... \n", + "T001 9902.0 466675578.0 ... \n", + "T001 10720.0 468099974.0 ... \n", + "T001 8684.0 375635231.0 ... \n", + "\n", + " MEAN(readings.value_S20) COUNT(readings) \\\n", + "turbine_id \n", + "T001 22.406897 145 \n", + "T001 35.282759 145 \n", + "T001 53.255172 145 \n", + "T001 61.482759 145 \n", + "T001 87.315789 145 \n", + "\n", + " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.WEEKDAY(timestamp)) MODE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 4 11 \n", + "T001 5 12 \n", + "T001 6 13 \n", + "T001 0 14 \n", + "T001 1 15 \n", + "\n", + " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \n", + "turbine_id \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "\n", + "[5 rows x 165 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0...11.73131020.055.055.047.058.045.058.047.0356.0
1T0012013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0...10.23131420.058.063.062.067.055.061.042.0400.0
2T0012013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0...9.53131822.068.061.067.066.046.055.045.0402.0
3T0012013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0...10.53132179.043.051.053.062.053.060.045.0357.0
4T0012013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", + "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", + "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", + "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", + "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", + "\n", + " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", + "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", + "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", + "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", + "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", + "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## xgboost.XGBClassifier\n", + "\n", + "* Input: X (label encoded and featurized)\n", + "* Output: None\n", + "* Effect: trained model" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0360fe2fa8b5d53f28f996e38bec8d4ca66798f8 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 23 Oct 2020 10:47:54 +0200 Subject: [PATCH 118/171] Fix mkdirs --- greenguard/benchmark.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index eff50bc..89d1076 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -586,7 +586,8 @@ def run_benchmark(templates, problems, window_size_resample_rule=None, results = pd.concat(results, ignore_index=True) if output_path: - os.makedirs(output_path, exist_ok=True) + if os.path.dirname(output_path): + os.makedirs(os.path.dirname(output_path), exist_ok=True) results.to_csv(output_path, index=False) else: From 704ad911d15faf18943daf2b1caa7a096ed46e83 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 23 Oct 2020 10:59:48 +0200 Subject: [PATCH 119/171] Add release notes for v0.2.6 --- HISTORY.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index 
d0c8364..ef6042e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,14 @@ # History +## 0.2.6 - 2020-10-23 + +* Fix ``mkdir`` when exporting the benchmark results to a ``csv`` file. +* Add intermediate steps for the pipelines, with a demo notebook for each pipeline. + +### Resolved Issues + +* Issue #50: Expose partial outputs and executions in the ``GreenGuardPipeline``. + ## 0.2.5 - 2020-10-09 With this release we include:
From 5fb8e05c872b635b585163b9ab5b62320d3b08fc Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 23 Oct 2020 10:59:52 +0200 Subject: [PATCH 120/171] =?UTF-8?q?Bump=20version:=200.2.6.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 662545d..4ff2249 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.6.dev0' +__version__ = '0.2.6' import os
diff --git a/setup.cfg b/setup.cfg index 919f5d6..4a8c235 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.6.dev0 +current_version = 0.2.6 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py index 3bce783..a056e0d 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.6.dev0', + version='0.2.6', zip_safe=False, )
From bd035ec11de791205c0f1b2b44d7c0765913191a Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 23 Oct 2020 11:12:05 +0200 Subject: [PATCH 121/171] =?UTF-8?q?Bump=20version:=200.2.6=20=E2=86=92=200?= =?UTF-8?q?.2.7.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 4ff2249..63ff9ee 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.6' +__version__ = '0.2.7.dev0' import os
diff --git a/setup.cfg b/setup.cfg index 4a8c235..86de050 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.6 +current_version = 0.2.7.dev0 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py index a056e0d..63b5ae7 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.6', + version='0.2.7.dev0', zip_safe=False, )
From 486ca5c6303c97c6a6881a59abec1e31bc3f7547 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Tue, 1 Dec 2020 12:13:45 +0100 Subject: [PATCH 122/171] Update dependencies, python and tox (#53) * Update requirements and test environments. * Add system dependency for xgboost. * Update the available Python versions.
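The `install-minimum` target added to the Makefile below derives the minimum supported version of every dependency by rewriting the `install_requires` specifiers from `setup.py`. As a rough sketch of what the `sed | tr` recipe computes (this helper is illustrative only and is not part of the patch):

```python
def minimum_pins(install_requires):
    # Mirror the Makefile recipe: turning every '>' into '=' rewrites the
    # '>=X' lower bounds as '==X' pins, so pip resolves the oldest
    # supported version of each dependency.
    return [req.replace('>', '=') for req in install_requires]

print(minimum_pins(['baytune>=0.3.13.dev0,<0.4', 'numpy<1.19.0,>=1.16.0']))
# ['baytune==0.3.13.dev0,<0.4', 'numpy<1.19.0,==1.16.0']
```

Running `pip install` on the rewritten specifiers then installs exactly those minimum versions, which is what the new `test-minimum` target exercises in CI.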
--- .github/workflows/tests.yml | 83 +++++++++++++++++++++++++++++++++++-- Makefile | 47 +++++++++++++-------- README.md | 2 +- setup.py | 14 +++---- tox.ini | 40 ++++++++++++------ 5 files changed, 145 insertions(+), 41 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 135d2a5..97dbb0e 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,25 +7,100 @@ on: branches: [ master ] jobs: - build: + lint: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7] + python-version: [3.6, 3.7, 3.8] os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e lint + readme: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v1 with: python-version: ${{ matrix.python-version }} + - name: Install libgomp1 + run: | + sudo apt-get install libgomp1 + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e readme + unit: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | - sudo apt-get install pandoc python -m pip install --upgrade pip pip install tox tox-gh-actions + - name: Test with tox + run: tox -e unit + minimum: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions + - name: Test with tox + run: tox -e minimum + + tutorials: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install tox tox-gh-actions - name: Test with tox - run: tox + run: tox -e tutorials diff --git a/Makefile b/Makefile index ea625f3..2b2d2f8 100644 --- a/Makefile +++ b/Makefile @@ -49,9 +49,7 @@ clean-pyc: ## remove Python file artifacts .PHONY: clean-docs clean-docs: ## remove previously built docs - rm -f docs/api/*.rst - rm -rf docs/tutorials - -$(MAKE) -C docs clean 2>/dev/null # this fails if sphinx is not yet installed + rm -rf docs/api/ docs/api_reference/api/ docs/tutorials docs/build docs/_build .PHONY: clean-coverage clean-coverage: ## remove coverage artifacts @@ -82,21 +80,38 @@ install-test: clean-build clean-pyc ## install the package and test dependencies install-develop: clean-build clean-pyc ## install the package in editable mode and dependencies for development pip 
install -e .[dev] +MINIMUM := $(shell sed -n '/install_requires = \[/,/]/p' setup.py | grep -v -e '[][]' | sed 's/ *\(.*\),$?$$/\1/g' | tr '>' '=') + +.PHONY: install-minimum +install-minimum: ## install the minimum supported versions of the package dependencies + pip install $(MINIMUM) + # LINT TARGETS +.PHONY: lint-greenguard +lint-greenguard: ## check style with flake8 and isort + flake8 greenguard + isort -c --recursive greenguard + +.PHONY: lint-tests +lint-tests: ## check style with flake8 and isort + flake8 --ignore=D,SFS2 tests + isort -c --recursive tests + +.PHONY: check-dependencies +check-dependencies: ## test if there are any broken dependencies + pip check + .PHONY: lint -lint: ## check style with flake8 and isort - flake8 greenguard tests - isort -c --recursive greenguard tests +lint: check-dependencies lint-greenguard lint-tests ## Run all code style and static testing validations .PHONY: fix-lint fix-lint: ## fix lint issues using autoflake, autopep8, and isort - find greenguard tests -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables - autopep8 --in-place --recursive --aggressive greenguard tests + find greenguard -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables + autopep8 --in-place --recursive --aggressive greenguard isort --apply --atomic --recursive greenguard tests - # TEST TARGETS .PHONY: test-unit @@ -111,13 +126,14 @@ test-readme: ## run the readme snippets .PHONY: test-tutorials test-tutorials: ## run the tutorial notebooks - jupyter nbconvert --execute --ExecutePreprocessor.timeout=600 tutorials/*.ipynb --stdout > /dev/null + find tutorials -path "*/.ipynb_checkpoints" -prune -false -o -name "*.ipynb" -exec \ + jupyter nbconvert --execute --ExecutePreprocessor.timeout=3600 --to=html --stdout {} > /dev/null \; .PHONY: test -test: test-unit test-readme ## test everything that needs test dependencies +test: test-unit test-readme test-tutorials ## test everything that needs test dependencies -.PHONY: test-devel -test-devel: lint docs ## test everything that needs development dependencies +.PHONY: test-minimum +test-minimum: install-minimum check-dependencies test-unit ## run tests using the minimum supported dependencies .PHONY: test-all test-all: ## run tests on every Python version with tox @@ -130,17 +146,14 @@ coverage: ## check code coverage quickly with the default Python coverage html $(BROWSER) htmlcov/index.html - # DOCS TARGETS .PHONY: docs docs: clean-docs ## generate Sphinx HTML documentation, including API docs - cp -r tutorials docs/tutorials - sphinx-apidoc --separate --no-toc -o docs/api/ greenguard $(MAKE) -C docs html .PHONY: view-docs -view-docs: docs ## view docs in browser +view-docs: ## view the docs in a browser $(BROWSER) docs/_build/html/index.html .PHONY: serve-docs diff --git a/README.md b/README.md index adbc11e..1f7551c 100644 --- a/README.md +++ b/README.md @@ -53,7 +53,7 @@ The salient aspects of this customized project are: ## Requirements -**GreenGuard** has been developed and runs on Python 3.6 and 3.7. +**GreenGuard** has been developed and runs on Python 3.6, 3.7 and 3.8.
Also, although it is not strictly required, the usage of a [virtualenv]( https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering diff --git a/setup.py b/setup.py index 63b5ae7..8837963 100644 --- a/setup.py +++ b/setup.py @@ -16,18 +16,18 @@ history = '' install_requires = [ - 'baytune>=0.3.9,<0.4', + 'baytune>=0.3.13.dev0,<0.4', 'mlblocks>=0.3.4,<0.4', - 'mlprimitives>=0.2.5,<0.3', + 'mlprimitives>=0.2.6.dev0,<0.3', 'pymongo>=3.7.2,<4', - 'scikit-learn>=0.20.0,<0.21', + 'scikit-learn>=0.21', 'tqdm<4.50.0,>=4.36.1', 'cloudpickle>=1.6,<2', 'scipy>=1.0.1,<1.4.0', - 'numpy>=1.15.4,<1.17', - 'pandas>=0.23.4,<0.25', + 'numpy<1.19.0,>=1.16.0', + 'pandas>=1,<2', 'dask>=2.6.0,<3', - 'Keras>=2.1.6,<2.4', + 'Keras>=2.4', 'tabulate>=0.8.3,<0.9', 'xlsxwriter>=1.3.6,<1.4', 'boto3==1.14.44', @@ -106,7 +106,7 @@ long_description_content_type='text/markdown', name='greenguard', packages=find_packages(include=['greenguard', 'greenguard.*']), - python_requires='>=3.6,<3.8', + python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index 91af938..0068931 100644 --- a/tox.ini +++ b/tox.ini @@ -1,25 +1,41 @@ +[testenv:docs] +skipsdist = true +extras = dev +commands = + /usr/bin/env make docs + [tox] -envlist = py{36,37}, test-devel +envlist = py3{6,7,8}-{lint,readme,unit,minimum} [travis] python = - 3.7: py37, test-devel - 3.6: py36 + 3.8: py38-lint, py38-readme, py38-unit, py38-minimum, py38-tutorials + 3.7: py37-lint, py37-readme, py37-unit, py37-minimum, py37-tutorials + 3.6: py36-lint, py36-readme, py36-unit, py36-minimum, py36-tutorials [gh-actions] python = - 3.7: py37, test-devel - 3.6: py36 + 3.8: py38-lint, py38-readme, py38-unit, py38-minimum, py38-tutorials + 3.7: py37-lint, py37-readme, py37-unit, py37-minimum, py37-tutorials + 3.6: py36-lint, py36-readme, py36-unit, py36-minimum, py36-tutorials [testenv] passenv = CI TRAVIS TRAVIS_* skipsdist = false skip_install = false -extras = test -commands = - /usr/bin/env make test - -[testenv:test-devel] -extras = dev +deps = + readme: rundoc + tutorials: jupyter +extras = + lint: dev + unit: test + minimum: test commands = - /usr/bin/env make test-devel + lint: /usr/bin/env make lint + readme: /usr/bin/env make test-readme + unit: /usr/bin/env make test-unit + minimum: /usr/bin/env make test-minimum + tutorials: /usr/bin/env make test-tutorials + rm -r {envdir} +whitelist_externals = + rm
From b4b4cd98639185ace7e984190e081341bf6019cf Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 1 Dec 2020 12:17:37 +0100 Subject: [PATCH 123/171] =?UTF-8?q?Bump=20version:=200.2.7.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.7.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 63ff9ee..a027a27 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.7.dev0' +__version__ = '0.2.7.dev1' import os
diff --git a/setup.cfg b/setup.cfg index 86de050..84c637d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.7.dev0 +current_version = 0.2.7.dev1 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
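The ``parse`` expression above is what ``bumpversion`` uses to split a version string such as ``0.2.7.dev1`` into its components before incrementing one of them. A quick sketch of the match, assuming the standard group names (``major``, ``minor``, ``patch``, ``release``, ``candidate``) that the stripped markup in this hunk originally carried:

```python
import re

PARSE = r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'

print(re.match(PARSE, '0.2.7.dev1').groupdict())
# {'major': '0', 'minor': '2', 'patch': '7', 'release': 'dev', 'candidate': '1'}
print(re.match(PARSE, '0.2.7').groupdict()['release'])
# None -- the dev suffix is optional, so plain releases also match
```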
diff --git a/setup.py b/setup.py index 8837963..b50e4a5 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.7.dev0', + version='0.2.7.dev1', zip_safe=False, )
From 27d362a7e9f1f18635972e9c2aec93a26cd42b5a Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 1 Dec 2020 15:06:51 +0100 Subject: [PATCH 124/171] Update scipy version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/setup.py b/setup.py index b50e4a5..6746be9 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ 'scikit-learn>=0.21', 'tqdm<4.50.0,>=4.36.1', 'cloudpickle>=1.6,<2', - 'scipy>=1.0.1,<1.4.0', + 'scipy>=1.0.1,<2', 'numpy<1.19.0,>=1.16.0', 'pandas>=1,<2', 'dask>=2.6.0,<3',
From bf9e31ec7f9698e9c976b13dcfcfa13f0fb59dc3 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Tue, 1 Dec 2020 15:07:43 +0100 Subject: [PATCH 125/171] =?UTF-8?q?Bump=20version:=200.2.7.dev1=20?= =?UTF-8?q?=E2=86=92=200.2.7.dev2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/greenguard/__init__.py b/greenguard/__init__.py index a027a27..abea0c2 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.7.dev1' +__version__ = '0.2.7.dev2' import os
diff --git a/setup.cfg b/setup.cfg index 84c637d..724b591 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.7.dev1 +current_version = 0.2.7.dev2 commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py index 6746be9..d3fcf1a 100644 --- a/setup.py +++ b/setup.py @@ -111,6 +111,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.7.dev1', + version='0.2.7.dev2', zip_safe=False, )
From 007bd5e60d415b533da1c96dfa9ca6c8a44cdc17 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Fri, 4 Dec 2020 12:53:05 +0100 Subject: [PATCH 126/171] Add threshold (#55) * Add threshold to the GreenGuardPipeline * Update description. * Curate docstrings --- greenguard/pipeline.py | 34 +++++++++++++++++++++++++++++----- 1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index 34504e3..ba68278 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -150,6 +150,9 @@ def generate_preprocessing(templates_names, preprocessing): return preprocessing +SELF_THRESHOLD = object() + + class GreenGuardPipeline(object): """Main Machine Learning component in the GreenGuard project. @@ -228,6 +231,11 @@ class GreenGuardPipeline(object): cache_path (str): If given, cache the generated cross validation splits in this folder. Defaults to ``None``. + threshold (float): + If ``None``, return the raw predictions as given by the pipeline. If not ``None``, + use the given value as a threshold to convert the predicted probabilities into + a binary output that indicates whether each probability is greater than or equal + to the threshold. Defaults to ``None``.
""" template = None @@ -304,8 +312,9 @@ def _build_pipeline(self): self.fitted = False - def __init__(self, templates, metric='accuracy', cost=False, init_params=None, stratify=True, - cv_splits=5, shuffle=True, random_state=0, preprocessing=0, cache_path=None): + def __init__(self, templates, metric='accuracy', cost=False, init_params=None, + stratify=True, cv_splits=5, shuffle=True, random_state=0, preprocessing=0, + cache_path=None, threshold=None): if isinstance(metric, str): metric, cost = METRICS[metric] @@ -314,6 +323,7 @@ def __init__(self, templates, metric='accuracy', cost=False, init_params=None, s self._cost = cost self._cv = self._get_cv(stratify, cv_splits, shuffle, random_state) self.cv_score = np.inf if cost else -np.inf + self.threshold = threshold if not isinstance(templates, list): templates = [templates] @@ -556,7 +566,7 @@ def fit(self, target_times=None, readings=None, turbines=None, return out def predict(self, target_times=None, readings=None, turbines=None, - start_=None, output_='default', **kwargs): + start_=None, output_='default', threshold=SELF_THRESHOLD, **kwargs): """Make predictions using this pipeline. Args: @@ -567,6 +577,13 @@ def predict(self, target_times=None, readings=None, turbines=None, ``readings`` table. turbines (pandas.DataFrame): ``turbines`` table. + threshold (float): + If not given, use the threshold specified upon instance creation in the + ``__init__``. If ``None``, return the raw predictions as given by the pipeline. + If not ``None``, use the given value as a threshold to convert the predicted + probabilities into a binary output that indicates whether the probability is above + the threshold (not strict) or below the threshold (strict). + Defaults to ``self.threshold``. Returns: numpy.ndarray: @@ -576,8 +593,15 @@ def predict(self, target_times=None, readings=None, turbines=None, raise NotFittedError() X = target_times[['turbine_id', 'cutoff_time']] - return self._pipeline.predict(X, readings=readings, turbines=turbines, - start_=start_, output_=output_, **kwargs) + predictions = self._pipeline.predict(X, readings=readings, turbines=turbines, + start_=start_, output_=output_, **kwargs) + if threshold is SELF_THRESHOLD: + threshold = self.threshold + + if threshold is not None: + predictions = predictions >= threshold + + return predictions def save(self, path): """Serialize and save this pipeline using cloudpickle. 
From e360694afa0a4e7f808b176a362c11f64f7e7d7b Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Mon, 7 Dec 2020 17:56:43 +0100 Subject: [PATCH 127/171] Add false positive rate metric (#56) * Add integration tests * Add FPR metric * Fix tests * Fix lint py37 * Add docstring --- greenguard/metrics.py | 41 +++++++++++++++++++++++++++++++++++++++-- tests/test_metrics.py | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 tests/test_metrics.py
diff --git a/greenguard/metrics.py b/greenguard/metrics.py index 54a151e..ef1c249 100644 --- a/greenguard/metrics.py +++ b/greenguard/metrics.py @@ -1,18 +1,55 @@ # -*- coding: utf-8 -*- +import logging + import numpy as np from sklearn.metrics import ( - accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score) + accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score, roc_curve) + +LOGGER = logging.getLogger(__name__) def f1_macro(exp, obs): return f1_score(exp, obs, average='macro') +def fpr_score(ground_truth, probabilities, tpr=1): + """Compute the False Positive Rate associated with the given True Positive Rate. + + This metric computes the False Positive Rate that needs to be assumed in order + to achieve the desired True Positive Rate. + The metric is computed by finding the minimum necessary threshold to ensure + that the TPR is satisfied and then computing the associated FPR. The final output + is 1 minus the found FPR to produce a maximization score between 0 and 1. + + Args: + ground_truth (numpy.ndarray): + ``numpy.ndarray`` of the known values for the given predictions. + probabilities (numpy.ndarray): + ``numpy.ndarray`` with the generated predictions in probability. + tpr (float): + ``float`` value representing the minimum True Positive Rate + that must be satisfied. + + Returns: + float: + Value between 0 and 1, where bigger is better.
+ """ + roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1) + try: + index = np.where(roc_tpr >= tpr)[0][0] + except: + LOGGER.warn('Could not find a threshold that satisfies the requested True Positive Rate') + index = -1 + + return 1 - roc_fpr[index] + + METRICS = { 'accuracy': (accuracy_score, False), 'f1': (f1_score, False), 'f1_macro': (f1_macro, False), 'r2': (r2_score, False), 'mse': (mean_squared_error, True), - 'mae': (mean_absolute_error, True) + 'mae': (mean_absolute_error, True), + 'fpr_score': (fpr_score, False) } diff --git a/tests/test_metrics.py b/tests/test_metrics.py new file mode 100644 index 0000000..ce14132 --- /dev/null +++ b/tests/test_metrics.py @@ -0,0 +1,39 @@ +import numpy as np + +from greenguard.metrics import fpr_score + + +def test_fpr_score_perfect_scenario(): + truth = [0, 0, 0, 1, 1, 1] + false_probs = [0.2, 0.4, 0.6] + true_probs = [0.8, 0.7, 0.9] + probs = np.concatenate([false_probs, true_probs]) + score = fpr_score(truth, probs, tpr=1) + assert score == 1 + + +def test_fpr_score_predict_over_half(): + truth = [0, 0, 0, 0, 1, 1, 1, 1] + false_probs = [0.1, 0.2, 0.4, 0.6] + true_probs = [0.5, 0.7, 0.8, 0.9] + probs = np.concatenate([false_probs, true_probs]) + score = fpr_score(truth, probs, tpr=1) + assert score == 0.75 + + +def test_fpr_score_predict_half(): + truth = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + false_probs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + true_probs = [0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + probs = np.concatenate([false_probs, true_probs]) + score = fpr_score(truth, probs, tpr=1) + assert score == 0.5 + + +def test_fpr_score_predict_one_third(): + truth = [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1] + false_probs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6] + true_probs = [0.3, 0.4, 0.5, 0.7, 0.8, 0.9] + probs = np.concatenate([false_probs, true_probs]) + score = fpr_score(truth, probs, tpr=1) + assert round(score, 4) == 0.3333 From 03c9838abb494e4c8f60f3710b3f14be2eb7cc35 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Mon, 7 Dec 2020 18:02:23 +0100 Subject: [PATCH 128/171] Add primitives and pipelines with proba (#54) * Add primitives module for GreenGuard and required primitives for the proba pipelines. * Add pipelines using predict_proba instead of predict. * Use numpy.take * Update primitives structure, readme and get primitives. * Fix get_pipelines. 
* Add contributor --- README.md | 14 +- greenguard/__init__.py | 1 + greenguard/pipeline.py | 15 +- .../normalize_dfs_xgb_classifier.json | 0 .../unstack_dfs_xgb_classifier.json | 0 ...ack_double_lstm_timeseries_classifier.json | 0 .../unstack_lstm_timeseries_classifier.json | 0 .../unstack_normalize_dfs_xgb_classifier.json | 0 .../normalize_dfs_xgb_classifier.json | 70 ++++ .../unstack_dfs_xgb_classifier.json | 83 ++++ ...ack_double_lstm_timeseries_classifier.json | 125 ++++++ .../unstack_lstm_timeseries_classifier.json | 125 ++++++ .../unstack_normalize_dfs_xgb_classifier.json | 74 ++++ greenguard/primitives/numpy.take.json | 40 ++ .../xgboost.XGBClassifier:probabilities.json | 93 +++++ setup.py | 3 +- .../01_GreenGuard_Machine_Learning.ipynb | 377 ++++++++---------- 17 files changed, 792 insertions(+), 228 deletions(-) rename greenguard/pipelines/{ => classes}/normalize_dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => classes}/unstack_dfs_xgb_classifier.json (100%) rename greenguard/pipelines/{ => classes}/unstack_double_lstm_timeseries_classifier.json (100%) rename greenguard/pipelines/{ => classes}/unstack_lstm_timeseries_classifier.json (100%) rename greenguard/pipelines/{ => classes}/unstack_normalize_dfs_xgb_classifier.json (100%) create mode 100644 greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json create mode 100644 greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json create mode 100644 greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json create mode 100644 greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json create mode 100644 greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json create mode 100644 greenguard/primitives/numpy.take.json create mode 100644 greenguard/primitives/xgboost.XGBClassifier:probabilities.json
diff --git a/README.md b/README.md index 1f7551c..0472817 100644 --- a/README.md +++ b/README.md @@ -225,18 +225,18 @@ The returned `pipeline` variable will be a `list` containing the names of all the pipelines available in the GreenGuard system: ``` -['unstack_double_lstm_timeseries_classifier', - 'unstack_lstm_timeseries_classifier', - 'unstack_normalize_dfs_xgb_classifier', - 'unstack_dfs_xgb_classifier', - 'normalize_dfs_xgb_classifier'] +['classes.unstack_double_lstm_timeseries_classifier', + 'classes.unstack_lstm_timeseries_classifier', + 'classes.unstack_normalize_dfs_xgb_classifier', + 'classes.unstack_dfs_xgb_classifier', + 'classes.normalize_dfs_xgb_classifier'] ``` For the rest of this tutorial, we will select and use the pipeline `classes.normalize_dfs_xgb_classifier` as our template. ```python3 pipeline_name = 'classes.normalize_dfs_xgb_classifier' ``` ## 3. 
Fitting the Pipeline diff --git a/greenguard/__init__.py b/greenguard/__init__.py index abea0c2..4374a6d 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -12,6 +12,7 @@ _BASE_PATH = os.path.abspath(os.path.dirname(__file__)) MLBLOCKS_PIPELINES = os.path.join(_BASE_PATH, 'pipelines') +MLBLOCKS_PRIMITIVES = os.path.join(_BASE_PATH, 'primitives') __all__ = (
diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py index ba68278..156c046 100644 --- a/greenguard/pipeline.py +++ b/greenguard/pipeline.py @@ -54,7 +54,7 @@ def __setstate__(self, state): Sequential.__setstate__ = __setstate__ -def get_pipelines(pattern='', path=False, unstacked=False): +def get_pipelines(pattern='', path=False, pipeline_type='classes'): """Get the list of available pipelines. Optionally filter the names using a pattern or obtain @@ -66,9 +66,9 @@ path (bool): Whether to return a dictionary containing the pipeline paths instead of only a list with the names. - unstacked (bool): - Whether to load the pipelines that expect the readings - to be already unstacked by signal_id. Defaults to ``False``. + pipeline_type (str): + The pipeline category to filter by (`classes`, `probability` and `unstacked`). + Defaults to `classes`. Return: list or dict: @@ -77,14 +77,13 @@ names as keys and their absolute paths as values. """ pipelines = dict() - pipelines_dir = PIPELINES_DIR - if unstacked: - pipelines_dir = os.path.join(pipelines_dir, 'unstacked') + pipelines_dir = os.path.join(PIPELINES_DIR, pipeline_type) for filename in os.listdir(pipelines_dir): if filename.endswith('.json') and pattern in filename: name = os.path.basename(filename)[:-len('.json')] - pipeline_path = os.path.join(PIPELINES_DIR, filename) + name = f'{pipeline_type}.{name}' + pipeline_path = os.path.join(pipelines_dir, filename) pipelines[name] = pipeline_path if not path:
diff --git a/greenguard/pipelines/normalize_dfs_xgb_classifier.json b/greenguard/pipelines/classes/normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/normalize_dfs_xgb_classifier.json rename to greenguard/pipelines/classes/normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/unstack_dfs_xgb_classifier.json b/greenguard/pipelines/classes/unstack_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/unstack_dfs_xgb_classifier.json rename to greenguard/pipelines/classes/unstack_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json b/greenguard/pipelines/classes/unstack_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/unstack_double_lstm_timeseries_classifier.json rename to greenguard/pipelines/classes/unstack_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/unstack_lstm_timeseries_classifier.json b/greenguard/pipelines/classes/unstack_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/unstack_lstm_timeseries_classifier.json rename to greenguard/pipelines/classes/unstack_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json b/greenguard/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json rename to greenguard/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json
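After the reorganization above, every pipeline name is namespaced by the folder it lives in, and the new ``pipeline_type`` argument simply selects that subdirectory. Usage would look roughly as follows (the listed names follow the tutorial output shown later in this patch; ordering may differ):

```python
from greenguard import get_pipelines

# Probability pipelines return the positive-class probability instead of a label.
get_pipelines(pipeline_type='probability', pattern='dfs')
# ['probability.unstack_dfs_xgb_classifier',
#  'probability.normalize_dfs_xgb_classifier',
#  'probability.unstack_normalize_dfs_xgb_classifier']

# With path=True a {name: absolute_path} dict is returned instead of a list.
get_pipelines(pattern='lstm', path=True)
```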
diff --git a/greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json b/greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json new file mode 100644 index 0000000..495a5d9 --- /dev/null +++ b/greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json @@ -0,0 +1,70 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.normalize_entity", + "featuretools.EntitySet.normalize_entity", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier:probabilities", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": true + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.normalize_entity#1": { + "base_entity_id": "readings", + "new_entity_id": "turbines", + "index": "turbine_id", + "make_time_index": false + }, + "featuretools.EntitySet.normalize_entity#2": { + "base_entity_id": "readings", + "new_entity_id": "signals", + "index": "signal_id", + "make_time_index": false + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json b/greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json new file mode 100644 index 0000000..aedbada --- /dev/null +++ b/greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json @@ -0,0 +1,83 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.add_relationship", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier:probabilities", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.entity_from_dataframe#2": { + "entity_id": "turbines", + "index": "turbine_id", + "make_index": false + }, + "featuretools.EntitySet.add_relationship#1": { + "parent": "turbines", + "parent_column": "turbine_id", + "child": "readings", + "child_column": "turbine_id" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": true, + "n_jobs": 1, + "training_window": "1d" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } 
+ }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#2": { + "dataframe": "turbines" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json b/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json new file mode 100644 index 0000000..46f05e1 --- /dev/null +++ b/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json @@ -0,0 +1,125 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.DoubleLSTMTimeSeriesClassifier", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { + "epochs": 35, + "verbose": false, + "classification": false + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json b/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json new file mode 100644 index 0000000..34760d5 --- 
/dev/null +++ b/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json @@ -0,0 +1,125 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.LSTMTimeSeriesClassifier", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.LSTMTimeSeriesClassifier#1": { + "epochs": 35, + "verbose": false, + "classification": false + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json b/greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json new file mode 100644 index 0000000..eddddd8 --- /dev/null +++ b/greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json @@ -0,0 +1,74 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.normalize_entity", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier:probabilities", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + 
"reset_index": true + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.normalize_entity#1": { + "base_entity_id": "readings", + "new_entity_id": "turbines", + "index": "turbine_id", + "make_time_index": false + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } +} diff --git a/greenguard/primitives/numpy.take.json b/greenguard/primitives/numpy.take.json new file mode 100644 index 0000000..ad5e7a3 --- /dev/null +++ b/greenguard/primitives/numpy.take.json @@ -0,0 +1,40 @@ +{ + "name": "numpy.take", + "contributors": [ + "Plamen Valentinov Kolev " + ], + "documentation": "https://docs.scipy.org/doc/numpy/reference/", + "description": "Take elements from an array along an axis.", + "classifiers": { + "type": "postprocessor" + }, + "modalities": [], + "primitive": "numpy.take", + "produce": { + "args": [ + { + "name": "y", + "keyword": "a", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "ndarray" + } + ] + }, + "hyperparameters": { + "fixed": { + "indices": { + "type": "int", + "default": 0 + }, + "axis": { + "type": "int", + "default": null + } + } + } +} diff --git a/greenguard/primitives/xgboost.XGBClassifier:probabilities.json b/greenguard/primitives/xgboost.XGBClassifier:probabilities.json new file mode 100644 index 0000000..8837381 --- /dev/null +++ b/greenguard/primitives/xgboost.XGBClassifier:probabilities.json @@ -0,0 +1,93 @@ +{ + "name": "xgboost.XGBClassifier", + "contributors": [ + "Carles Sala " + ], + "documentation": "https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier", + "description": "Implementation of the scikit-learn API for XGBoost classification.", + "classifiers": { + "type": "estimator", + "subtype": "classifier" + }, + "modalities": [], + "primitive": "xgboost.XGBClassifier", + "fit": { + "method": "fit", + "args": [ + { + "name": "X", + "type": "ndarray" + }, + { + "name": "y", + "type": "array" + } + ] + }, + "produce": { + "method": "predict_proba", + "args": [ + { + "name": "X", + "keyword": "data", + "type": "ndarray" + } + ], + "output": [ + { + "name": "y", + "type": "array" + } + ] + }, + "hyperparameters": { + "fixed": { + "n_jobs": { + "type": "int", + "default": -1 + } + }, + "tunable": { + "n_estimators": { + "type": "int", + "default": 100, + "range": [ + 10, + 1000 + ] + }, + "max_depth": { + "type": "int", + "default": 3, + "range": [ + 3, + 10 + ] + }, + "learning_rate": { + "type": "float", + "default": 0.1, + "range": [ + 0, + 1 + ] + }, + "gamma": { + "type": "float", + "default": 0, + "range": [ + 0, + 1 + ] + }, + "min_child_weight": { + "type": "int", + "default": 1, + "range": [ + 1, + 10 + ] + } + } + } +} diff --git a/setup.py b/setup.py index d3fcf1a..ebbde1c 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,8 @@ description='AutoML for 
Renewable Energy Industries.', entry_points={ 'mlblocks': [ - 'pipelines=greenguard:MLBLOCKS_PIPELINES' + 'pipelines=greenguard:MLBLOCKS_PIPELINES', + 'primitives=greenguard:MLBLOCKS_PRIMITIVES' ], }, extras_require={ diff --git a/tutorials/01_GreenGuard_Machine_Learning.ipynb b/tutorials/01_GreenGuard_Machine_Learning.ipynb index 7fab764..03a2aa0 100644 --- a/tutorials/01_GreenGuard_Machine_Learning.ipynb +++ b/tutorials/01_GreenGuard_Machine_Learning.ipynb @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -62,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -157,7 +157,7 @@ "4 T001 2013-01-16 0" ] }, - "execution_count": 15, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -168,7 +168,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -177,7 +177,7 @@ "(353, 3)" ] }, - "execution_count": 16, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -188,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -200,7 +200,7 @@ "dtype: object" ] }, - "execution_count": 17, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -211,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -290,7 +290,7 @@ "4 T001 S05 2013-01-10 273.0" ] }, - "execution_count": 18, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -301,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -310,7 +310,7 @@ "(1313540, 4)" ] }, - "execution_count": 19, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -321,7 +321,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -334,7 +334,7 @@ "dtype: object" ] }, - "execution_count": 20, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -383,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -408,20 +408,37 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['normalize_dfs_xgb_classifier',\n", - " 'unstack_normalize_dfs_xgb_classifier',\n", - " 'unstack_dfs_xgb_classifier',\n", - " 'unstack_lstm_timeseries_classifier',\n", - " 'unstack_double_lstm_timeseries_classifier']" + "['unstacked.unstacked_normalize_dfs_xgb_classifier',\n", + " 'unstacked.unstacked_double_lstm_timeseries_classifier',\n", + " 'unstacked.unstacked_lstm_timeseries_classifier',\n", + " 'unstacked.unstacked_dfs_xgb_classifier',\n", + " 'classes.unstack_dfs_xgb_classifier',\n", + " 'classes.unstack_double_lstm_timeseries_classifier',\n", + " 'classes.normalize_dfs_xgb_classifier',\n", + " 'classes.unstack_lstm_timeseries_classifier',\n", + " 'classes.unstack_normalize_dfs_xgb_classifier',\n", + " 'disabled.resample_normalize_dfs_xgb_classifier',\n", + " 'disabled.resample_unstack_lstm_timeseries_classifier',\n", + " 'disabled.resample_unstack_normalize_dfs_xgb_classifier',\n", + " 
'disabled.normalize_dfs_xgb_classifier',\n", + " 'disabled.resample_unstack_double_lstm_timeseries_classifier',\n", + " 'disabled.resample_dfs_xgb_classifier',\n", + " 'disabled.resample_unstack_dfs_xgb_classifier',\n", + " 'disabled.dfs_xgb_classifier',\n", + " 'probability.unstack_dfs_xgb_classifier',\n", + " 'probability.unstack_double_lstm_timeseries_classifier',\n", + " 'probability.normalize_dfs_xgb_classifier',\n", + " 'probability.unstack_lstm_timeseries_classifier',\n", + " 'probability.unstack_normalize_dfs_xgb_classifier']" ] }, - "execution_count": 22, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -441,18 +458,29 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['normalize_dfs_xgb_classifier',\n", - " 'unstack_normalize_dfs_xgb_classifier',\n", - " 'unstack_dfs_xgb_classifier']" + "['unstacked.unstacked_normalize_dfs_xgb_classifier',\n", + " 'unstacked.unstacked_dfs_xgb_classifier',\n", + " 'classes.unstack_dfs_xgb_classifier',\n", + " 'classes.normalize_dfs_xgb_classifier',\n", + " 'classes.unstack_normalize_dfs_xgb_classifier',\n", + " 'disabled.resample_normalize_dfs_xgb_classifier',\n", + " 'disabled.resample_unstack_normalize_dfs_xgb_classifier',\n", + " 'disabled.normalize_dfs_xgb_classifier',\n", + " 'disabled.resample_dfs_xgb_classifier',\n", + " 'disabled.resample_unstack_dfs_xgb_classifier',\n", + " 'disabled.dfs_xgb_classifier',\n", + " 'probability.unstack_dfs_xgb_classifier',\n", + " 'probability.normalize_dfs_xgb_classifier',\n", + " 'probability.unstack_normalize_dfs_xgb_classifier']" ] }, - "execution_count": 23, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -471,18 +499,29 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'normalize_dfs_xgb_classifier': '/home/usuario/Projects/GreenGuard/greenguard/pipelines/normalize_dfs_xgb_classifier.json',\n", - " 'unstack_normalize_dfs_xgb_classifier': '/home/usuario/Projects/GreenGuard/greenguard/pipelines/unstack_normalize_dfs_xgb_classifier.json',\n", - " 'unstack_dfs_xgb_classifier': '/home/usuario/Projects/GreenGuard/greenguard/pipelines/unstack_dfs_xgb_classifier.json'}" + "{'unstacked.unstacked_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json',\n", + " 'unstacked.unstacked_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/unstacked/unstacked_dfs_xgb_classifier.json',\n", + " 'classes.unstack_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/classes/unstack_dfs_xgb_classifier.json',\n", + " 'classes.normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/classes/normalize_dfs_xgb_classifier.json',\n", + " 'classes.unstack_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json',\n", + " 'disabled.resample_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json',\n", + " 'disabled.resample_unstack_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json',\n", + " 'disabled.normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/normalize_dfs_xgb_classifier.json',\n", + " 'disabled.resample_dfs_xgb_classifier': 
'/GreenGuard/greenguard/pipelines/disabled/resample_dfs_xgb_classifier.json',\n", + " 'disabled.resample_unstack_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json',\n", + " 'disabled.dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/dfs_xgb_classifier.json',\n", + " 'probability.unstack_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json',\n", + " 'probability.normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json',\n", + " 'probability.unstack_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json'}" ] }, - "execution_count": 24, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -513,13 +552,13 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "templates = [\n", - " 'unstack_normalize_dfs_xgb_classifier', \n", - " 'normalize_dfs_xgb_classifier'\n", + " 'classes.unstack_normalize_dfs_xgb_classifier', \n", + " 'classes.normalize_dfs_xgb_classifier'\n", "]" ] }, @@ -546,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -568,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -585,37 +624,16 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Obtaining default configuration for unstack_normalize_dfs_xgb_classifier\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 165 features\n", - "Elapsed: 00:34 | Progress: 100%|██████████\n", - "Elapsed: 00:18 | Progress: 100%|██████████\n", - "Built 165 features\n", - "Elapsed: 00:36 | Progress: 100%|██████████\n", - "Elapsed: 00:17 | Progress: 100%|██████████\n", - "Built 165 features\n", - "Elapsed: 00:38 | Progress: 100%|██████████\n", - "Elapsed: 00:17 | Progress: 100%|██████████\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + "INFO:btb.session:Obtaining default configuration for classes.unstack_normalize_dfs_xgb_classifier\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_normalize_dfs_xgb_classifier \n", + " Template: classes.unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", @@ -623,60 +641,29 @@ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.605187908496732\n", - "INFO:btb.session:Obtaining default configuration for normalize_dfs_xgb_classifier\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 99 features\n", - "Elapsed: 01:44 | Progress: 100%|██████████\n", - "Elapsed: 00:52 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:38 | Progress: 100%|██████████\n", - "Elapsed: 00:52 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:39 | Progress: 100%|██████████\n", - "Elapsed: 00:49 | Progress: 100%|██████████\n" - ] - 
}, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_normalize_dfs_xgb_classifier \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 20\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 234\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.6106037764640573\n", - "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n" + "INFO:btb.session:New optimal found: classes.unstack_normalize_dfs_xgb_classifier - 0.611234532127027\n", + "INFO:btb.session:Obtaining default configuration for classes.normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n" ] }, { "data": { "text/plain": [ - "{'id': '28d8ebbde404a0e501262a652c4d9aa5',\n", - " 'name': 'unstack_normalize_dfs_xgb_classifier',\n", + "{'id': 'afc8e912142bc6c384231600df9874fc',\n", + " 'name': 'classes.unstack_normalize_dfs_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 20,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 234,\n", + " 'max_labels'): 0,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.6106037764640573}" + " 'score': 0.611234532127027}" ] }, - "execution_count": 28, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -695,25 +682,25 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'id': '28d8ebbde404a0e501262a652c4d9aa5',\n", - " 'name': 'unstack_normalize_dfs_xgb_classifier',\n", + "{'id': 'afc8e912142bc6c384231600df9874fc',\n", + " 'name': 'classes.unstack_normalize_dfs_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 20,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 234,\n", + " 'max_labels'): 0,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 
'score': 0.6106037764640573}" + " 'score': 0.611234532127027}" ] }, - "execution_count": 29, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -731,22 +718,22 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 20,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 234,\n", + " 'max_labels'): 0,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.23028782510751677,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9403975339570728,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" ] }, - "execution_count": 30, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -764,16 +751,16 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'unstack_normalize_dfs_xgb_classifier'" + "'classes.unstack_normalize_dfs_xgb_classifier'" ] }, - "execution_count": 31, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -792,16 +779,16 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.6106037764640573" + "0.611234532127027" ] }, - "execution_count": 32, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -821,61 +808,51 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_normalize_dfs_xgb_classifier \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 80\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 32\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.11814847201162682\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9589332448610124\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6\n", - "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.640497737556561\n", - "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_normalize_dfs_xgb_classifier \n", + " Template: classes.unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", - " 
('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 98\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 34\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3652063328881058\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.8627183599656656\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6\n", - "INFO:btb.session:New optimal found: unstack_normalize_dfs_xgb_classifier - 0.6592605156037993\n", - "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for unstack_normalize_dfs_xgb_classifier\n" + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 364\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 7\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6635800510691365\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9852977392614163\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2\n", + "INFO:btb.session:New optimal found: classes.unstack_normalize_dfs_xgb_classifier - 0.6379648413546719\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n" ] }, { "data": { "text/plain": [ - "{'id': 'f6b410d303a1cfeafdcfe0dbcf330767',\n", - " 'name': 'unstack_normalize_dfs_xgb_classifier',\n", + "{'id': '7e6de03286fd71179e2a2f7b3f089ffb',\n", + " 'name': 'classes.unstack_normalize_dfs_xgb_classifier',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 98,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 34,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3652063328881058,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.8627183599656656,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6},\n", - " 'score': 0.6592605156037993}" + " 'max_labels'): 97,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 364,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 7,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6635800510691365,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9852977392614163,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2},\n", + " 'score': 0.6379648413546719}" ] }, - "execution_count": 33, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -886,16 +863,16 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 22, "metadata": {}, "outputs": [ { 
"data": { "text/plain": [ - "0.6592605156037993" + "0.6379648413546719" ] }, - "execution_count": 34, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -906,22 +883,22 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 98,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 34,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.3652063328881058,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.8627183599656656,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 6}" + " 'max_labels'): 97,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 364,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 7,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6635800510691365,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9852977392614163,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2}" ] }, - "execution_count": 35, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -945,18 +922,9 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 24, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 165 features\n", - "Elapsed: 00:39 | Progress: 100%|██████████\n" - ] - } - ], + "outputs": [], "source": [ "pipeline.fit(train, readings)" ] @@ -972,17 +940,9 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 25, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elapsed: 00:14 | Progress: 100%|██████████\n" - ] - } - ], + "outputs": [], "source": [ "predictions = pipeline.predict(test, readings)" ] @@ -996,16 +956,16 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.693877551020408" + "0.7346938775510203" ] }, - "execution_count": 38, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1034,7 +994,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -1053,7 +1013,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -1069,23 +1029,16 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 29, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Elapsed: 00:14 | Progress: 100%|██████████\n" - ] - }, { "data": { "text/plain": [ "array([0, 0, 0, 1, 0])" ] }, - "execution_count": 41, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1112,7 +1065,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.9" + "version": "3.7.9" } }, "nbformat": 4, From 5d9da875e87dad85038dc904e018cc797966ff14 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Wed, 9 Dec 2020 22:44:17 +0100 Subject: [PATCH 129/171] Change loss to binary --- .../unstack_double_lstm_timeseries_classifier.json | 3 ++- .../probability/unstack_lstm_timeseries_classifier.json | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json b/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json index 
46f05e1..ea48a87 100644 --- a/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json +++ b/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json @@ -58,7 +58,8 @@ "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { "epochs": 35, "verbose": false, - "classification": false + "classification": false, + "loss": "keras.losses.binary_crossentropy" }, "numpy.take#1": { "indices": 1, diff --git a/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json b/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json index 34760d5..9272257 100644 --- a/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json +++ b/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json @@ -55,10 +55,11 @@ "cutoff_time": "cutoff_time", "time_index": "timestamp" }, - "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { + "keras.Sequential.LSTMTimeSeriesClassifier#1": { "epochs": 35, "verbose": false, - "classification": false + "classification": false, + "loss": "keras.losses.binary_crossentropy" }, "numpy.take#1": { "indices": 1, From d644d1e84059fa1853d5bbe90905667c0032f300 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev <41479552+pvk-developer@users.noreply.github.com> Date: Fri, 22 Jan 2021 11:20:52 +0100 Subject: [PATCH 130/171] Benchmark Upgrade (#57) * Fix bug: set default instead of get * Fix dependencies * add tuning metric * Add additional arguments to the command line. * Rename metrics * Fix test * Fix dependency requirement * add parse type float * TPR and Threshold * Add partd and fsspec dependencies --- greenguard/benchmark.py | 260 ++++++++++++++++++++++++++++++++-------- greenguard/metrics.py | 37 +++++- greenguard/pipeline.py | 1 + greenguard/results.py | 15 +-- setup.py | 12 +- tests/test_benchmark.py | 49 ++++---- 6 files changed, 288 insertions(+), 86 deletions(-) diff --git a/greenguard/benchmark.py b/greenguard/benchmark.py index 89d1076..f638138 100644 --- a/greenguard/benchmark.py +++ b/greenguard/benchmark.py @@ -1,5 +1,6 @@ import argparse import logging +import multiprocessing as mp import os import pickle import re @@ -16,12 +17,14 @@ from greenguard import get_pipelines from greenguard.demo import load_demo from greenguard.loaders import CSVLoader -from greenguard.metrics import METRICS +from greenguard.metrics import (METRICS, accuracy_score, f1_score, + fpr_score, tpr_score, threshold_score) from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing from greenguard.results import load_results, write_results LOGGER = logging.getLogger(__name__) +DEFAULT_TUNING_METRIC_KWARGS = {'threshold': 0.5} LEADERBOARD_COLUMNS = [ 'problem_name', 'window_size', @@ -31,7 +34,8 @@ 'default_cv', 'tuned_cv', 'tuned_test', - 'metric', + 'tuning_metric', + 'tuning_metric_kwargs', 'fit_predict_time', 'default_cv_time', 'average_cv_time', @@ -40,6 +44,25 @@ ] +def _scorer(metric, metric_args): + if isinstance(metric, str): + metric, cost = METRICS[metric] + + def f(expected, observed): + try: + return metric(expected, observed, **metric_args) + except TypeError: + if 'threshold' not in metric_args: + raise + + kwargs = metric_args.copy() + threshold = kwargs.pop('threshold') + observed = observed >= threshold + return metric(expected, observed, **kwargs) + + return f + + def _build_init_params(template, window_size, rule, template_params): if 'dfs' in template: window_size_rule_params = { @@ -61,15 +84,31 @@ def _build_init_params(template, 
window_size, rule, template_params):
     }
 
     for primitive, params in window_size_rule_params.items():
-        primitive_params = template_params.get(primitive, {})
+        primitive_params = template_params.setdefault(primitive, {})
         primitive_params.update(params)
 
     return template_params
 
 
-def evaluate_template(template, target_times, readings, metric='f1', tuning_iterations=50,
-                      preprocessing=0, init_params=None, cost=False, test_size=0.25,
-                      cv_splits=3, random_state=0, cache_path=None):
+def evaluate_template(
+    template,
+    target_times,
+    readings,
+    tuning_iterations=50,
+    init_params=None,
+    preprocessing=0,
+    metrics=None,
+    threshold=None,
+    tpr=None,
+    tuning_metric='roc_auc_score',
+    tuning_metric_kwargs=DEFAULT_TUNING_METRIC_KWARGS,
+    cost=False,
+    cv_splits=3,
+    test_size=0.25,
+    random_state=0,
+    cache_path=None,
+    scores=None
+):
     """Returns the scores for a given template.
 
     Args:
@@ -119,18 +158,15 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
         Stores the four types of scores that are being evaluated.
     """
     start_time = datetime.utcnow()
-
-    scores = dict()
-    scores['metric'] = metric
+    scores = dict() if scores is None else scores
+    scores['tuning_metric'] = str(tuning_metric)
+    scores['tuning_metric_kwargs'] = tuning_metric_kwargs
+    tuning_metric = _scorer(tuning_metric, tuning_metric_kwargs)
 
     train, test = train_test_split(target_times, test_size=test_size, random_state=random_state)
 
-    if isinstance(metric, str):
-        metric, cost = METRICS[metric]
-
     pipeline = GreenGuardPipeline(
         template,
-        metric,
+        metric=tuning_metric,
         cost=cost,
         cv_splits=cv_splits,
         init_params=init_params,
@@ -144,7 +180,7 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
     predictions = pipeline.predict(test, readings)
     fit_predict_time = datetime.utcnow() - fit_predict_time
 
-    scores['default_test'] = metric(test['target'], predictions)
+    scores['default_test'] = tuning_metric(test['target'], predictions)
 
     # Computing the default cross validation score
     default_cv_time = datetime.utcnow()
@@ -157,28 +193,90 @@ def evaluate_template(template, target_times, readings, metric='f1', tuning_iter
     # Computing the cross validation score with tuned hyperparameters
     average_cv_time = datetime.utcnow()
     session.run(tuning_iterations)
-    average_cv_time = (average_cv_time - datetime.utcnow()) / tuning_iterations
+    average_cv_time = (datetime.utcnow() - average_cv_time) / tuning_iterations
 
     scores['tuned_cv'] = pipeline.cv_score
 
     # Computing the test score with tuned hyperparameters
     pipeline.fit(train, readings)
     predictions = pipeline.predict(test, readings)
+    ground_truth = test['target']
+
+    # Computing the additional threshold and TPR based metrics
+    if tpr:
+        tpr = tpr if isinstance(tpr, list) else [tpr]
+        for value in tpr:
+            threshold = threshold_score(ground_truth, predictions, value)
+            scores[f'fpr_tpr/{value}'] = fpr_score(ground_truth, predictions, tpr=value)
+            predictions_classes = predictions >= threshold
+            scores[f'accuracy_tpr/{value}'] = accuracy_score(ground_truth, predictions_classes)
+            scores[f'f1_tpr/{value}'] = f1_score(ground_truth, predictions_classes)
+            scores[f'threshold_tpr/{value}'] = threshold
+
+            if f'accuracy_tpr/{value}' not in LEADERBOARD_COLUMNS:
+                LEADERBOARD_COLUMNS.extend([
+                    f'accuracy_tpr/{value}',
+                    f'f1_tpr/{value}',
+                    f'fpr_tpr/{value}',
+                    f'threshold_tpr/{value}',
+                ])
+
+    else:
+        threshold = 0.5 if threshold is None else threshold
+        threshold = threshold if isinstance(threshold, list) else [threshold]
+
+        for value in threshold:
+            scores[f'fpr_threshold/{value}'] = fpr_score(
+                ground_truth, predictions, threshold=value)
+
+            predictions_classes = predictions >= value
+            scores[f'accuracy_threshold/{value}'] = accuracy_score(
+                ground_truth, predictions_classes)
+
+            scores[f'f1_threshold/{value}'] = f1_score(ground_truth, predictions_classes)
+            scores[f'tpr_threshold/{value}'] = tpr_score(ground_truth, predictions, value)
+
+            if f'accuracy_threshold/{value}' not in LEADERBOARD_COLUMNS:
+                LEADERBOARD_COLUMNS.extend([
+                    f'accuracy_threshold/{value}',
+                    f'f1_threshold/{value}',
+                    f'fpr_threshold/{value}',
+                    f'tpr_threshold/{value}',
+                ])
 
-    scores['tuned_test'] = metric(test['target'], predictions)
+    scores['tuned_test'] = tuning_metric(test['target'], predictions)
     scores['fit_predict_time'] = fit_predict_time
     scores['default_cv_time'] = default_cv_time
-    scores['default_cv_time'] = default_cv_time
     scores['average_cv_time'] = average_cv_time
     scores['total_time'] = datetime.utcnow() - start_time
 
     return scores
 
 
-def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iterations=50,
-                       init_params=None, target_times=None, readings=None, preprocessing=0,
-                       cost=False, test_size=0.25, cv_splits=3, random_state=0, cache_path=None,
-                       cache_results=None, problem_name=None, output_path=None, progress_bar=None):
+def evaluate_templates(
+    templates,
+    window_size_rule,
+    tuning_iterations=50,
+    init_params=None,
+    preprocessing=0,
+    metrics=None,
+    threshold=None,
+    tpr=None,
+    tuning_metric='roc_auc_score',
+    tuning_metric_kwargs=DEFAULT_TUNING_METRIC_KWARGS,
+    target_times=None,
+    readings=None,
+    cost=False,
+    test_size=0.25,
+    cv_splits=3,
+    random_state=0,
+    cache_path=None,
+    cache_results=None,
+    problem_name=None,
+    output_path=None,
+    progress_bar=None,
+    multiprocess=False
+):
     """Execute the benchmark process and optionally store the result as a ``CSV``.
 
     Args:
@@ -272,11 +370,6 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
 
     for template, window_rule in product(templates, window_size_rule):
         window_size, rule = window_rule
-        scores = dict()
-        scores['problem_name'] = problem_name
-        scores['template'] = template
-        scores['window_size'] = window_size
-        scores['resample_rule'] = rule
 
         try:
             LOGGER.info('Evaluating template %s on problem %s (%s, %s)',
@@ -285,24 +378,66 @@ def evaluate_templates(templates, window_size_rule, metric='f1', tuning_iteratio
             template_params = init_params[template]
             template_params = _build_init_params(template, window_size, rule, template_params)
             template_preprocessing = preprocessing[template]
+            if multiprocess:
+                manager = mp.Manager()
+                scores = manager.dict()
+                process = mp.Process(
+                    target=evaluate_template,
+                    args=(
+                        template,
+                        target_times,
+                        readings,
+                        tuning_iterations,
+                        init_params,
+                        preprocessing,
+                        metrics,
+                        threshold,
+                        tpr,
+                        tuning_metric,
+                        tuning_metric_kwargs,
+                        cost,
+                        cv_splits,
+                        test_size,
+                        random_state,
+                        cache_path,
+                        scores
+                    )
+                )
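+
+                # Running evaluate_template in a child process keeps a hard
+                # crash inside a pipeline primitive from taking down the whole
+                # benchmark: if the child dies before reporting, 'tuned_test'
+                # never reaches the shared dict and the run is marked ERRORED.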
+ + else: + scores = dict() + scores['problem_name'] = problem_name + scores['template'] = template + scores['window_size'] = window_size + scores['resample_rule'] = rule + result = evaluate_template( + template=template, + target_times=target_times, + readings=readings, + metrics=metrics, + tuning_metric=tuning_metric, + tuning_metric_kwargs=tuning_metric_kwargs, + threshold=threshold, + tpr=tpr, + tuning_iterations=tuning_iterations, + preprocessing=template_preprocessing, + init_params=template_params, + cost=cost, + test_size=test_size, + cv_splits=cv_splits, + random_state=random_state, + cache_path=cache_path + ) - scores.update(result) - scores['status'] = 'OK' + scores.update(result) + scores['status'] = 'OK' except Exception: scores['status'] = 'ERRORED' @@ -420,8 +555,10 @@ def make_problems(target_times_paths, readings_path, window_size_resample_rule, def run_benchmark(templates, problems, window_size_resample_rule=None, tuning_iterations=50, signals=None, preprocessing=0, init_params=None, - metric='f1', cost=False, cv_splits=5, test_size=0.33, random_state=0, - cache_path=None, cache_results=None, output_path=None): + metrics=None, threshold=None, tpr=None, tuning_metric='roc_auc_score', + tuning_metric_kwargs=DEFAULT_TUNING_METRIC_KWARGS, cost=False, cv_splits=5, + test_size=0.33, random_state=0, cache_path=None, cache_results=None, + output_path=None, multiprocess=False): """Execute the benchmark function and optionally store the result as a ``CSV``. This function provides a user-friendly interface to interact with the ``evaluate_templates`` @@ -551,8 +688,10 @@ def run_benchmark(templates, problems, window_size_resample_rule=None, df = evaluate_templates( templates, [(window_size, resample_rule)], - metric=metric, + metrics=metrics, tuning_iterations=tuning_iterations, + threshold=threshold, + tpr=tpr, init_params=init_params, target_times=target_times, readings=readings, @@ -565,7 +704,8 @@ def run_benchmark(templates, problems, window_size_resample_rule=None, cache_results=cache_results, problem_name=problem_name, output_path=None, - progress_bar=pbar + progress_bar=pbar, + multiprocess=multiprocess, ) results.append(df) @@ -618,13 +758,23 @@ def _run(args): for item in args.window_size_resample_rule ] + if args.tuning_metric_kwargs: + args.tuning_metric_kwargs = json.loads(args.tuning_metric_kwargs) + + else: + args.tuning_metric_kwargs = DEFAULT_TUNING_METRIC_KWARGS + # run results = run_benchmark( templates=args.templates, problems=args.problems, window_size_resample_rule=window_size_resample_rule, cv_splits=args.cv_splits, - metric=args.metric, + metrics=args.metrics, + threshold=args.threshold, + tpr=args.tpr, + tuning_metric=args.tuning_metric, + tuning_metric_kwargs=args.tuning_metric_kwargs, test_size=args.test_size, random_state=args.random_state, cache_path=args.cache_path, @@ -632,6 +782,7 @@ def _run(args): tuning_iterations=args.iterations, output_path=args.output_path, signals=args.signals, + multiprocess=args.multiprocess ) if not args.output_path: @@ -699,8 +850,12 @@ def _get_parser(): help='Output path where to store the results.') run.add_argument('-s', '--cv-splits', type=int, default=5, help='Amount of cross validation splits to use.') - run.add_argument('-m', '--metric', type=str, default='f1', - help='Name of metric function to be used during benchmarking.') + run.add_argument('-m', '--metrics', nargs='+', + help='Names of metric functions to be used for the benchmarking.') + run.add_argument('-T', '--threshold', nargs='+', type=float, + 
help='Threshold values for the metrics.')
+    run.add_argument('-P', '--tpr', nargs='+', type=float,
+                     help='TPR values for the metrics; if provided, threshold will be ignored.')
     run.add_argument('-n', '--random-state', type=int, default=0,
                      help='Random state for the cv splits.')
     run.add_argument('-e', '--test-size', type=float, default=0.33,
@@ -713,6 +868,13 @@ def _get_parser():
                      help='Number of iterations to perform per challenge with each candidate.')
     run.add_argument('-S', '--signals', type=str,
                      help='Path to csv file that has signal_id column to use as the signal')
+    run.add_argument('-k', '--tuning-metric', type=str, default='roc_auc_score',
+                     help='Tuning metric to be used.')
+    run.add_argument('-K', '--tuning-metric-kwargs', type=str,
+                     help='Tuning metric kwargs to be used with the metric, as a JSON string.')
+    run.add_argument('-u', '--multiprocess', action='store_true',
+                     help='Whether or not to spawn a separate process and avoid crashing.')
+
 
     # Summarize action
     summary = action.add_parser('summarize-results',
diff --git a/greenguard/metrics.py b/greenguard/metrics.py
index ef1c249..6f50d32 100644
--- a/greenguard/metrics.py
+++ b/greenguard/metrics.py
@@ -2,8 +2,8 @@
 import logging
 
 import numpy as np
-from sklearn.metrics import (
-    accuracy_score, f1_score, mean_absolute_error, mean_squared_error, roc_curve, r2_score)
+from sklearn.metrics import (accuracy_score, f1_score, mean_absolute_error,
+                             mean_squared_error, roc_curve, roc_auc_score, r2_score)
 
 LOGGER = logging.getLogger(__name__)
 
@@ -12,7 +12,29 @@ def f1_macro(exp, obs):
     return f1_score(exp, obs, average='macro')
 
 
-def fpr_score(ground_truth, probabilities, tpr=1):
+def threshold_score(ground_truth, probabilities, tpr):
+    roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1)
+    try:
+        index = np.where(roc_tpr >= tpr)[0][0]
+    except IndexError:
+        LOGGER.warning('Could not find a threshold that satisfies the requested True Positive Rate')
+        index = -1
+
+    return roc_threshold[index]
+
+
+def tpr_score(ground_truth, probabilities, threshold):
+    roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1)
+    try:
+        index = np.where(roc_threshold >= threshold)[0][0]
+    except IndexError:
+        LOGGER.warning('Could not find a TPR that satisfies the requested threshold')
+        index = -1
+
+    return roc_tpr[index]
+
+
+def fpr_score(ground_truth, probabilities, tpr=None, threshold=None):
     """Compute the False Positive Rate associated with the given True Positive Rate.
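+
+    Example (illustrative): with the sample data from scikit-learn's
+    ``roc_curve`` documentation, requiring a perfect TPR of 1 forces a
+    false positive rate of 0.5:
+
+        >>> fpr_score([0, 0, 1, 1], [0.1, 0.4, 0.35, 0.8], tpr=1)
+        0.5
+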
     This metric computes the False Positive Rate that needs to be assumed in order
@@ -36,7 +58,11 @@ def fpr_score(ground_truth, probabilities, tpr=1):
     """
     roc_fpr, roc_tpr, roc_threshold = roc_curve(ground_truth, probabilities, pos_label=1)
     try:
-        index = np.where(roc_tpr >= tpr)[0][0]
+        if tpr is not None:
+            index = np.where(roc_tpr >= tpr)[0][0]
+        elif threshold is not None:
+            index = np.where(roc_threshold >= threshold)[0][0]
+
     except:
         LOGGER.warn('Could not find a threshold that satisfies the requested True Positive Rate')
         index = -1
@@ -51,5 +77,6 @@
     'r2': (r2_score, False),
     'mse': (mean_squared_error, True),
     'mae': (mean_absolute_error, True),
-    'fpr_score': (fpr_score, False)
+    'fpr': (fpr_score, False),
+    'roc_auc_score': (roc_auc_score, False)
 }
diff --git a/greenguard/pipeline.py b/greenguard/pipeline.py
index 156c046..2a9cd84 100644
--- a/greenguard/pipeline.py
+++ b/greenguard/pipeline.py
@@ -403,6 +403,7 @@ def _generate_splits(self, template_name, target_times, readings, turbines=None)
         if self._cache_path:
             split_name = '{}_{}.pkl'.format(template_name, fold)
             split_path = os.path.join(self._cache_path, split_name)
+            os.makedirs(os.path.dirname(split_path), exist_ok=True)
 
             with open(split_path, 'wb') as split_file:
                 pickle.dump(split, split_file)
diff --git a/greenguard/results.py b/greenguard/results.py
index 869c26d..bbe4165 100644
--- a/greenguard/results.py
+++ b/greenguard/results.py
@@ -1,4 +1,5 @@
 import os
+from random import random
 
 import pandas as pd
 
@@ -7,15 +8,15 @@ def load_results(files):
     problems_results = dict()
     for filename in files:
         problem = os.path.basename(filename).replace('.csv', '')
-        problems_results[problem] = pd.read_csv(filename, index_col=0).round(6)
+        problems_results[problem] = pd.read_csv(filename).round(6)
 
     return problems_results
 
 
 def get_wins_by_problems(results):
-    df = results.groupby('problem_name')['template', 'window_size', 'resample_rule', 'tuned_test']
+    df = results.groupby('problem_name')[['template', 'window_size', 'resample_rule', 'fpr_threshold/0.5']]
     df = df.apply(max)
-    df = df.rename(columns={'tuned_test': 'score'})
+    df = df.rename(columns={'fpr_threshold/0.5': 'score'})
 
     return df
 
@@ -25,8 +26,8 @@ def get_exclusive_wins(scores, column, pivot_columns=['window_size', 'resample_r
     for problem in scores.problem_name.unique():
         df = scores[scores['problem_name'] == problem]
         df['wr'] = df.apply(
-            lambda row: '{}_{}'.format(row[pivot_columns[0]], row[pivot_columns[1]]), axis=1)
-        df = df.pivot(index='wr', columns=column, values='tuned_test')
+            lambda row: '{}_{}_{}'.format(row[pivot_columns[0]], row[pivot_columns[1]], random()), axis=1)
+        df = df.pivot(index='wr', columns=column, values='fpr_threshold/0.5')
 
         is_winner = df.T.rank(method='min', ascending=False) == 1
         num_winners = is_winner.sum()
@@ -93,9 +94,9 @@ def write_results(results, output):
     if isinstance(results, dict):
         results = pd.concat(list(results.values()), ignore_index=True)
 
-    window = get_exclusive_wins(results, 'window_size', ['window_size', 'tuned_test'])
+    window = get_exclusive_wins(results, 'window_size', ['window_size', 'fpr_threshold/0.5'])
 
-    resample_pivots = ['resample_rule', ['problem_name', 'tuned_test']]
+    resample_pivots = ['resample_rule', ['problem_name', 'fpr_threshold/0.5']]
     resample = get_exclusive_wins(results, 'resample_rule', resample_pivots)
 
     summary = {
diff --git a/setup.py b/setup.py
index ebbde1c..cefe9da 100644
--- a/setup.py
+++ b/setup.py
@@ -16,9 +16,9 @@
 history = ''
 
 install_requires = [
-    'baytune>=0.3.13.dev0,<0.4',
-    
'mlblocks>=0.3.4,<0.4', - 'mlprimitives>=0.2.6.dev0,<0.3', + 'baytune>=0.4.0,<0.5', + 'mlprimitives>=0.3.0,<0.4', + 'mlblocks>=0.4.0,<0.5', 'pymongo>=3.7.2,<4', 'scikit-learn>=0.21', 'tqdm<4.50.0,>=4.36.1', @@ -26,12 +26,14 @@ 'scipy>=1.0.1,<2', 'numpy<1.19.0,>=1.16.0', 'pandas>=1,<2', + 'partd>=1.1.0,<2', + 'fsspec>=0.8.5,<0.9', 'dask>=2.6.0,<3', + 'distributed>=2.6.0,<3', + 'h5py<2.11.0,>=2.10.0', # fix tensorflow requirement 'Keras>=2.4', 'tabulate>=0.8.3,<0.9', 'xlsxwriter>=1.3.6<1.4', - 'boto3==1.14.44', - 'botocore==1.17.44', ] setup_requires = [ diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index a1a2d6f..ac87cc2 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,5 +1,5 @@ """Tests for `greenguard.benchmark` module.""" -from sklearn.metrics import f1_score +import numpy as np from greenguard.benchmark import evaluate_templates from greenguard.demo import load_demo @@ -8,7 +8,7 @@ def test_predict(): # setup templates = [ - 'unstack_lstm_timeseries_classifier' + 'probability.unstack_lstm_timeseries_classifier' ] window_size_rule = [ @@ -16,7 +16,7 @@ def test_predict(): ] target_times, readings = load_demo() - target_times = target_times.head(10) + target_times = target_times.head(40) readings = readings.head(100) # run @@ -25,7 +25,6 @@ def test_predict(): readings=readings, templates=templates, window_size_rule=window_size_rule, - metric=f1_score, tuning_iterations=1, cv_splits=2 ) @@ -40,29 +39,39 @@ def test_predict(): 'default_cv', 'tuned_cv', 'tuned_test', - 'metric', + 'tuning_metric', + 'tuning_metric_kwargs', 'fit_predict_time', 'default_cv_time', 'average_cv_time', 'total_time', - 'status' + 'status', + 'accuracy_threshold/0.5', + 'f1_threshold/0.5', + 'fpr_threshold/0.5', + 'tpr_threshold/0.5', ] expected_dtypes = [ - 'object', - 'object', - 'object', - 'object', - 'float64', - 'float64', - 'float64', - 'float64', - 'float64', - 'float64', - 'float64', - 'float64', - 'float64', - 'object', + np.dtype('O'), + np.dtype('O'), + np.dtype('O'), + np.dtype('O'), + np.dtype('float64'), + np.dtype('float64'), + np.dtype('float64'), + np.dtype('float64'), + np.dtype('O'), + np.dtype('O'), + np.dtype(' Date: Fri, 22 Jan 2021 15:20:36 +0100 Subject: [PATCH 131/171] Update notebooks with the new pipelines. 
--- tutorials/03_Benchmarking.ipynb | 312 ++++--- .../normalize_dfs_xgb_classifier.ipynb | 760 +++++++++--------- ...ck_double_lstm_timeseries_classifier.ipynb | 26 +- .../unstack_lstm_timeseries_classifier.ipynb | 26 +- ...unstack_normalize_dfs_xgb_classifier.ipynb | 676 ++++++++-------- 5 files changed, 896 insertions(+), 904 deletions(-) diff --git a/tutorials/03_Benchmarking.ipynb b/tutorials/03_Benchmarking.ipynb index ee765a5..56e8701 100644 --- a/tutorials/03_Benchmarking.ipynb +++ b/tutorials/03_Benchmarking.ipynb @@ -75,8 +75,11 @@ "metadata": {}, "outputs": [], "source": [ - "templates = ['unstack_lstm_timeseries_classifier', 'normalize_dfs_xgb_classifier']\n", - "window_size_rule = [('1d', '1h'), ('3d', '4h')]\n", + "templates = [\n", + " 'probability.unstack_lstm_timeseries_classifier',\n", + " 'probability.normalize_dfs_xgb_classifier'\n", + "]\n", + "window_size_rule = [('1d', '1h'), ('2d', '2h')]\n", "init_params = {\n", " 'unstack_lstm_timeseries_classifier': {\n", " 'keras.Sequential.LSTMTimeSeriesClassifier#1': {\n", @@ -97,124 +100,90 @@ "name": "stderr", "output_type": "stream", "text": [ - "Using TensorFlow backend.\n", + "INFO:greenguard.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (1d, 1h)\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_lstm_timeseries_classifier \n", + " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_lstm_timeseries_classifier \n", + " Template: probability.unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 287\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.565737233372491\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 145\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: probability.unstack_lstm_timeseries_classifier \n", + " Hyperparameters: \n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 269\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.5973752345055594\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 412\n", + "INFO:greenguard.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (2d, 2h)\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: unstack_lstm_timeseries_classifier \n", + " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): 
median\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 353\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.4718077136146996\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 151\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 99 features\n", - "Elapsed: 02:58 | Progress: 100%|██████████\n", - "Elapsed: 00:58 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:54 | Progress: 100%|██████████\n", - "Elapsed: 01:08 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 02:20 | Progress: 100%|██████████\n", - "Elapsed: 01:09 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 02:16 | Progress: 100%|██████████\n", - "Elapsed: 01:07 | Progress: 100%|██████████\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 114\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.016427744327526084\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 224\n", + "INFO:greenguard.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (1d, 1h)\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: normalize_dfs_xgb_classifier \n", + " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 100\n", + " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.1\n", + " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.0\n", + " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 1\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: normalize_dfs_xgb_classifier \n", + " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 18\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 920\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.02731362750079913\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.46258174821600884\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 3\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 99 features\n", - "Elapsed: 03:18 | Progress: 100%|██████████\n", - "Elapsed: 01:03 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 03:15 | Progress: 100%|██████████\n", - "Elapsed: 01:06 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 02:05 | Progress: 100%|██████████\n", - "Elapsed: 01:10 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:51 | Progress: 100%|██████████\n", - "Elapsed: 00:54 | Progress: 100%|██████████\n", - "Built 99 features\n", - "Elapsed: 01:51 | Progress: 100%|██████████\n", - "Elapsed: 00:58 | Progress: 100%|██████████\n" - ] - }, - { - "name": "stderr", - 
"output_type": "stream", - "text": [ + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 11\n", + " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 231\n", + " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 9\n", + " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.554989010368875\n", + " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.909957492053926\n", + " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 7\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: normalize_dfs_xgb_classifier \n", + " Template: probability.normalize_dfs_xgb_classifier \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 61\n", + " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 122\n", + " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 5\n", + " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.6840927016151666\n", + " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.5480298094360865\n", + " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 6\n", + "INFO:greenguard.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (2d, 2h)\n", + "INFO:greenguard.pipeline:New configuration found:\n", + " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", + " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 100\n", + " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 3\n", + " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.1\n", + " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.0\n", + " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 1\n", "INFO:greenguard.pipeline:New configuration found:\n", - " Template: normalize_dfs_xgb_classifier \n", + " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 7\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 348\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.5272082810065426\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.04014402178038856\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Built 99 features\n", - "Elapsed: 02:42 | Progress: 100%|██████████\n", - "Elapsed: 01:00 | Progress: 100%|██████████\n" + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 99\n", + " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 616\n", + " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 8\n", + " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.0700166745838724\n", + " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.40990340522001234\n", + " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 10\n" ] } ], @@ -256,77 +225,156 @@ " \n", " \n", " \n", - " template\n", + " problem_name\n", " window_size\n", " resample_rule\n", + " template\n", " default_test\n", " default_cv\n", " 
tuned_cv\n", " tuned_test\n", + " tuning_metric\n", + " tuning_metric_kwargs\n", + " fit_predict_time\n", + " default_cv_time\n", + " average_cv_time\n", + " total_time\n", " status\n", + " accuracy_threshold/0.5\n", + " f1_threshold/0.5\n", + " fpr_threshold/0.5\n", + " tpr_threshold/0.5\n", " \n", " \n", " \n", " \n", " 0\n", - " unstack_lstm_timeseries_classifier\n", + " None\n", " 1d\n", " 1h\n", - " 0.711864\n", - " 0.646437\n", - " 0.646437\n", - " 0.666667\n", + " probability.unstack_lstm_timeseries_classifier\n", + " 0.350122\n", + " 0.538316\n", + " 0.618558\n", + " 0.463675\n", + " roc_auc_score\n", + " {'threshold': 0.5}\n", + " 0 days 00:00:04.250012\n", + " 0 days 00:00:14.374875\n", + " 0 days 00:00:15.360015\n", + " 0 days 00:01:10.806375\n", " OK\n", + " 0.640449\n", + " 0.058824\n", + " 1.0\n", + " 0.0\n", " \n", " \n", " 1\n", - " unstack_lstm_timeseries_classifier\n", - " 3d\n", - " 4h\n", - " 0.703704\n", - " 0.577295\n", - " 0.616052\n", - " 0.709677\n", + " None\n", + " 2d\n", + " 2h\n", + " probability.unstack_lstm_timeseries_classifier\n", + " 0.686203\n", + " 0.491949\n", + " 0.556803\n", + " 0.510989\n", + " roc_auc_score\n", + " {'threshold': 0.5}\n", + " 0 days 00:00:04.410682\n", + " 0 days 00:00:14.411205\n", + " 0 days 00:00:10.633619\n", + " 0 days 00:00:55.011304\n", " OK\n", + " 0.595506\n", + " 0.307692\n", + " 1.0\n", + " 0.0\n", " \n", " \n", " 2\n", - " normalize_dfs_xgb_classifier\n", + " None\n", " 1d\n", " 1h\n", - " 0.581818\n", - " 0.619698\n", - " 0.646750\n", - " 0.631579\n", + " probability.normalize_dfs_xgb_classifier\n", + " 0.697802\n", + " 0.669508\n", + " 0.701792\n", + " 0.766789\n", + " roc_auc_score\n", + " {'threshold': 0.5}\n", + " 0 days 00:01:11.416859\n", + " 0 days 00:02:55.012078\n", + " 0 days 00:00:00.806430\n", + " 0 days 00:05:20.653100\n", " OK\n", + " 0.797753\n", + " 0.666667\n", + " 1.0\n", + " 0.0\n", " \n", " \n", " 3\n", - " normalize_dfs_xgb_classifier\n", - " 3d\n", - " 4h\n", - " 0.581818\n", - " 0.619698\n", - " 0.637957\n", - " 0.642857\n", + " None\n", + " 2d\n", + " 2h\n", + " probability.normalize_dfs_xgb_classifier\n", + " 0.720391\n", + " 0.718617\n", + " 0.740664\n", + " 0.782662\n", + " roc_auc_score\n", + " {'threshold': 0.5}\n", + " 0 days 00:01:03.612676\n", + " 0 days 00:02:26.925796\n", + " 0 days 00:00:00.755424\n", + " 0 days 00:04:37.570182\n", " OK\n", + " 0.820225\n", + " 0.692308\n", + " 1.0\n", + " 0.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " template window_size resample_rule default_test \\\n", - "0 unstack_lstm_timeseries_classifier 1d 1h 0.711864 \n", - "1 unstack_lstm_timeseries_classifier 3d 4h 0.703704 \n", - "2 normalize_dfs_xgb_classifier 1d 1h 0.581818 \n", - "3 normalize_dfs_xgb_classifier 3d 4h 0.581818 \n", + " problem_name window_size resample_rule \\\n", + "0 None 1d 1h \n", + "1 None 2d 2h \n", + "2 None 1d 1h \n", + "3 None 2d 2h \n", + "\n", + " template default_test default_cv \\\n", + "0 probability.unstack_lstm_timeseries_classifier 0.350122 0.538316 \n", + "1 probability.unstack_lstm_timeseries_classifier 0.686203 0.491949 \n", + "2 probability.normalize_dfs_xgb_classifier 0.697802 0.669508 \n", + "3 probability.normalize_dfs_xgb_classifier 0.720391 0.718617 \n", + "\n", + " tuned_cv tuned_test tuning_metric tuning_metric_kwargs \\\n", + "0 0.618558 0.463675 roc_auc_score {'threshold': 0.5} \n", + "1 0.556803 0.510989 roc_auc_score {'threshold': 0.5} \n", + "2 0.701792 0.766789 roc_auc_score {'threshold': 0.5} \n", + "3 0.740664 0.782662 roc_auc_score {'threshold': 
0.5} \n", + "\n", + " fit_predict_time default_cv_time average_cv_time \\\n", + "0 0 days 00:00:04.250012 0 days 00:00:14.374875 0 days 00:00:15.360015 \n", + "1 0 days 00:00:04.410682 0 days 00:00:14.411205 0 days 00:00:10.633619 \n", + "2 0 days 00:01:11.416859 0 days 00:02:55.012078 0 days 00:00:00.806430 \n", + "3 0 days 00:01:03.612676 0 days 00:02:26.925796 0 days 00:00:00.755424 \n", + "\n", + " total_time status accuracy_threshold/0.5 f1_threshold/0.5 \\\n", + "0 0 days 00:01:10.806375 OK 0.640449 0.058824 \n", + "1 0 days 00:00:55.011304 OK 0.595506 0.307692 \n", + "2 0 days 00:05:20.653100 OK 0.797753 0.666667 \n", + "3 0 days 00:04:37.570182 OK 0.820225 0.692308 \n", "\n", - " default_cv tuned_cv tuned_test status \n", - "0 0.646437 0.646437 0.666667 OK \n", - "1 0.577295 0.616052 0.709677 OK \n", - "2 0.619698 0.646750 0.631579 OK \n", - "3 0.619698 0.637957 0.642857 OK " + " fpr_threshold/0.5 tpr_threshold/0.5 \n", + "0 1.0 0.0 \n", + "1 1.0 0.0 \n", + "2 1.0 0.0 \n", + "3 1.0 0.0 " ] }, "execution_count": 4, @@ -355,7 +403,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.11" + "version": "3.6.12" } }, "nbformat": 4, diff --git a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb index 5bcb1ea..5fc510e 100644 --- a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb +++ b/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb @@ -11,15 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "outputs": [], "source": [ "from greenguard.demo import load_demo\n", "\n", @@ -32,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = 'normalize_dfs_xgb_classifier'" + "pipeline_name = 'classes.normalize_dfs_xgb_classifier'" ] }, { @@ -759,27 +751,27 @@ " \n", " \n", " \n", - " SUM(readings.value)\n", - " STD(readings.value)\n", + " COUNT(readings)\n", " MAX(readings.value)\n", - " SKEW(readings.value)\n", - " MIN(readings.value)\n", " MEAN(readings.value)\n", - " COUNT(readings)\n", - " NUM_UNIQUE(readings.signal_id)\n", + " MIN(readings.value)\n", " MODE(readings.signal_id)\n", - " NUM_UNIQUE(readings.DAY(timestamp))\n", + " NUM_UNIQUE(readings.signal_id)\n", + " SKEW(readings.value)\n", + " STD(readings.value)\n", + " SUM(readings.value)\n", + " MODE(readings.DAY(timestamp))\n", " ...\n", - " MEAN(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", - " MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", - " MODE(readings.signals.MODE(readings.MONTH(timestamp)))\n", - " MODE(readings.signals.MODE(readings.DAY(timestamp)))\n", - " MODE(readings.signals.MODE(readings.YEAR(timestamp)))\n", - " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", + " SKEW(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", + " SKEW(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", + " 
STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", " \n", " \n", " turbine_id\n", @@ -809,123 +801,123 @@ " \n", " \n", " T001\n", - " 3.457475e+09\n", - " 1.456852e+06\n", + " 3744\n", " 3448719.0\n", - " 1.019212\n", + " 917107.079193\n", " 0.0\n", - " 917102.224456\n", - " 3770\n", - " 26\n", " S01\n", - " 2\n", - " ...\n", - " 1\n", - " 2\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", + " 26\n", + " 1.019214\n", + " 1.456860e+06\n", + " 3.433649e+09\n", " 11\n", - " 2013\n", - " 4\n", + " ...\n", + " 0\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " \n", " \n", " T001\n", - " 3.465358e+09\n", - " 1.459852e+06\n", + " 3744\n", " 3453777.0\n", - " 1.018760\n", + " 919201.162179\n", " 0.0\n", - " 919193.186021\n", - " 3770\n", - " 26\n", " S01\n", - " 2\n", - " ...\n", - " 1\n", - " 2\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", + " 26\n", + " 1.018761\n", + " 1.459865e+06\n", + " 3.441489e+09\n", " 12\n", - " 2013\n", - " 5\n", + " ...\n", + " 0\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " \n", " \n", " T001\n", - " 3.479406e+09\n", - " 1.465252e+06\n", + " 3744\n", " 3463880.0\n", - " 1.018192\n", + " 922935.352244\n", " 2.7\n", - " 922919.430027\n", - " 3770\n", - " 26\n", " S01\n", - " 2\n", - " ...\n", - " 1\n", - " 2\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", + " 26\n", + " 1.018192\n", + " 1.465277e+06\n", + " 3.455470e+09\n", " 13\n", - " 2013\n", - " 6\n", + " ...\n", + " 0\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " \n", " \n", " T001\n", - " 3.499427e+09\n", - " 1.473308e+06\n", + " 3744\n", " 3474703.0\n", - " 1.017664\n", + " 928248.092869\n", " -1.0\n", - " 928229.883899\n", - " 3770\n", - " 26\n", " S01\n", - " 2\n", - " ...\n", - " 1\n", - " 2\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", + " 26\n", + " 1.017666\n", + " 1.473337e+06\n", + " 3.475361e+09\n", " 14\n", - " 2013\n", + " ...\n", " 0\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " \n", " \n", " T001\n", - " 2.912289e+09\n", - " 1.477955e+06\n", + " 3744\n", " 3485019.0\n", - " 1.031879\n", + " 924186.531200\n", " 0.0\n", - " 924242.895144\n", - " 3770\n", - " 26\n", " S01\n", - " 2\n", - " ...\n", - " 1\n", - " 2\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", + " 26\n", + " 1.032002\n", + " 1.477958e+06\n", + " 2.888083e+09\n", " 15\n", - " 2013\n", - " 1\n", + " ...\n", + " 0\n", + " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " \n", " \n", "\n", @@ -933,117 +925,125 @@ "" ], "text/plain": [ - " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", - "turbine_id \n", - "T001 3.457475e+09 1.456852e+06 3448719.0 \n", - "T001 3.465358e+09 1.459852e+06 3453777.0 \n", - "T001 3.479406e+09 1.465252e+06 3463880.0 \n", - "T001 3.499427e+09 1.473308e+06 3474703.0 \n", - "T001 2.912289e+09 1.477955e+06 3485019.0 \n", + " COUNT(readings) MAX(readings.value) MEAN(readings.value) \\\n", + "turbine_id \n", + "T001 3744 3448719.0 917107.079193 \n", + "T001 
3744 3453777.0 919201.162179 \n", + "T001 3744 3463880.0 922935.352244 \n", + "T001 3744 3474703.0 928248.092869 \n", + "T001 3744 3485019.0 924186.531200 \n", "\n", - " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", - "turbine_id \n", - "T001 1.019212 0.0 917102.224456 \n", - "T001 1.018760 0.0 919193.186021 \n", - "T001 1.018192 2.7 922919.430027 \n", - "T001 1.017664 -1.0 928229.883899 \n", - "T001 1.031879 0.0 924242.895144 \n", + " MIN(readings.value) MODE(readings.signal_id) \\\n", + "turbine_id \n", + "T001 0.0 S01 \n", + "T001 0.0 S01 \n", + "T001 2.7 S01 \n", + "T001 -1.0 S01 \n", + "T001 0.0 S01 \n", "\n", - " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", - "turbine_id \n", - "T001 3770 26 \n", - "T001 3770 26 \n", - "T001 3770 26 \n", - "T001 3770 26 \n", - "T001 3770 26 \n", + " NUM_UNIQUE(readings.signal_id) SKEW(readings.value) \\\n", + "turbine_id \n", + "T001 26 1.019214 \n", + "T001 26 1.018761 \n", + "T001 26 1.018192 \n", + "T001 26 1.017666 \n", + "T001 26 1.032002 \n", "\n", - " MODE(readings.signal_id) NUM_UNIQUE(readings.DAY(timestamp)) ... \\\n", - "turbine_id ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", + " STD(readings.value) SUM(readings.value) \\\n", + "turbine_id \n", + "T001 1.456860e+06 3.433649e+09 \n", + "T001 1.459865e+06 3.441489e+09 \n", + "T001 1.465277e+06 3.455470e+09 \n", + "T001 1.473337e+06 3.475361e+09 \n", + "T001 1.477958e+06 2.888083e+09 \n", "\n", - " MEAN(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " MODE(readings.DAY(timestamp)) ... \\\n", + "turbine_id ... \n", + "T001 11 ... \n", + "T001 12 ... \n", + "T001 13 ... \n", + "T001 14 ... \n", + "T001 15 ... 
\n", "\n", - " MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + " SKEW(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", + "T001 0 \n", + "T001 0 \n", + "T001 0 \n", + "T001 0 \n", + "T001 0 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " SKEW(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 0 \n", + "T001 0 \n", + "T001 0 \n", + "T001 0 \n", + "T001 0 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 11 \n", - "T001 12 \n", - "T001 13 \n", - "T001 14 \n", - "T001 15 \n", + " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", "\n", - " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", + " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "\n", + " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", "\n", - " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \n", - "turbine_id \n", - "T001 4 \n", - "T001 5 \n", - "T001 6 \n", - "T001 0 \n", - "T001 1 \n", + " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \n", + "turbine_id \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", "\n", "[5 rows x 99 columns]" ] @@ -1235,25 +1235,25 @@ " \n", " \n", " \n", - " SUM(readings.value)\n", - " STD(readings.value)\n", + " COUNT(readings)\n", " MAX(readings.value)\n", - " SKEW(readings.value)\n", - " MIN(readings.value)\n", " MEAN(readings.value)\n", - " COUNT(readings)\n", + 
" MIN(readings.value)\n", " NUM_UNIQUE(readings.signal_id)\n", - " NUM_UNIQUE(readings.DAY(timestamp))\n", - " NUM_UNIQUE(readings.MONTH(timestamp))\n", + " SKEW(readings.value)\n", + " STD(readings.value)\n", + " SUM(readings.value)\n", + " MODE(readings.DAY(timestamp))\n", + " MODE(readings.MONTH(timestamp))\n", " ...\n", - " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))\n", - " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", - " MODE(readings.signals.MODE(readings.MONTH(timestamp)))\n", - " MODE(readings.signals.MODE(readings.DAY(timestamp)))\n", - " MODE(readings.signals.MODE(readings.YEAR(timestamp)))\n", - " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", + " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", + " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", " MODE(readings.signal_id)=S01\n", " MODE(readings.signals.MODE(readings.turbine_id))=T001\n", " \n", @@ -1285,121 +1285,121 @@ " \n", " \n", " T001\n", - " 3.457475e+09\n", - " 1.456852e+06\n", + " 3744\n", " 3448719.0\n", - " 1.019212\n", + " 917107.079193\n", " 0.0\n", - " 917102.224456\n", - " 3770\n", " 26\n", - " 2\n", + " 1.019214\n", + " 1.456860e+06\n", + " 3.433649e+09\n", + " 11\n", " 1\n", " ...\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 11\n", - " 2013\n", - " 4\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3.465358e+09\n", - " 1.459852e+06\n", + " 3744\n", " 3453777.0\n", - " 1.018760\n", + " 919201.162179\n", " 0.0\n", - " 919193.186021\n", - " 3770\n", " 26\n", - " 2\n", + " 1.018761\n", + " 1.459865e+06\n", + " 3.441489e+09\n", + " 12\n", " 1\n", " ...\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 12\n", - " 2013\n", - " 5\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3.479406e+09\n", - " 1.465252e+06\n", + " 3744\n", " 3463880.0\n", - " 1.018192\n", + " 922935.352244\n", " 2.7\n", - " 922919.430027\n", - " 3770\n", " 26\n", - " 2\n", + " 1.018192\n", + " 1.465277e+06\n", + " 3.455470e+09\n", + " 13\n", " 1\n", " ...\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 13\n", - " 2013\n", - " 6\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3.499427e+09\n", - " 1.473308e+06\n", + " 3744\n", " 3474703.0\n", - " 1.017664\n", + " 928248.092869\n", " -1.0\n", - " 928229.883899\n", - " 3770\n", " 26\n", - " 2\n", + " 1.017666\n", + " 1.473337e+06\n", + " 3.475361e+09\n", + " 14\n", " 1\n", " ...\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 14\n", - " 2013\n", - " 0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 2.912289e+09\n", - " 1.477955e+06\n", + " 3744\n", " 
3485019.0\n", - " 1.031879\n", + " 924186.531200\n", " 0.0\n", - " 924242.895144\n", - " 3770\n", " 26\n", - " 2\n", - " 1\n", - " ...\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", - " 1\n", + " 1.032002\n", + " 1.477958e+06\n", + " 2.888083e+09\n", " 15\n", - " 2013\n", " 1\n", + " ...\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 0.0\n", + " 7488\n", + " 3744\n", + " 7488\n", + " 3744\n", " 1\n", " 1\n", " \n", @@ -1409,109 +1409,101 @@ "" ], "text/plain": [ - " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", - "turbine_id \n", - "T001 3.457475e+09 1.456852e+06 3448719.0 \n", - "T001 3.465358e+09 1.459852e+06 3453777.0 \n", - "T001 3.479406e+09 1.465252e+06 3463880.0 \n", - "T001 3.499427e+09 1.473308e+06 3474703.0 \n", - "T001 2.912289e+09 1.477955e+06 3485019.0 \n", - "\n", - " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", - "turbine_id \n", - "T001 1.019212 0.0 917102.224456 \n", - "T001 1.018760 0.0 919193.186021 \n", - "T001 1.018192 2.7 922919.430027 \n", - "T001 1.017664 -1.0 928229.883899 \n", - "T001 1.031879 0.0 924242.895144 \n", - "\n", - " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", - "turbine_id \n", - "T001 3770 26 \n", - "T001 3770 26 \n", - "T001 3770 26 \n", - "T001 3770 26 \n", - "T001 3770 26 \n", + " COUNT(readings) MAX(readings.value) MEAN(readings.value) \\\n", + "turbine_id \n", + "T001 3744 3448719.0 917107.079193 \n", + "T001 3744 3453777.0 919201.162179 \n", + "T001 3744 3463880.0 922935.352244 \n", + "T001 3744 3474703.0 928248.092869 \n", + "T001 3744 3485019.0 924186.531200 \n", "\n", - " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", + " MIN(readings.value) NUM_UNIQUE(readings.signal_id) \\\n", + "turbine_id \n", + "T001 0.0 26 \n", + "T001 0.0 26 \n", + "T001 2.7 26 \n", + "T001 -1.0 26 \n", + "T001 0.0 26 \n", "\n", - " NUM_UNIQUE(readings.MONTH(timestamp)) ... \\\n", - "turbine_id ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "T001 1 ... \n", + " SKEW(readings.value) STD(readings.value) SUM(readings.value) \\\n", + "turbine_id \n", + "T001 1.019214 1.456860e+06 3.433649e+09 \n", + "T001 1.018761 1.459865e+06 3.441489e+09 \n", + "T001 1.018192 1.465277e+06 3.455470e+09 \n", + "T001 1.017666 1.473337e+06 3.475361e+09 \n", + "T001 1.032002 1.477958e+06 2.888083e+09 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " MODE(readings.DAY(timestamp)) MODE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 11 1 \n", + "T001 12 1 \n", + "T001 13 1 \n", + "T001 14 1 \n", + "T001 15 1 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " ... STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", + "turbine_id ... \n", + "T001 ... 0.0 \n", + "T001 ... 0.0 \n", + "T001 ... 0.0 \n", + "T001 ... 0.0 \n", + "T001 ... 
0.0 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", + "T001 0.0 \n", "\n", - " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 11 \n", - "T001 12 \n", - "T001 13 \n", - "T001 14 \n", - "T001 15 \n", + " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "\n", + " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", "\n", - " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", + " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", + "T001 7488 \n", "\n", - " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 4 \n", - "T001 5 \n", - "T001 6 \n", - "T001 0 \n", - "T001 1 \n", + " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", + "T001 3744 \n", "\n", " MODE(readings.signal_id)=S01 \\\n", "turbine_id \n", diff --git a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb index 5c7b442..f44377b 100644 --- a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb +++ b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb @@ -11,15 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "outputs": [], "source": [ "from greenguard.demo import load_demo\n", "\n", @@ -32,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = 'unstack_double_lstm_timeseries_classifier'" + "pipeline_name = 'classes.unstack_double_lstm_timeseries_classifier'" ] }, { @@ -2458,19 +2450,7 @@ "cell_type": "code", "execution_count": 43, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with 
constraint is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "If using Keras pass *_constraint arguments to layers.\n", - "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. Please use tf.compat.v1.global_variables instead.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "step = 10\n", "context = pipeline.fit(**context, output_=step, start_=step)" diff --git a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb index faec108..ec68b0e 100644 --- a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb +++ b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb @@ -11,15 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "outputs": [], "source": [ "from greenguard.demo import load_demo\n", "\n", @@ -32,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = 'unstack_lstm_timeseries_classifier'" + "pipeline_name = 'classes.unstack_lstm_timeseries_classifier'" ] }, { @@ -2332,19 +2324,7 @@ "cell_type": "code", "execution_count": 43, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/tensorflow_core/python/ops/resource_variable_ops.py:1630: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n", - "Instructions for updating:\n", - "If using Keras pass *_constraint arguments to layers.\n", - "WARNING:tensorflow:From /home/plamen/.virtualenvs/GreenGuard/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:422: The name tf.global_variables is deprecated. 
Please use tf.compat.v1.global_variables instead.\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "step = 10\n", "context = pipeline.fit(**context, output_=step, start_=step)" diff --git a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb index 6af0092..8fc6c8b 100644 --- a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb +++ b/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb @@ -11,15 +11,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using TensorFlow backend.\n" - ] - } - ], + "outputs": [], "source": [ "from greenguard.demo import load_demo\n", "\n", @@ -32,7 +24,7 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = 'unstack_normalize_dfs_xgb_classifier'" + "pipeline_name = 'classes.unstack_normalize_dfs_xgb_classifier'" ] }, { @@ -851,27 +843,27 @@ " \n", " \n", " \n", - " SUM(readings.value_S09)\n", - " SUM(readings.value_S01)\n", - " SUM(readings.value_S12)\n", - " SUM(readings.value_S10)\n", - " SUM(readings.value_S18)\n", - " SUM(readings.value_S03)\n", - " SUM(readings.value_S16)\n", - " SUM(readings.value_S11)\n", - " SUM(readings.value_S21)\n", - " SUM(readings.value_S08)\n", - " ...\n", - " MEAN(readings.value_S20)\n", " COUNT(readings)\n", - " NUM_UNIQUE(readings.WEEKDAY(timestamp))\n", - " NUM_UNIQUE(readings.DAY(timestamp))\n", - " NUM_UNIQUE(readings.YEAR(timestamp))\n", - " NUM_UNIQUE(readings.MONTH(timestamp))\n", - " MODE(readings.WEEKDAY(timestamp))\n", + " MAX(readings.value_S01)\n", + " MAX(readings.value_S02)\n", + " MAX(readings.value_S03)\n", + " MAX(readings.value_S04)\n", + " MAX(readings.value_S05)\n", + " MAX(readings.value_S06)\n", + " MAX(readings.value_S07)\n", + " MAX(readings.value_S08)\n", + " MAX(readings.value_S09)\n", + " ...\n", + " SUM(readings.value_S25)\n", + " SUM(readings.value_S26)\n", " MODE(readings.DAY(timestamp))\n", - " MODE(readings.YEAR(timestamp))\n", " MODE(readings.MONTH(timestamp))\n", + " MODE(readings.WEEKDAY(timestamp))\n", + " MODE(readings.YEAR(timestamp))\n", + " NUM_UNIQUE(readings.DAY(timestamp))\n", + " NUM_UNIQUE(readings.MONTH(timestamp))\n", + " NUM_UNIQUE(readings.WEEKDAY(timestamp))\n", + " NUM_UNIQUE(readings.YEAR(timestamp))\n", " \n", " \n", " turbine_id\n", @@ -901,122 +893,122 @@ " \n", " \n", " T001\n", - " 102204875.0\n", - " 19558.0\n", - " 483068250.0\n", - " 486911931.0\n", - " 463347422.0\n", - " 18602.0\n", - " 555.2\n", - " 499808026.0\n", - " 3090.0\n", - " 465058755.0\n", + " 144\n", + " 369.0\n", + " 376.0\n", + " 378.0\n", + " 401.0\n", + " 317.0\n", + " 324.0\n", + " 301.0\n", + " 3209069.0\n", + " 706654.0\n", " ...\n", - " 22.406897\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 2743.0\n", + " 20569.0\n", + " 11\n", " 1\n", " 4\n", - " 11\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 102808505.0\n", - " 37965.0\n", - " 483585662.0\n", - " 487487610.0\n", - " 467167621.0\n", - " 34495.0\n", - " 719.2\n", - " 500401347.0\n", - " 4970.0\n", - " 465669184.0\n", + " 144\n", + " 505.0\n", + " 426.0\n", + " 393.0\n", + " 517.0\n", + " 469.0\n", + " 407.0\n", + " 459.0\n", + " 3214181.0\n", + " 711718.0\n", " ...\n", - " 35.282759\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 4237.0\n", + " 32991.0\n", + " 12\n", " 1\n", " 5\n", - " 12\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 103701788.0\n", - " 73948.0\n", - 
" 484538080.0\n", - " 488531121.0\n", - " 473938223.0\n", - " 77804.0\n", - " 921.1\n", - " 501472849.0\n", - " 9902.0\n", - " 466675578.0\n", + " 144\n", + " 827.0\n", + " 794.0\n", + " 839.0\n", + " 848.0\n", + " 843.0\n", + " 843.0\n", + " 844.0\n", + " 3223315.0\n", + " 719405.0\n", " ...\n", - " 53.255172\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 9008.0\n", + " 63463.0\n", + " 13\n", " 1\n", " 6\n", - " 13\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 104917985.0\n", - " 87206.0\n", - " 486012792.0\n", - " 490024295.0\n", - " 483808936.0\n", - " 81629.0\n", - " 977.2\n", - " 502994331.0\n", - " 10720.0\n", - " 468099974.0\n", + " 144\n", + " 848.0\n", + " 841.0\n", + " 838.0\n", + " 849.0\n", + " 850.0\n", + " 848.0\n", + " 850.0\n", + " 3233989.0\n", + " 728250.0\n", " ...\n", - " 61.482759\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 10073.0\n", + " 70393.0\n", + " 14\n", " 1\n", " 0\n", - " 14\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 84328762.0\n", - " 61778.0\n", - " 389879083.0\n", - " 396521849.0\n", - " 492596536.0\n", - " 65122.0\n", - " 954.3\n", - " 403671026.0\n", - " 8684.0\n", - " 375635231.0\n", + " 144\n", + " 825.0\n", + " 840.0\n", + " 840.0\n", + " 844.0\n", + " 844.0\n", + " 830.0\n", + " 839.0\n", + " 3242820.0\n", + " 738155.0\n", " ...\n", - " 87.315789\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 7381.0\n", + " 59954.0\n", + " 15\n", " 1\n", " 1\n", - " 15\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", @@ -1025,61 +1017,69 @@ "" ], "text/plain": [ - " SUM(readings.value_S09) SUM(readings.value_S01) \\\n", - "turbine_id \n", - "T001 102204875.0 19558.0 \n", - "T001 102808505.0 37965.0 \n", - "T001 103701788.0 73948.0 \n", - "T001 104917985.0 87206.0 \n", - "T001 84328762.0 61778.0 \n", + " COUNT(readings) MAX(readings.value_S01) MAX(readings.value_S02) \\\n", + "turbine_id \n", + "T001 144 369.0 376.0 \n", + "T001 144 505.0 426.0 \n", + "T001 144 827.0 794.0 \n", + "T001 144 848.0 841.0 \n", + "T001 144 825.0 840.0 \n", "\n", - " SUM(readings.value_S12) SUM(readings.value_S10) \\\n", + " MAX(readings.value_S03) MAX(readings.value_S04) \\\n", "turbine_id \n", - "T001 483068250.0 486911931.0 \n", - "T001 483585662.0 487487610.0 \n", - "T001 484538080.0 488531121.0 \n", - "T001 486012792.0 490024295.0 \n", - "T001 389879083.0 396521849.0 \n", + "T001 378.0 401.0 \n", + "T001 393.0 517.0 \n", + "T001 839.0 848.0 \n", + "T001 838.0 849.0 \n", + "T001 840.0 844.0 \n", "\n", - " SUM(readings.value_S18) SUM(readings.value_S03) \\\n", + " MAX(readings.value_S05) MAX(readings.value_S06) \\\n", "turbine_id \n", - "T001 463347422.0 18602.0 \n", - "T001 467167621.0 34495.0 \n", - "T001 473938223.0 77804.0 \n", - "T001 483808936.0 81629.0 \n", - "T001 492596536.0 65122.0 \n", + "T001 317.0 324.0 \n", + "T001 469.0 407.0 \n", + "T001 843.0 843.0 \n", + "T001 850.0 848.0 \n", + "T001 844.0 830.0 \n", "\n", - " SUM(readings.value_S16) SUM(readings.value_S11) \\\n", + " MAX(readings.value_S07) MAX(readings.value_S08) \\\n", "turbine_id \n", - "T001 555.2 499808026.0 \n", - "T001 719.2 500401347.0 \n", - "T001 921.1 501472849.0 \n", - "T001 977.2 502994331.0 \n", - "T001 954.3 403671026.0 \n", + "T001 301.0 3209069.0 \n", + "T001 459.0 3214181.0 \n", + "T001 844.0 3223315.0 \n", + "T001 850.0 3233989.0 \n", + "T001 839.0 3242820.0 \n", "\n", - " SUM(readings.value_S21) SUM(readings.value_S08) ... \\\n", - "turbine_id ... \n", - "T001 3090.0 465058755.0 ... 
\n", - "T001 4970.0 465669184.0 ... \n", - "T001 9902.0 466675578.0 ... \n", - "T001 10720.0 468099974.0 ... \n", - "T001 8684.0 375635231.0 ... \n", + " MAX(readings.value_S09) ... SUM(readings.value_S25) \\\n", + "turbine_id ... \n", + "T001 706654.0 ... 2743.0 \n", + "T001 711718.0 ... 4237.0 \n", + "T001 719405.0 ... 9008.0 \n", + "T001 728250.0 ... 10073.0 \n", + "T001 738155.0 ... 7381.0 \n", "\n", - " MEAN(readings.value_S20) COUNT(readings) \\\n", - "turbine_id \n", - "T001 22.406897 145 \n", - "T001 35.282759 145 \n", - "T001 53.255172 145 \n", - "T001 61.482759 145 \n", - "T001 87.315789 145 \n", + " SUM(readings.value_S26) MODE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 20569.0 11 \n", + "T001 32991.0 12 \n", + "T001 63463.0 13 \n", + "T001 70393.0 14 \n", + "T001 59954.0 15 \n", "\n", - " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", + " MODE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.WEEKDAY(timestamp)) MODE(readings.YEAR(timestamp)) \\\n", + "turbine_id \n", + "T001 4 2013 \n", + "T001 5 2013 \n", + "T001 6 2013 \n", + "T001 0 2013 \n", + "T001 1 2013 \n", "\n", " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", "turbine_id \n", @@ -1089,14 +1089,6 @@ "T001 2 \n", "T001 2 \n", "\n", - " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", "turbine_id \n", "T001 1 \n", @@ -1105,21 +1097,21 @@ "T001 1 \n", "T001 1 \n", "\n", - " MODE(readings.WEEKDAY(timestamp)) MODE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 4 11 \n", - "T001 5 12 \n", - "T001 6 13 \n", - "T001 0 14 \n", - "T001 1 15 \n", + " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", "\n", - " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \n", - "turbine_id \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", + " NUM_UNIQUE(readings.YEAR(timestamp)) \n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", "[5 rows x 165 columns]" ] @@ -1221,27 +1213,27 @@ " \n", " \n", " \n", - " SUM(readings.value_S09)\n", - " SUM(readings.value_S01)\n", - " SUM(readings.value_S12)\n", - " SUM(readings.value_S10)\n", - " SUM(readings.value_S18)\n", - " SUM(readings.value_S03)\n", - " SUM(readings.value_S16)\n", - " SUM(readings.value_S11)\n", - " SUM(readings.value_S21)\n", - " SUM(readings.value_S08)\n", - " ...\n", - " MEAN(readings.value_S20)\n", " COUNT(readings)\n", - " NUM_UNIQUE(readings.WEEKDAY(timestamp))\n", - " NUM_UNIQUE(readings.DAY(timestamp))\n", - " NUM_UNIQUE(readings.YEAR(timestamp))\n", - " NUM_UNIQUE(readings.MONTH(timestamp))\n", - " MODE(readings.WEEKDAY(timestamp))\n", + " MAX(readings.value_S01)\n", + " MAX(readings.value_S02)\n", + " MAX(readings.value_S03)\n", + " MAX(readings.value_S04)\n", + " MAX(readings.value_S05)\n", + " MAX(readings.value_S06)\n", + " MAX(readings.value_S07)\n", + " MAX(readings.value_S08)\n", + " MAX(readings.value_S09)\n", + " ...\n", + " SUM(readings.value_S25)\n", + " SUM(readings.value_S26)\n", " MODE(readings.DAY(timestamp))\n", - " MODE(readings.YEAR(timestamp))\n", " MODE(readings.MONTH(timestamp))\n", + " 
MODE(readings.WEEKDAY(timestamp))\n", + " MODE(readings.YEAR(timestamp))\n", + " NUM_UNIQUE(readings.DAY(timestamp))\n", + " NUM_UNIQUE(readings.MONTH(timestamp))\n", + " NUM_UNIQUE(readings.WEEKDAY(timestamp))\n", + " NUM_UNIQUE(readings.YEAR(timestamp))\n", " \n", " \n", " turbine_id\n", @@ -1271,122 +1263,122 @@ " \n", " \n", " T001\n", - " 102204875.0\n", - " 19558.0\n", - " 483068250.0\n", - " 486911931.0\n", - " 463347422.0\n", - " 18602.0\n", - " 555.2\n", - " 499808026.0\n", - " 3090.0\n", - " 465058755.0\n", + " 144\n", + " 369.0\n", + " 376.0\n", + " 378.0\n", + " 401.0\n", + " 317.0\n", + " 324.0\n", + " 301.0\n", + " 3209069.0\n", + " 706654.0\n", " ...\n", - " 22.406897\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 2743.0\n", + " 20569.0\n", + " 11\n", " 1\n", " 4\n", - " 11\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 102808505.0\n", - " 37965.0\n", - " 483585662.0\n", - " 487487610.0\n", - " 467167621.0\n", - " 34495.0\n", - " 719.2\n", - " 500401347.0\n", - " 4970.0\n", - " 465669184.0\n", + " 144\n", + " 505.0\n", + " 426.0\n", + " 393.0\n", + " 517.0\n", + " 469.0\n", + " 407.0\n", + " 459.0\n", + " 3214181.0\n", + " 711718.0\n", " ...\n", - " 35.282759\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 4237.0\n", + " 32991.0\n", + " 12\n", " 1\n", " 5\n", - " 12\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 103701788.0\n", - " 73948.0\n", - " 484538080.0\n", - " 488531121.0\n", - " 473938223.0\n", - " 77804.0\n", - " 921.1\n", - " 501472849.0\n", - " 9902.0\n", - " 466675578.0\n", + " 144\n", + " 827.0\n", + " 794.0\n", + " 839.0\n", + " 848.0\n", + " 843.0\n", + " 843.0\n", + " 844.0\n", + " 3223315.0\n", + " 719405.0\n", " ...\n", - " 53.255172\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 9008.0\n", + " 63463.0\n", + " 13\n", " 1\n", " 6\n", - " 13\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 104917985.0\n", - " 87206.0\n", - " 486012792.0\n", - " 490024295.0\n", - " 483808936.0\n", - " 81629.0\n", - " 977.2\n", - " 502994331.0\n", - " 10720.0\n", - " 468099974.0\n", + " 144\n", + " 848.0\n", + " 841.0\n", + " 838.0\n", + " 849.0\n", + " 850.0\n", + " 848.0\n", + " 850.0\n", + " 3233989.0\n", + " 728250.0\n", " ...\n", - " 61.482759\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 10073.0\n", + " 70393.0\n", + " 14\n", " 1\n", " 0\n", - " 14\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", " T001\n", - " 84328762.0\n", - " 61778.0\n", - " 389879083.0\n", - " 396521849.0\n", - " 492596536.0\n", - " 65122.0\n", - " 954.3\n", - " 403671026.0\n", - " 8684.0\n", - " 375635231.0\n", + " 144\n", + " 825.0\n", + " 840.0\n", + " 840.0\n", + " 844.0\n", + " 844.0\n", + " 830.0\n", + " 839.0\n", + " 3242820.0\n", + " 738155.0\n", " ...\n", - " 87.315789\n", - " 145\n", - " 2\n", - " 2\n", - " 1\n", + " 7381.0\n", + " 59954.0\n", + " 15\n", " 1\n", " 1\n", - " 15\n", " 2013\n", + " 2\n", + " 1\n", + " 2\n", " 1\n", " \n", " \n", @@ -1395,61 +1387,69 @@ "" ], "text/plain": [ - " SUM(readings.value_S09) SUM(readings.value_S01) \\\n", - "turbine_id \n", - "T001 102204875.0 19558.0 \n", - "T001 102808505.0 37965.0 \n", - "T001 103701788.0 73948.0 \n", - "T001 104917985.0 87206.0 \n", - "T001 84328762.0 61778.0 \n", + " COUNT(readings) MAX(readings.value_S01) MAX(readings.value_S02) \\\n", + "turbine_id \n", + "T001 144 369.0 376.0 \n", + "T001 144 505.0 426.0 \n", + "T001 144 827.0 794.0 \n", + "T001 144 848.0 841.0 \n", + "T001 144 825.0 840.0 \n", 
"\n", - " SUM(readings.value_S12) SUM(readings.value_S10) \\\n", + " MAX(readings.value_S03) MAX(readings.value_S04) \\\n", "turbine_id \n", - "T001 483068250.0 486911931.0 \n", - "T001 483585662.0 487487610.0 \n", - "T001 484538080.0 488531121.0 \n", - "T001 486012792.0 490024295.0 \n", - "T001 389879083.0 396521849.0 \n", + "T001 378.0 401.0 \n", + "T001 393.0 517.0 \n", + "T001 839.0 848.0 \n", + "T001 838.0 849.0 \n", + "T001 840.0 844.0 \n", "\n", - " SUM(readings.value_S18) SUM(readings.value_S03) \\\n", + " MAX(readings.value_S05) MAX(readings.value_S06) \\\n", "turbine_id \n", - "T001 463347422.0 18602.0 \n", - "T001 467167621.0 34495.0 \n", - "T001 473938223.0 77804.0 \n", - "T001 483808936.0 81629.0 \n", - "T001 492596536.0 65122.0 \n", + "T001 317.0 324.0 \n", + "T001 469.0 407.0 \n", + "T001 843.0 843.0 \n", + "T001 850.0 848.0 \n", + "T001 844.0 830.0 \n", "\n", - " SUM(readings.value_S16) SUM(readings.value_S11) \\\n", + " MAX(readings.value_S07) MAX(readings.value_S08) \\\n", "turbine_id \n", - "T001 555.2 499808026.0 \n", - "T001 719.2 500401347.0 \n", - "T001 921.1 501472849.0 \n", - "T001 977.2 502994331.0 \n", - "T001 954.3 403671026.0 \n", + "T001 301.0 3209069.0 \n", + "T001 459.0 3214181.0 \n", + "T001 844.0 3223315.0 \n", + "T001 850.0 3233989.0 \n", + "T001 839.0 3242820.0 \n", "\n", - " SUM(readings.value_S21) SUM(readings.value_S08) ... \\\n", - "turbine_id ... \n", - "T001 3090.0 465058755.0 ... \n", - "T001 4970.0 465669184.0 ... \n", - "T001 9902.0 466675578.0 ... \n", - "T001 10720.0 468099974.0 ... \n", - "T001 8684.0 375635231.0 ... \n", + " MAX(readings.value_S09) ... SUM(readings.value_S25) \\\n", + "turbine_id ... \n", + "T001 706654.0 ... 2743.0 \n", + "T001 711718.0 ... 4237.0 \n", + "T001 719405.0 ... 9008.0 \n", + "T001 728250.0 ... 10073.0 \n", + "T001 738155.0 ... 
7381.0 \n", "\n", - " MEAN(readings.value_S20) COUNT(readings) \\\n", - "turbine_id \n", - "T001 22.406897 145 \n", - "T001 35.282759 145 \n", - "T001 53.255172 145 \n", - "T001 61.482759 145 \n", - "T001 87.315789 145 \n", + " SUM(readings.value_S26) MODE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 20569.0 11 \n", + "T001 32991.0 12 \n", + "T001 63463.0 13 \n", + "T001 70393.0 14 \n", + "T001 59954.0 15 \n", "\n", - " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", + " MODE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "\n", + " MODE(readings.WEEKDAY(timestamp)) MODE(readings.YEAR(timestamp)) \\\n", + "turbine_id \n", + "T001 4 2013 \n", + "T001 5 2013 \n", + "T001 6 2013 \n", + "T001 0 2013 \n", + "T001 1 2013 \n", "\n", " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", "turbine_id \n", @@ -1459,14 +1459,6 @@ "T001 2 \n", "T001 2 \n", "\n", - " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", "turbine_id \n", "T001 1 \n", @@ -1475,21 +1467,21 @@ "T001 1 \n", "T001 1 \n", "\n", - " MODE(readings.WEEKDAY(timestamp)) MODE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 4 11 \n", - "T001 5 12 \n", - "T001 6 13 \n", - "T001 0 14 \n", - "T001 1 15 \n", + " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", "\n", - " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \n", - "turbine_id \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", + " NUM_UNIQUE(readings.YEAR(timestamp)) \n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", "[5 rows x 165 columns]" ] From 23216be0272572d8d2567e894e11e3466d10b23d Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 22 Jan 2021 15:33:29 +0100 Subject: [PATCH 132/171] Add release notes for 0.3.0 --- HISTORY.md | 19 +++++++++++++++++-- setup.py | 1 + 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index ef6042e..e656d1a 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,20 @@ # History +## 0.3.0 - 2021-01-22 + +This release increases the supported version of python to `3.8` and also includes changes +in the installation requirements, where ``pandas`` and ``scikit-optimize`` packages have +been updated to support higher versions. This changes come together with the newer versions +of ``MLBlocks`` and ``MLPrimitives``. + +### Internal Improvements + +* Fix ``run_benchmark`` generating properly the ``init_hyperparameters`` for the pipelines. +* New ``FPR`` metric. +* New ``roc_auc_score`` metric. +* Multiple benchmarking metrics allowed. +* Multiple ``tpr`` or ``threshold`` values allowed for the benchmark. + ## 0.2.6 - 2020-10-23 * Fix ``mkdir`` when exporting to ``csv`` file the benchmark results. @@ -16,7 +31,7 @@ With this release we include: * `run_benchmark`: A function within the module `benchmark` that allows the user to evaluate templates against problems with different window size and resample rules. * `summarize_results`: A function that given a `csv` file generates a `xlsx` file with a summary -tab and a deatailed tab with the results from `run_benchmark`. 
+tab and a detailed tab with the results from `run_benchmark`. ## 0.2.4 - 2020-09-25 @@ -28,7 +43,7 @@ tab and a deatailed tab with the results from `run_benchmark`. ## 0.2.2 - 2020-07-10 -### Internal Imrpovements +### Internal Improvements * Added github actions. diff --git a/setup.py b/setup.py index cefe9da..ef8df55 100644 --- a/setup.py +++ b/setup.py @@ -89,6 +89,7 @@ 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', ], description='AutoML for Renewable Energy Industries.', entry_points={ From abcf8bd5f8a071eab1665770b2e2f866301e2e71 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 22 Jan 2021 15:33:42 +0100 Subject: [PATCH 133/171] =?UTF-8?q?Bump=20version:=200.2.7.dev2=20?= =?UTF-8?q?=E2=86=92=201.0.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 4374a6d..3a19035 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.2.7.dev2' +__version__ = '1.0.0.dev0' import os diff --git a/setup.cfg b/setup.cfg index 724b591..564d55c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.7.dev2 +current_version = 1.0.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index ef8df55..4f00063 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.2.7.dev2', + version='1.0.0.dev0', zip_safe=False, ) From ccf15189b5224e8fbadd1542f29f0b883f186c4d Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 22 Jan 2021 15:33:43 +0100 Subject: [PATCH 134/171] =?UTF-8?q?Bump=20version:=201.0.0.dev0=20?= =?UTF-8?q?=E2=86=92=201.0.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 3a19035..a579d1a 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '1.0.0.dev0' +__version__ = '1.0.0' import os diff --git a/setup.cfg b/setup.cfg index 564d55c..5979b04 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.0.dev0 +current_version = 1.0.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
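Note on the ``parse`` lines in the ``setup.cfg`` hunks of the version-bump patches above and below: bumpversion splits the version string into named groups so that both ``X.Y.Z`` releases and ``X.Y.Z.devN`` candidates can be handled. A minimal sketch of how such a pattern decomposes the versions seen in this series follows; the group names (``major``, ``minor``, ``patch``, ``release``, ``candidate``) are assumed from bumpversion's usual conventions rather than confirmed by these hunks.

```python
# Sketch only: decompose a version string the way the bumpversion
# ``parse`` expression in setup.cfg does. The group names are assumed
# (standard bumpversion convention), not taken from the hunks above.
import re

PARSE = re.compile(
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
)

print(PARSE.match('1.0.0.dev0').groupdict())
# {'major': '1', 'minor': '0', 'patch': '0', 'release': 'dev', 'candidate': '0'}
print(PARSE.match('1.0.0').groupdict()['release'])  # None for a final release
```
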
diff --git a/setup.py b/setup.py index 4f00063..68d76d5 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='1.0.0.dev0', + version='1.0.0', zip_safe=False, ) From a9c2c38105ea18f7591fce491bf4090e86b25f60 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 22 Jan 2021 15:53:31 +0100 Subject: [PATCH 135/171] =?UTF-8?q?Bump=20version:=201.0.0=20=E2=86=92=201?= =?UTF-8?q?.0.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index a579d1a..869ef83 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '1.0.0' +__version__ = '1.0.1.dev0' import os diff --git a/setup.cfg b/setup.cfg index 5979b04..f92e999 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.0 +current_version = 1.0.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 68d76d5..ad79245 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='1.0.0', + version='1.0.1.dev0', zip_safe=False, ) From c730e1e1d6cbca04a355e6ae08fba045e5cd0a19 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 22 Jan 2021 18:07:48 +0100 Subject: [PATCH 136/171] Revert 1.0.0 error release --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index 869ef83..2a70d1f 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '1.0.1.dev0' +__version__ = '0.3.0.dev0' import os diff --git a/setup.cfg b/setup.cfg index f92e999..e2b3ca8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 1.0.1.dev0 +current_version = 0.3.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index ad79245..1ade9c6 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='1.0.1.dev0', + version='0.3.0.dev0', zip_safe=False, ) From 027a44abe44fc069fd0549574cb534610293fe62 Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Fri, 22 Jan 2021 18:09:58 +0100 Subject: [PATCH 137/171] Prevent making a release before making a release candidate --- Makefile | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 2b2d2f8..9a69d1a 100644 --- a/Makefile +++ b/Makefile @@ -224,6 +224,7 @@ bumpversion-revert: ## Undo a previous bumpversion-release CLEAN_DIR := $(shell git status --short | grep -v ??) 
 CURRENT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null)
+CURRENT_VERSION := $(shell grep "^current_version" setup.cfg | grep -o "dev[0-9]*")
 CHANGELOG_LINES := $(shell git diff HEAD..origin/stable HISTORY.md 2>&1 | wc -l)
 
 .PHONY: check-clean
@@ -238,6 +239,12 @@ ifneq ($(CURRENT_BRANCH),master)
 	$(error Please make the release from master branch\n)
 endif
 
+.PHONY: check-candidate
+check-candidate: ## Check if a release candidate has been made
+ifeq ($(CURRENT_VERSION),dev0)
+	$(error Please make a release candidate and test it before attempting a release)
+endif
+
 .PHONY: check-history
 check-history: ## Check if HISTORY.md has been modified
 ifeq ($(CHANGELOG_LINES),0)
@@ -245,7 +252,7 @@ ifeq ($(CHANGELOG_LINES),0)
 endif
 
 .PHONY: check-release
-check-release: check-clean check-master check-history ## Check if the release can be made
+check-release: check-candidate check-clean check-master check-history ## Check if the release can be made
 	@echo "A new release can be made"
 
 .PHONY: release
@@ -260,12 +267,6 @@ release-candidate: check-master publish bumpversion-candidate
 
 .PHONY: release-candidate-test
 release-candidate-test: check-clean check-master publish-test
 
-.PHONY: release-minor
-release-minor: check-release bumpversion-minor release
-
-.PHONY: release-major
-release-major: check-release bumpversion-major release
-
 
 # DOCKER TARGETS
 

From 9b57da2d0e133a1a1e05faf2aaee4d6048fcefbb Mon Sep 17 00:00:00 2001
From: Plamen Valentinov Kolev
Date: Fri, 22 Jan 2021 18:26:04 +0100
Subject: [PATCH 138/171] =?UTF-8?q?Bump=20version:=200.3.0.dev0=20?=
 =?UTF-8?q?=E2=86=92=200.3.0.dev1?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 greenguard/__init__.py | 2 +-
 setup.cfg              | 2 +-
 setup.py               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/greenguard/__init__.py b/greenguard/__init__.py
index 2a70d1f..f340385 100644
--- a/greenguard/__init__.py
+++ b/greenguard/__init__.py
@@ -4,7 +4,7 @@
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
 
-__version__ = '0.3.0.dev0'
+__version__ = '0.3.0.dev1'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index e2b3ca8..c1622f5 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0.dev0
+current_version = 0.3.0.dev1
 commit = True
 tag = True
 parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))?
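The ``check-candidate`` target added in the Makefile patch above makes ``check-release`` fail while ``current_version`` still ends in ``dev0``, i.e. before any release candidate has been published and tested. A rough Python equivalent of the same guard, assuming as in this repository that ``setup.cfg`` holds the ``[bumpversion]`` section, would be:

```python
# Rough sketch of the ``check-candidate`` guard in Python (assumes a
# setup.cfg with a [bumpversion] section, as in this repository).
import configparser

def check_candidate(setup_cfg='setup.cfg'):
    config = configparser.ConfigParser()
    config.read(setup_cfg)
    version = config['bumpversion']['current_version']
    # Mirrors `ifeq ($(CURRENT_VERSION),dev0)` in the Makefile target.
    if version.endswith('dev0'):
        raise RuntimeError(
            'Please make a release candidate and test it '
            'before attempting a release'
        )
```
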
diff --git a/setup.py b/setup.py index 1ade9c6..f24b19e 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.3.0.dev0', + version='0.3.0.dev1', zip_safe=False, ) From 32bbc48d68c6573990fec5d9fa164d0df9e1e0d5 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 22 Jan 2021 19:34:39 +0100 Subject: [PATCH 139/171] =?UTF-8?q?Bump=20version:=200.3.0.dev1=20?= =?UTF-8?q?=E2=86=92=200.3.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index f340385..dc3e138 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.3.0.dev1' +__version__ = '0.3.0' import os diff --git a/setup.cfg b/setup.cfg index c1622f5..32f7445 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.0.dev1 +current_version = 0.3.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index f24b19e..ad985e3 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.3.0.dev1', + version='0.3.0', zip_safe=False, ) From d8597f8adf9b579e28a650312616c4373f611cc8 Mon Sep 17 00:00:00 2001 From: Plamen Valentinov Kolev Date: Fri, 22 Jan 2021 20:06:49 +0100 Subject: [PATCH 140/171] =?UTF-8?q?Bump=20version:=200.3.0=20=E2=86=92=200?= =?UTF-8?q?.3.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- greenguard/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/greenguard/__init__.py b/greenguard/__init__.py index dc3e138..4d310c6 100644 --- a/greenguard/__init__.py +++ b/greenguard/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.3.0' +__version__ = '0.3.1.dev0' import os diff --git a/setup.cfg b/setup.cfg index 32f7445..ee6b598 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.3.0 +current_version = 0.3.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
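The version bumps in patches 138-140 walk one full release cycle: the candidate ``0.3.0.dev1`` is published and tested, the ``.devN`` suffix is dropped for the final ``0.3.0`` release, and the next development iteration opens as ``0.3.1.dev0``. A small illustrative sketch of that sequence (not taken from the repository):

```python
# Illustrative sketch of the version sequence in patches 138-140:
# X.Y.Z.devN (candidate) -> X.Y.Z (release) -> X.Y.(Z+1).dev0 (next dev).
def release_cycle(candidate):
    base, _, _ = candidate.partition('.dev')      # '0.3.0.dev1' -> '0.3.0'
    major, minor, patch = map(int, base.split('.'))
    next_dev = '{}.{}.{}.dev0'.format(major, minor, patch + 1)
    return base, next_dev

assert release_cycle('0.3.0.dev1') == ('0.3.0', '0.3.1.dev0')
```
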
diff --git a/setup.py b/setup.py index ad985e3..bed1432 100644 --- a/setup.py +++ b/setup.py @@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/D3-AI/GreenGuard', - version='0.3.0', + version='0.3.1.dev0', zip_safe=False, ) From 2c3588f4b6afa70c81ecda21c8ea3ede1b53ac3f Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Thu, 16 Sep 2021 16:07:10 -0400 Subject: [PATCH 141/171] Update dependencies (#58) * cap keras version * increase * let 2.5 be the cap --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bed1432..d457c01 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,7 @@ 'dask>=2.6.0,<3', 'distributed>=2.6.0,<3', 'h5py<2.11.0,>=2.10.0', # fix tensorflow requirement - 'Keras>=2.4', + 'Keras>=2.4,<2.5', 'tabulate>=0.8.3,<0.9', 'xlsxwriter>=1.3.6<1.4', ] From 93899cfc1363005281f790685bfce257db259fee Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 15 Nov 2021 09:12:03 -0500 Subject: [PATCH 142/171] Update dependencies (#60) * update mlprimitives --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d457c01..79eeb5b 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ install_requires = [ 'baytune>=0.4.0,<0.5', - 'mlprimitives>=0.3.0,<0.4', + 'mlprimitives>=0.3.2,<0.4', 'mlblocks>=0.4.0,<0.5', 'pymongo>=3.7.2,<4', 'scikit-learn>=0.21', @@ -31,7 +31,6 @@ 'dask>=2.6.0,<3', 'distributed>=2.6.0,<3', 'h5py<2.11.0,>=2.10.0', # fix tensorflow requirement - 'Keras>=2.4,<2.5', 'tabulate>=0.8.3,<0.9', 'xlsxwriter>=1.3.6<1.4', ] From 059e78db2b3eaba9f24289f771751c0f5f2a0868 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 13 Dec 2021 08:26:33 -0500 Subject: [PATCH 143/171] Change GreenGuard to Draco (#61) * change greenguard to draco * update badge to gh workflow * fix logo * revert docker settings * change version --- .github/ISSUE_TEMPLATE.md | 2 +- .gitignore | 2 +- CONTRIBUTING.rst | 32 ++-- DATABASE.md | 2 +- DATA_FORMAT.md | 12 +- MANIFEST.in | 2 +- Makefile | 30 ++-- README.md | 67 ++++---- docs/Makefile | 2 +- docs/advanced_usage/concepts.md | 8 +- docs/advanced_usage/csv.md | 8 +- docs/advanced_usage/docker.md | 44 ++--- docs/conf.py | 18 +- .../{GreenGuard-200.png => Draco-200.png} | Bin docs/images/{GreenGuard.ico => Draco.ico} | Bin docs/images/{GreenGuard.png => Draco.png} | Bin docs/index.rst | 4 +- docs/make.bat | 2 +- {greenguard => draco}/__init__.py | 8 +- {greenguard => draco}/benchmark.py | 29 ++-- {greenguard => draco}/db.py | 2 +- {greenguard => draco}/demo.py | 4 +- draco/loaders/__init__.py | 5 + {greenguard => draco}/loaders/csv.py | 2 +- {greenguard => draco}/metrics.py | 0 {greenguard => draco}/pipeline.py | 18 +- .../classes/normalize_dfs_xgb_classifier.json | 0 .../classes/unstack_dfs_xgb_classifier.json | 0 ...ack_double_lstm_timeseries_classifier.json | 0 .../unstack_lstm_timeseries_classifier.json | 0 .../unstack_normalize_dfs_xgb_classifier.json | 0 .../disabled/dfs_xgb_classifier.json | 0 .../normalize_dfs_xgb_classifier.json | 0 .../disabled/resample_dfs_xgb_classifier.json | 0 ...resample_normalize_dfs_xgb_classifier.json | 0 .../resample_unstack_dfs_xgb_classifier.json | 0 ...ack_double_lstm_timeseries_classifier.json | 0 ...le_unstack_lstm_timeseries_classifier.json | 0 ..._unstack_normalize_dfs_xgb_classifier.json | 0 
.../normalize_dfs_xgb_classifier.json | 0 .../unstack_dfs_xgb_classifier.json | 0 ...ack_double_lstm_timeseries_classifier.json | 0 .../unstack_lstm_timeseries_classifier.json | 0 .../unstack_normalize_dfs_xgb_classifier.json | 0 .../unstacked_dfs_xgb_classifier.json | 0 ...ked_double_lstm_timeseries_classifier.json | 0 .../unstacked_lstm_timeseries_classifier.json | 0 ...nstacked_normalize_dfs_xgb_classifier.json | 0 .../primitives/numpy.take.json | 0 .../xgboost.XGBClassifier:probabilities.json | 0 {greenguard => draco}/results.py | 0 {greenguard => draco}/targets.py | 0 {greenguard => draco}/utils.py | 0 greenguard/loaders/__init__.py | 5 - setup.cfg | 4 +- setup.py | 15 +- tests/test_benchmark.py | 6 +- tests/test_metrics.py | 2 +- tests/test_pipeline.py | 18 +- ....ipynb => 01_Draco_Machine_Learning.ipynb} | 72 ++++---- tutorials/02_Extract_Readings.ipynb | 156 +++++++++--------- tutorials/03_Benchmarking.ipynb | 38 ++--- .../normalize_dfs_xgb_classifier.ipynb | 8 +- ...ck_double_lstm_timeseries_classifier.ipynb | 8 +- .../unstack_lstm_timeseries_classifier.ipynb | 8 +- ...unstack_normalize_dfs_xgb_classifier.ipynb | 8 +- 66 files changed, 323 insertions(+), 328 deletions(-) rename docs/images/{GreenGuard-200.png => Draco-200.png} (100%) rename docs/images/{GreenGuard.ico => Draco.ico} (100%) rename docs/images/{GreenGuard.png => Draco.png} (100%) rename {greenguard => draco}/__init__.py (67%) rename {greenguard => draco}/benchmark.py (97%) rename {greenguard => draco}/db.py (97%) rename {greenguard => draco}/demo.py (95%) create mode 100644 draco/loaders/__init__.py rename {greenguard => draco}/loaders/csv.py (99%) rename {greenguard => draco}/metrics.py (100%) rename {greenguard => draco}/pipeline.py (97%) rename {greenguard => draco}/pipelines/classes/normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/classes/unstack_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/classes/unstack_double_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/classes/unstack_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/resample_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/probability/normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/probability/unstack_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/probability/unstack_double_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/probability/unstack_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/pipelines/unstacked/unstacked_dfs_xgb_classifier.json (100%) 
rename {greenguard => draco}/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json (100%) rename {greenguard => draco}/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json (100%) rename {greenguard => draco}/primitives/numpy.take.json (100%) rename {greenguard => draco}/primitives/xgboost.XGBClassifier:probabilities.json (100%) rename {greenguard => draco}/results.py (100%) rename {greenguard => draco}/targets.py (100%) rename {greenguard => draco}/utils.py (100%) delete mode 100644 greenguard/loaders/__init__.py rename tutorials/{01_GreenGuard_Machine_Learning.ipynb => 01_Draco_Machine_Learning.ipynb} (89%) diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md index dc283d2..8877ddd 100644 --- a/.github/ISSUE_TEMPLATE.md +++ b/.github/ISSUE_TEMPLATE.md @@ -1,4 +1,4 @@ -* GreenGuard version: +* Draco version: * Python version: * Operating System: diff --git a/.gitignore b/.gitignore index f0a4be1..fe2c47a 100644 --- a/.gitignore +++ b/.gitignore @@ -106,7 +106,7 @@ ENV/ # Vim .*.swp -greenguard/demo/ +draco/demo/ notebooks/ notebooks-private/ scripts/ diff --git a/CONTRIBUTING.rst b/CONTRIBUTING.rst index 9f83841..e97e89e 100644 --- a/CONTRIBUTING.rst +++ b/CONTRIBUTING.rst @@ -38,8 +38,8 @@ and "help wanted" is open to whoever wants to implement it. Write Documentation ~~~~~~~~~~~~~~~~~~~ -GreenGuard could always use more documentation, whether as part of the -official GreenGuard docs, in docstrings, or even on the web in blog posts, +Draco could always use more documentation, whether as part of the +official Draco docs, in docstrings, or even on the web in blog posts, articles, and such. Submit Feedback @@ -57,18 +57,18 @@ If you are proposing a feature: Get Started! ------------ -Ready to contribute? Here's how to set up `GreenGuard` for local development. +Ready to contribute? Here's how to set up `Draco` for local development. -1. Fork the `GreenGuard` repo on GitHub. +1. Fork the `Draco` repo on GitHub. 2. Clone your fork locally:: - $ git clone git@github.com:your_name_here/GreenGuard.git + $ git clone git@github.com:your_name_here/Draco.git 3. Install your local copy into a virtualenv. Assuming you have virtualenvwrapper installed, this is how you set up your fork for local development:: - $ mkvirtualenv GreenGuard - $ cd GreenGuard/ + $ mkvirtualenv Draco + $ cd Draco/ $ make install-develop 4. Create a branch for local development:: @@ -133,9 +133,9 @@ All the Unit Tests should comply with the following requirements: 1. Unit Tests should be based only in unittest and pytest modules. -2. The tests that cover a module called ``greenguard/path/to/a_module.py`` +2. The tests that cover a module called ``draco/path/to/a_module.py`` should be implemented in a separated module called - ``tests/greenguard/path/to/test_a_module.py``. + ``tests/draco/path/to/test_a_module.py``. Note that the module name has the ``test_`` prefix and is located in a path similar to the one of the tested module, just inside the ``tests`` folder. @@ -165,7 +165,7 @@ Tips To run a subset of tests:: - $ python -m pytest tests.test_greenguard + $ python -m pytest tests.test_draco $ python -m pytest -k 'foo' Release Workflow @@ -175,11 +175,11 @@ The process of releasing a new version involves several steps combining both ``g ``bumpversion`` which, briefly: 1. Merge what is in ``master`` branch into ``stable`` branch. -2. 
Update the version in ``setup.cfg``, ``greenguard/__init__.py`` and
+2. Update the version in ``setup.cfg``, ``draco/__init__.py`` and
    ``HISTORY.md`` files.
 3. Create a new git tag pointing at the corresponding commit in ``stable`` branch.
 4. Merge the new commit from ``stable`` into ``master``.
-5. Update the version in ``setup.cfg`` and ``greenguard/__init__.py``
+5. Update the version in ``setup.cfg`` and ``draco/__init__.py``
    to open the next development iteration.

 .. note:: Before starting the process, make sure that ``HISTORY.md`` has been updated with a new
@@ -223,15 +223,15 @@ dependency specification, either in ``setup.py``::

     install_requires = [
         ...
-        'greenguard>=X.Y.Z.dev',
+        'draco>=X.Y.Z.dev',
         ...
     ]

 or in command line::

-    pip install 'greenguard>=X.Y.Z.dev'
+    pip install 'draco>=X.Y.Z.dev'

-.. _GitHub issues page: https://github.com/D3-AI/GreenGuard/issues
-.. _Travis Build Status page: https://travis-ci.org/D3-AI/GreenGuard/pull_requests
+.. _GitHub issues page: https://github.com/sintel-dev/Draco/issues
+.. _Travis Build Status page: https://travis-ci.org/sintel-dev/Draco/pull_requests
 .. _Google docstrings style: https://google.github.io/styleguide/pyguide.html?showone=Comments#Comments

diff --git a/DATABASE.md b/DATABASE.md
index 45cfd38..d3ef8f4 100644
--- a/DATABASE.md
+++ b/DATABASE.md
@@ -1,6 +1,6 @@
 # Database Schema

-The **GreenGuard Database** contains the following collections and relationships:
+The **Draco Database** contains the following collections and relationships:

 * Farm
 * Turbine

diff --git a/DATA_FORMAT.md b/DATA_FORMAT.md
index b0fc5a3..7354461 100644
--- a/DATA_FORMAT.md
+++ b/DATA_FORMAT.md
@@ -1,8 +1,8 @@
-# GreenGuard Data Format
+# Draco Data Format

 ## Input

-The minimum input expected by the **GreenGuard** system consists of the following two elements,
+The minimum input expected by the **Draco** system consists of the following two elements,
 which need to be passed as `pandas.DataFrame` objects:

 ### Target Times

@@ -60,7 +60,7 @@ an arbitrary number of additional fields.

 ## CSV Format

-As explained in a previous section, the input expected by the **GreenGuard** system consists of
+As explained in a previous section, the input expected by the **Draco** system consists of
 two tables which need to be passed as `pandas.DataFrame` objects:

 * The `target_times` table, which contains the specification of the problem that we are solving
 * The `readings` table, which contains the signal data from the different sensors, with the
   `turbine_id`, `signal_id`, `timestamp` and `value` fields.

 However, in most scenarios the size of the available data will far exceed the memory limitations
-of the system on which **GreenGuard** is being run, so loading all the data in a single
+of the system on which **Draco** is being run, so loading all the data in a single
 `pandas.DataFrame` will not be possible.

-In order to solve this situation, **GreenGuard** provides a [CSVLoader](
-https://d3-ai.github.io/GreenGuard/api/greenguard.loaders.csv.html#greenguard.loaders.csv.CSVLoader)
+In order to solve this situation, **Draco** provides a [CSVLoader](
+https://sintel-dev.github.io/Draco/api/draco.loaders.csv.html#draco.loaders.csv.CSVLoader)
 class which can be used to load data from what we call the **Raw Data Format**.
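To make the `CSVLoader` workflow concrete, here is a minimal sketch of how it is used in the project tutorials. The `readings` folder name, the pre-loaded `target_times` variable and the `'1d'` window size are illustrative assumptions, and the exact signature may differ between versions:

```python3
# A minimal sketch, assuming a `readings` folder in the Raw Data Format and a
# `target_times` DataFrame already loaded in memory (both are assumptions).
from draco.loaders import CSVLoader

csv_loader = CSVLoader('readings')

# Extract only the readings needed to cover each target, using a hypothetical
# 1-day window before every cutoff time; targets without enough readings are
# dropped from the returned target_times.
target_times, readings = csv_loader.load(target_times, '1d')
```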
### Raw Data Format

diff --git a/MANIFEST.in b/MANIFEST.in
index 4ebe1c6..0669023 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -4,7 +4,7 @@ include HISTORY.md
 include LICENSE
 include README.md

-recursive-include greenguard *.json
+recursive-include draco *.json
 recursive-include tests *
 recursive-exclude * __pycache__

diff --git a/Makefile b/Makefile
index 9a69d1a..a6ad0e5 100644
--- a/Makefile
+++ b/Makefile
@@ -89,10 +89,10 @@ install-minimum: ## install the minimum supported versions of the package depend

 # LINT TARGETS

-.PHONY: lint-greenguard
+.PHONY: lint-draco
-lint-btb: ## check style with flake8 and isort
-	flake8 greenguard
-	isort -c --recursive greenguard
+lint-draco: ## check style with flake8 and isort
+	flake8 draco
+	isort -c --recursive draco

 .PHONY: lint-tests
 lint-tests: ## check style with flake8 and isort

@@ -104,19 +104,19 @@ check-dependencies: ## test if there are any broken dependencies
 	pip check

 .PHONY: lint
-lint: check-dependencies lint-greenguard lint-tests ## Run all code style and static testing validations
+lint: check-dependencies lint-draco lint-tests ## Run all code style and static testing validations

 .PHONY: fix-lint
 fix-lint: ## fix lint issues using autoflake, autopep8, and isort
-	find greenguard -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables
-	autopep8 --in-place --recursive --aggressive greenguard
-	isort --apply --atomic --recursive greenguard tests
+	find draco -name '*.py' | xargs autoflake --in-place --remove-all-unused-imports --remove-unused-variables
+	autopep8 --in-place --recursive --aggressive draco
+	isort --apply --atomic --recursive draco tests

 # TEST TARGETS

 .PHONY: test-unit
 test-unit: ## run tests quickly with the default Python
-	python -m pytest --cov=greenguard
+	python -m pytest --cov=draco

 .PHONY: test-readme
 test-readme: ## run the readme snippets

@@ -141,7 +141,7 @@ test-all: ## run tests on every Python version with tox

 .PHONY: coverage
 coverage: ## check code coverage quickly with the default Python
-	coverage run --source greenguard -m pytest
+	coverage run --source draco -m pytest
 	coverage report -m
 	coverage html
 	$(BROWSER) htmlcov/index.html

@@ -272,7 +272,7 @@ release-candidate-test: check-clean check-master publish-test

 .PHONY: docker-build
 docker-build:
-	docker build -f docker/Dockerfile -t greenguard .
+	docker build -f docker/Dockerfile -t draco .

 .PHONY: docker-login
 docker-login:

@@ -280,8 +280,8 @@ docker-login:

 .PHONY: docker-push
 docker-push: docker-login docker-build
-	@$(eval VERSION := $(shell python -c 'import greenguard; print(greenguard.__version__)'))
-	docker tag greenguard signalsdev/greenguard:$(VERSION)
-	docker push signalsdev/greenguard:$(VERSION)
-	docker tag greenguard signalsdev/greenguard
-	docker push signalsdev/greenguard
+	@$(eval VERSION := $(shell python -c 'import draco; print(draco.__version__)'))
+	docker tag draco signalsdev/draco:$(VERSION)
+	docker push signalsdev/draco:$(VERSION)
+	docker tag draco signalsdev/draco
+	docker push signalsdev/draco

diff --git a/README.md b/README.md
index 0472817..70eb0fe 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@

-GreenGuard +Draco

@@ -12,23 +12,23 @@ AutoML for Renewable Energy Industries.

-[![PyPI Shield](https://img.shields.io/pypi/v/greenguard.svg)](https://pypi.python.org/pypi/greenguard)
-[![Travis CI Shield](https://travis-ci.org/signals-dev/GreenGuard.svg?branch=master)](https://travis-ci.org/signals-dev/GreenGuard)
-[![Downloads](https://pepy.tech/badge/greenguard)](https://pepy.tech/project/greenguard)
-[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/signals-dev/GreenGuard/master?filepath=tutorials)
+[![PyPI Shield](https://img.shields.io/pypi/v/draco-ml.svg)](https://pypi.python.org/pypi/draco-ml)
+[![Tests](https://github.com/sintel-dev/Draco/workflows/Run%20Tests/badge.svg)](https://github.com/sintel-dev/Draco/actions?query=workflow%3A%22Run+Tests%22+branch%3Amaster)
+[![Downloads](https://pepy.tech/badge/draco-ml)](https://pepy.tech/project/draco-ml)
+[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/sintel-dev/Draco/master?filepath=tutorials)

-# GreenGuard
+# Draco

-- License: [MIT](https://github.com/signals-dev/GreenGuard/blob/master/LICENSE)
-- Documentation: https://signals-dev.github.io/GreenGuard
-- Homepage: https://github.com/signals-dev/GreenGuard
+- License: [MIT](https://github.com/sintel-dev/Draco/blob/master/LICENSE)
+- Documentation: https://sintel-dev.github.io/Draco
+- Homepage: https://github.com/sintel-dev/Draco

 ## Overview

-The GreenGuard project is a collection of end-to-end solutions for machine learning problems
+The Draco project is a collection of end-to-end solutions for machine learning problems
 commonly found in monitoring wind energy production systems. Most tasks utilize sensor data
 emanating from monitoring systems. We utilize the foundational innovations developed for
 automation of machine learning at Data to AI Lab at MIT.

@@ -47,40 +47,35 @@ The salient aspects of this customized project are:

 ## Resources

 * [Data Format](DATA_FORMAT.md).
-* [GreenGuard folder structure](DATA_FORMAT.md#folder-structure).
+* [Draco folder structure](DATA_FORMAT.md#folder-structure).

 # Install

 ## Requirements

-**GreenGuard** has been developed and runs on Python 3.6, 3.7 and 3.8.
+**Draco** has been developed and runs on Python 3.6, 3.7 and 3.8.

 Also, although it is not strictly required, the usage of a [virtualenv](
 https://virtualenv.pypa.io/en/latest/) is highly recommended in order to avoid interfering
-with other software installed in the system where you are trying to run **GreenGuard**.
+with other software installed in the system where you are trying to run **Draco**.

 ## Download and Install

-**GreenGuard** can be installed locally using [pip](https://pip.pypa.io/en/stable/) with
+**Draco** can be installed locally using [pip](https://pip.pypa.io/en/stable/) with
 the following command:

 ```bash
-pip install greenguard
+pip install draco-ml
 ```

 This will pull and install the latest stable release from [PyPI](https://pypi.org/).

 If you want to install from source or contribute to the project please read the
-[Contributing Guide](https://signals-dev.github.io/GreenGuard/contributing.html#get-started).
-
-## Docker usage
-
-**GreenGuard** is prepared to be run inside a docker environment. Please check the
-[docker documentation](docker/README.md) for details about how to run **GreenGuard** using docker.
+[Contributing Guide](https://sintel-dev.github.io/Draco/contributing.html#get-started).
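As a quick sanity check after installing, the package can be imported and its version printed; a minimal sketch, where the version string simply reflects whichever release was installed:

```python3
# Verify the installation: the distribution is named draco-ml, but the
# importable package is draco.
import draco

print(draco.__version__)  # e.g. '0.0.1.dev0' for this development release
```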
# Data Format

-The minimum input expected by the **GreenGuard** system consists of the following two elements,
+The minimum input expected by the **Draco** system consists of the following two elements,
 which need to be passed as `pandas.DataFrame` objects:

 ## Target Times

@@ -138,23 +133,23 @@ an arbitrary number of additional fields.

 ## CSV Format

 Apart from the in-memory data format explained above, which is limited by the memory
-allocation capabilities of the system where it is run, **GreenGuard** is also prepared to
+allocation capabilities of the system where it is run, **Draco** is also prepared to
 load and work with data stored as a collection of CSV files, drastically increasing the amount
 of data which it can work with. Further details about this format can be found in the
 [project documentation site](DATA_FORMAT.md#csv-format).

 # Quickstart

-In this example we will load some demo data and classify it using a **GreenGuard Pipeline**.
+In this example we will load some demo data and classify it using a **Draco Pipeline**.

 ## 1. Load and split the demo data

 The first step is to load the demo data.

-For this, we will import and call the `greenguard.demo.load_demo` function without any arguments:
+For this, we will import and call the `draco.demo.load_demo` function without any arguments:

 ```python3
-from greenguard.demo import load_demo
+from draco.demo import load_demo

 target_times, readings = load_demo()
 ```

@@ -212,17 +207,17 @@ test_targets = test.pop('target')

 Once we have the data ready, we need to find a suitable pipeline.

-The list of available GreenGuard Pipelines can be obtained using the `greenguard.get_pipelines`
+The list of available Draco Pipelines can be obtained using the `draco.get_pipelines`
 function.

 ```python3
-from greenguard import get_pipelines
+from draco import get_pipelines

 pipelines = get_pipelines()
 ```

 The returned `pipeline` variable will be a `list` containing the names of all the pipelines
-available in the GreenGuard system:
+available in the Draco system:

 ```
 ['classes.unstack_double_lstm_timeseries_classifier',

@@ -244,13 +239,13 @@ pipeline_name = 'classes.normalize_dfs_xgb_classifier'

 Once we have loaded the data and selected the pipeline that we will use, we have to fit it.

-For this, we will create an instance of a `GreenGuardPipeline` object passing the name
+For this, we will create an instance of a `DracoPipeline` object passing the name
 of the pipeline that we want to use:

 ```python3
-from greenguard.pipeline import GreenGuardPipeline
+from draco.pipeline import DracoPipeline

-pipeline = GreenGuardPipeline(pipeline_name)
+pipeline = DracoPipeline(pipeline_name)
 ```

 And then we can directly fit it to our data by calling its `fit` method and passing in the

@@ -283,7 +278,7 @@ f1_score(test_targets, predictions)

 ## What's next?

-For more details about **GreenGuard** and all its possibilities and features, please check the
-[project documentation site](https://signals-dev.github.io/GreenGuard/)
+For more details about **Draco** and all its possibilities and features, please check the
+[project documentation site](https://sintel-dev.github.io/Draco/)

 Also do not forget to have a look at the [tutorials](
-https://github.com/signals-dev/GreenGuard/tree/master/tutorials)!
+https://github.com/sintel-dev/Draco/tree/master/tutorials)!

diff --git a/docs/Makefile b/docs/Makefile
index 5c762a7..e2106b7 100644
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -4,7 +4,7 @@

 # You can set these variables from the command line.
 SPHINXOPTS    =
 SPHINXBUILD   = python -msphinx
-SPHINXPROJ    = greenguard
+SPHINXPROJ    = draco
 SOURCEDIR     = .
 BUILDDIR      = _build

diff --git a/docs/advanced_usage/concepts.md b/docs/advanced_usage/concepts.md
index f39bffa..2489b44 100644
--- a/docs/advanced_usage/concepts.md
+++ b/docs/advanced_usage/concepts.md
@@ -1,6 +1,6 @@
 # Concepts

-Here we briefly explain some of the concepts and terminology used within the GreenGuard
+Here we briefly explain some of the concepts and terminology used within the Draco
 project and documentation.

 ## Primitive

@@ -34,7 +34,7 @@ hyperparameters for a Template. Hence, Pipelines:
 hyperparameters of their template.

 A pipeline can be fitted and evaluated directly using [MLBlocks](
-https://hdi-project.github.io/MLBlocks), or using the **GreenGuardPipeline**.
+https://MLBazaar.github.io/MLBlocks), or using the **DracoPipeline**.

 ## Tuning

@@ -48,11 +48,11 @@ which hyperparameters are more likely to get the best results in the next iteration.
 We call each one of these evaluations a **tuning iteration**.

 The process of selecting and tuning the templates is handled by a [BTBSession](
-https://hdi-project.github.io/BTB/tutorials/03_Session.html), which is responsible for
+https://MLBazaar.github.io/BTB/tutorials/03_Session.html), which is responsible for
 discarding the templates that do not work on the given data and for keeping track
 of the template and hyperparameters that obtain the best performance.

-## GreenGuardPipeline
+## DracoPipeline

 This class is the one in charge of loading the **MLBlocks Pipelines**
 configured in the system and using them to learn from the data and make predictions.

diff --git a/docs/advanced_usage/csv.md b/docs/advanced_usage/csv.md
index c267807..eab0c19 100644
--- a/docs/advanced_usage/csv.md
+++ b/docs/advanced_usage/csv.md
@@ -1,6 +1,6 @@
 # CSV Format

-As explained in a previous section, the input expected by the **GreenGuard** system consists of
+As explained in a previous section, the input expected by the **Draco** system consists of
 two tables which need to be passed as `pandas.DataFrame` objects:

 * The `target_times` table, which contains the specification of the problem that we are solving

@@ -9,11 +9,11 @@ two tables which need to be passed as `pandas.DataFrame` objects:
   `turbine_id`, `signal_id`, `timestamp` and `value` fields.

 However, in most scenarios the size of the available data will far exceed the memory limitations
-of the system on which **GreenGuard** is being run, so loading all the data in a single
+of the system on which **Draco** is being run, so loading all the data in a single
 `pandas.DataFrame` will not be possible.

-In order to solve this situation, **GreenGuard** provides a [CSVLoader](
-https://d3-ai.github.io/GreenGuard/api/greenguard.loaders.csv.html#greenguard.loaders.csv.CSVLoader)
+In order to solve this situation, **Draco** provides a [CSVLoader](
+https://sintel-dev.github.io/Draco/api/draco.loaders.csv.html#draco.loaders.csv.CSVLoader)
 class which can be used to load data from what we call the **Raw Data Format**.
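The tutorials also show the `CSVLoader` resampling the readings while they are being loaded, which is what keeps larger-than-memory datasets tractable. A minimal sketch, assuming the `rule` and `aggregation` arguments that appear in the tutorial logs (`Resampling: 4h - mean`); argument names may differ between versions:

```python3
# Sketch of loading with time-based resampling: readings are aggregated into
# 4-hour buckets using the mean, drastically reducing the number of rows.
from draco.loaders import CSVLoader

csv_loader = CSVLoader(
    'readings',          # folder in the Raw Data Format (assumed to exist)
    rule='4h',           # resampling rule, as a pandas offset alias
    aggregation='mean',  # aggregation applied within each bucket
)

# The '14d' window size is a hypothetical choice for this sketch.
target_times, readings = csv_loader.load(target_times, '14d')
```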
## Raw Data Format

diff --git a/docs/advanced_usage/docker.md b/docs/advanced_usage/docker.md
index e5603df..a2cb198 100644
--- a/docs/advanced_usage/docker.md
+++ b/docs/advanced_usage/docker.md
@@ -1,34 +1,34 @@
 # Docker Usage

-**GreenGuard** comes configured and ready to be distributed and run as a docker image which starts
-a jupyter notebook already configured to use greenguard, with all the required dependencies already
+**Draco** comes configured and ready to be distributed and run as a docker image which starts
+a jupyter notebook already configured to use draco, with all the required dependencies already
 installed.

 ## Requirements

-The only requirement in order to run the GreenGuard Docker image is to have Docker installed and
+The only requirement in order to run the Draco Docker image is to have Docker installed and
 that the user has enough permissions to run it.

 Installation instructions for any compatible system can be found [here](https://docs.docker.com/install/)

-Additionally, the system that builds the GreenGuard Docker image will also need to have a working
+Additionally, the system that builds the Draco Docker image will also need to have a working
 internet connection that allows downloading the base image and the additional python dependencies.

-## Building the GreenGuard Docker Image
+## Building the Draco Docker Image

-After having cloned the **GreenGuard** repository, all you have to do in order to build the GreenGuard Docker
+After having cloned the **Draco** repository, all you have to do in order to build the Draco Docker
 Image is running this command:

 ```bash
 make docker-jupyter-build
 ```

-After a few minutes, the new image, called `greenguard-jupyter`, will have been built into the system
+After a few minutes, the new image, called `draco-jupyter`, will have been built into the system
 and will be ready to be used or distributed.

-## Distributing the GreenGuard Docker Image
+## Distributing the Draco Docker Image

-Once the `greenguard-jupyter` image is built, it can be distributed in several ways.
+Once the `draco-jupyter` image is built, it can be distributed in several ways.

 ### Distributing using a Docker registry

@@ -38,7 +38,7 @@ In order to do so, we will need to have write access to a public or private registry
 [login](https://docs.docker.com/engine/reference/commandline/login/)!) and execute these commands:

 ```bash
-docker tag greenguard-jupyter:latest your-registry-name:some-tag
+docker tag draco-jupyter:latest your-registry-name:some-tag
 docker push your-registry-name:some-tag
 ```

 Afterwards, in the receiving machine:

 ```bash
 docker pull your-registry-name:some-tag
-docker tag your-registry-name:some-tag greenguard-jupyter:latest
+docker tag your-registry-name:some-tag draco-jupyter:latest
 ```

 ### Distributing as a file

@@ -57,28 +57,28 @@ using the following command.

 In the system that already has the image:

 ```bash
-docker save --output greenguard-jupyter.tar greenguard-jupyter
+docker save --output draco-jupyter.tar draco-jupyter
 ```

-Then copy over the file `greenguard-jupyter.tar` to the new system and there, run:
+Then copy over the file `draco-jupyter.tar` to the new system and there, run:

 ```bash
-docker load --input greenguard-jupyter.tar
+docker load --input draco-jupyter.tar
 ```

-After these commands, the `greenguard-jupyter` image should be available and ready to be used in the
+After these commands, the `draco-jupyter` image should be available and ready to be used in the
 new system.
-## Running the greenguard-jupyter image +## Running the draco-jupyter image -Once the `greenguard-jupyter` image has been built, pulled or loaded, it is ready to be run. +Once the `draco-jupyter` image has been built, pulled or loaded, it is ready to be run. This can be done in two ways: -### Running greenguard-jupyter with the code +### Running draco-jupyter with the code -If the GreenGuard source code is available in the system, running the image is as simple as running +If the Draco source code is available in the system, running the image is as simple as running this command from within the root of the project: ```bash @@ -93,13 +93,13 @@ which means that any changes that you do in your local code will immediately be within your notebooks, and that any notebook that you create within jupyter will also show up in your `notebooks` folder! -### Running greenguard-jupyter without the greenguard code +### Running draco-jupyter without the draco code -If the GreenGuard source code is not available in the system and only the Docker Image is, you can +If the Draco source code is not available in the system and only the Docker Image is, you can still run the image by using this command: ```bash -docker run -ti -p8888:8888 greenguard-jupyter +docker run -ti -p8888:8888 draco-jupyter ``` In this case, the code changes and the notebooks that you create within jupyter will stay diff --git a/docs/conf.py b/docs/conf.py index 9e23c07..ecd0023 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# GreenGuard documentation build configuration file, created by +# Draco documentation build configuration file, created by # sphinx-quickstart on Fri Jun 9 13:47:02 2017. # # This file is execfile()d with the current directory set to its @@ -20,7 +20,7 @@ import sphinx_rtd_theme # For read the docs theme -import greenguard +import draco # -- General configuration --------------------------------------------- @@ -58,22 +58,22 @@ nbsphinx_execute = 'never' # General information about the project. -project = 'GreenGuard' -slug = 'greenguard' +project = 'Draco' +slug = 'draco' title = project + ' Documentation', copyright = '2018, MIT Data To AI Lab' author = 'MIT Data To AI Lab' description = 'AutoML for Renewable Energy Industries' -user = 'D3-AI' +user = 'sintel-dev' # The version info for the project you're documenting, acts as replacement # for |version| and |release|, also used in various other places throughout # the built documents. # # The short X.Y version. -version = greenguard.__version__ +version = draco.__version__ # The full version, including alpha/beta/rc tags. -release = greenguard.__version__ +release = draco.__version__ # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. @@ -127,13 +127,13 @@ # the docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. # html_favicon = 'images/favicon.ico' -html_favicon = 'images/GreenGuard.ico' +html_favicon = 'images/Draco.ico' # If given, this must be the name of an image file (path relative to the # configuration directory) that is the logo of the docs. It is placed at # the top of the sidebar; its width should therefore not exceed 200 pixels. 
# html_logo = 'images/dai-logo.png'
-html_logo = 'images/GreenGuard-200.png'
+html_logo = 'images/Draco-200.png'

 # -- Options for HTMLHelp output ---------------------------------------

diff --git a/docs/images/GreenGuard-200.png b/docs/images/Draco-200.png
similarity index 100%
rename from docs/images/GreenGuard-200.png
rename to docs/images/Draco-200.png
diff --git a/docs/images/GreenGuard.ico b/docs/images/Draco.ico
similarity index 100%
rename from docs/images/GreenGuard.ico
rename to docs/images/Draco.ico
diff --git a/docs/images/GreenGuard.png b/docs/images/Draco.png
similarity index 100%
rename from docs/images/GreenGuard.png
rename to docs/images/Draco.png
diff --git a/docs/index.rst b/docs/index.rst
index dad6c5f..75b0cdb 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -10,7 +10,7 @@
     :caption: Tutorials
     :hidden:

-    tutorials/01_GreenGuard_Quickstart
+    tutorials/01_Draco_Quickstart
     tutorials/02_Extract_Readings

 .. toctree::

@@ -25,7 +25,7 @@
     :caption: Resources
     :hidden:

-    API Reference <api/greenguard>
+    API Reference <api/draco>
     contributing
     authors
     history

diff --git a/docs/make.bat b/docs/make.bat
index ad6474a..b427863 100644
--- a/docs/make.bat
+++ b/docs/make.bat
@@ -9,7 +9,7 @@ if "%SPHINXBUILD%" == "" (
 )
 set SOURCEDIR=.
 set BUILDDIR=_build
-set SPHINXPROJ=greenguard
+set SPHINXPROJ=draco

 if "%1" == "" goto help

diff --git a/greenguard/__init__.py b/draco/__init__.py
similarity index 67%
rename from greenguard/__init__.py
rename to draco/__init__.py
index 4d310c6..885fac1 100644
--- a/greenguard/__init__.py
+++ b/draco/__init__.py
@@ -1,14 +1,14 @@
 # -*- coding: utf-8 -*-

-"""Top-level package for GreenGuard."""
+"""Top-level package for Draco."""

 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.3.1.dev0'
+__version__ = '0.0.1.dev0'

 import os

-from greenguard.pipeline import GreenGuardPipeline, get_pipelines
+from draco.pipeline import DracoPipeline, get_pipelines

 _BASE_PATH = os.path.abspath(os.path.dirname(__file__))
 MLBLOCKS_PIPELINES = os.path.join(_BASE_PATH, 'pipelines')

@@ -16,6 +16,6 @@

 __all__ = (
-    'GreenGuardPipeline',
+    'DracoPipeline',
     'get_pipelines',
 )

diff --git a/greenguard/benchmark.py b/draco/benchmark.py
similarity index 97%
rename from greenguard/benchmark.py
rename to draco/benchmark.py
index f638138..712bc6e 100644
--- a/greenguard/benchmark.py
+++ b/draco/benchmark.py
@@ -14,13 +14,12 @@
 from sklearn.model_selection import train_test_split
 from tqdm import tqdm

-from greenguard import get_pipelines
-from greenguard.demo import load_demo
-from greenguard.loaders import CSVLoader
-from greenguard.metrics import (METRICS, accuracy_score, f1_score,
-                                fpr_score, tpr_score, threshold_score)
-from greenguard.pipeline import GreenGuardPipeline, generate_init_params, generate_preprocessing
-from greenguard.results import load_results, write_results
+from draco import get_pipelines
+from draco.demo import load_demo
+from draco.loaders import CSVLoader
+from draco.metrics import METRICS, accuracy_score, f1_score, fpr_score, tpr_score, threshold_score
+from draco.pipeline import DracoPipeline, generate_init_params, generate_preprocessing
+from draco.results import load_results, write_results

 LOGGER = logging.getLogger(__name__)

@@ -134,7 +133,7 @@ def evaluate_template(
         metric (function or str):
             Metric to use. If an ``str`` is given it must be one of the metrics
-            defined in the ``greenguard.metrics.METRICS`` dictionary.
+            defined in the ``draco.metrics.METRICS`` dictionary.
         tuning_iterations (int):
             Number of iterations to be used.
         preprocessing (int, list or dict):

@@ -164,7 +163,7 @@ def evaluate_template(

     train, test = train_test_split(target_times, test_size=test_size, random_state=random_state)

-    pipeline = GreenGuardPipeline(
+    pipeline = DracoPipeline(
         template,
         metric=tuning_metric,
         cost=cost,

@@ -286,7 +285,7 @@ def evaluate_templates(
             List of tuples (int, str or Timedelta object).
         metric (function or str):
             Metric to use. If an ``str`` is given it must be one of the metrics
-            defined in the ``greenguard.metrics.METRICS`` dictionary.
+            defined in the ``draco.metrics.METRICS`` dictionary.
         tuning_iterations (int):
             Number of iterations to be used.
         init_params (dict):

@@ -625,7 +624,7 @@ def run_benchmark(templates, problems, window_size_resample_rule=None,
             Defaults to ``None``.
         metric (function or str):
             Metric to use. If an ``str`` is given it must be one of the metrics
-            defined in the ``greenguard.metrics.METRICS`` dictionary.
+            defined in the ``draco.metrics.METRICS`` dictionary.
         cost (bool):
             Whether the metric is a cost function (the lower the better) or not.
             Defaults to ``False``.

@@ -826,13 +825,13 @@ def _make_problems(args):

 def _get_parser():
-    parser = argparse.ArgumentParser(description='GreenGuard Benchmark Command Line Interface.')
+    parser = argparse.ArgumentParser(description='Draco Benchmark Command Line Interface.')
     parser.set_defaults(action=None)
     action = parser.add_subparsers(title='action')
     action.required = True

     # Run action
-    run = action.add_parser('run', help='Run the GreenGuard Benchmark')
+    run = action.add_parser('run', help='Run the Draco Benchmark')
     run.set_defaults(action=_run)
     run.set_defaults(user=None)

@@ -878,13 +877,13 @@ def _get_parser():

     # Summarize action
     summary = action.add_parser('summarize-results',
-                                help='Summarize the GreenGuard Benchmark results')
+                                help='Summarize the Draco Benchmark results')
     summary.set_defaults(action=_summarize_results)
     summary.add_argument('input', nargs='+', help='Input path with results.')
     summary.add_argument('output', help='Output file.')

     # Make problems action
-    problems = action.add_parser('make-problems', help='Create GreenGuard problems')
+    problems = action.add_parser('make-problems', help='Create Draco problems')
     problems.set_defaults(action=_make_problems)
     problems.add_argument('target-times-paths', nargs='+', help='List of target times paths.')
     problems.add_argument('readings-path', type=str, help='Path to the readings folder.')

diff --git a/greenguard/db.py b/draco/db.py
similarity index 97%
rename from greenguard/db.py
rename to draco/db.py
index 99da2fd..053e01b 100644
--- a/greenguard/db.py
+++ b/draco/db.py
@@ -7,7 +7,7 @@

 from pymongo import MongoClient

-from greenguard.utils import remove_dots, restore_dots
+from draco.utils import remove_dots, restore_dots

 LOGGER = logging.getLogger(__name__)

diff --git a/greenguard/demo.py b/draco/demo.py
similarity index 95%
rename from greenguard/demo.py
rename to draco/demo.py
index 429e0e9..f7848e5 100644
--- a/greenguard/demo.py
+++ b/draco/demo.py
@@ -28,10 +28,10 @@

 def load_demo(load_readings=True):
-    """Load the demo included in the GreenGuard project.
+    """Load the demo included in the Draco project.

     The first time that this function is executed, the data will be downloaded
-    and cached inside the `greenguard/demo` folder.
+    and cached inside the `draco/demo` folder.
     Subsequent calls will load the cached data instead of downloading it again.
Returns: diff --git a/draco/loaders/__init__.py b/draco/loaders/__init__.py new file mode 100644 index 0000000..5f25f4c --- /dev/null +++ b/draco/loaders/__init__.py @@ -0,0 +1,5 @@ +from draco.loaders.csv import CSVLoader + +__all__ = ( + 'CSVLoader', +) diff --git a/greenguard/loaders/csv.py b/draco/loaders/csv.py similarity index 99% rename from greenguard/loaders/csv.py rename to draco/loaders/csv.py index 97d33ee..d50229c 100644 --- a/greenguard/loaders/csv.py +++ b/draco/loaders/csv.py @@ -4,7 +4,7 @@ import dask import pandas as pd -from greenguard.targets import drop_duplicates, select_valid_targets +from draco.targets import drop_duplicates, select_valid_targets LOGGER = logging.getLogger(__name__) diff --git a/greenguard/metrics.py b/draco/metrics.py similarity index 100% rename from greenguard/metrics.py rename to draco/metrics.py diff --git a/greenguard/pipeline.py b/draco/pipeline.py similarity index 97% rename from greenguard/pipeline.py rename to draco/pipeline.py index 2a9cd84..b50567b 100644 --- a/greenguard/pipeline.py +++ b/draco/pipeline.py @@ -20,7 +20,7 @@ from sklearn.exceptions import NotFittedError from sklearn.model_selection import KFold, StratifiedKFold -from greenguard.metrics import METRICS +from draco.metrics import METRICS LOGGER = logging.getLogger(__name__) @@ -152,11 +152,11 @@ def generate_preprocessing(templates_names, preprocessing): SELF_THRESHOLD = object() -class GreenGuardPipeline(object): - """Main Machine Learning component in the GreenGuard project. +class DracoPipeline(object): + """Main Machine Learning component in the Draco project. - The ``GreenGuardPipeline`` represents the abstraction of a Machine - Learning pipeline architecture specialized on the GreenGuard data + The ``DracoPipeline`` represents the abstraction of a Machine + Learning pipeline architecture specialized on the Draco data format. In order to use it, an MLBlocks pipeline template needs to be given, @@ -169,7 +169,7 @@ class GreenGuardPipeline(object): template_name: Name of the template being used. fitted (bool): - Whether this GreenGuardPipeline has already been fitted or not. + Whether this DracoPipeline has already been fitted or not. steps (list): List of primitives that compose this template. preprocessing (list): @@ -192,7 +192,7 @@ class GreenGuardPipeline(object): ``MLPipeline``. Also can be a list combining both. metric (str or function): Metric to use. If an ``str`` is given it must be one of the metrics - defined in the ``greenguard.metrics.METRICS`` dictionary. + defined in the ``draco.metrics.METRICS`` dictionary. cost (bool): Whether the metric is a cost function (the lower the better) or not. Defaults to ``False``. @@ -622,8 +622,8 @@ def load(cls, path): Path to the file where the pipeline is saved. Returns: - GreenGuardPipeline: - Loaded GreenGuardPipeline instance. + DracoPipeline: + Loaded DracoPipeline instance. 
""" with open(path, 'rb') as pickle_file: return cloudpickle.load(pickle_file) diff --git a/greenguard/pipelines/classes/normalize_dfs_xgb_classifier.json b/draco/pipelines/classes/normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/classes/normalize_dfs_xgb_classifier.json rename to draco/pipelines/classes/normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/classes/unstack_dfs_xgb_classifier.json b/draco/pipelines/classes/unstack_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/classes/unstack_dfs_xgb_classifier.json rename to draco/pipelines/classes/unstack_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/classes/unstack_double_lstm_timeseries_classifier.json b/draco/pipelines/classes/unstack_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/classes/unstack_double_lstm_timeseries_classifier.json rename to draco/pipelines/classes/unstack_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/classes/unstack_lstm_timeseries_classifier.json b/draco/pipelines/classes/unstack_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/classes/unstack_lstm_timeseries_classifier.json rename to draco/pipelines/classes/unstack_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json b/draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json rename to draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/disabled/dfs_xgb_classifier.json b/draco/pipelines/disabled/dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/dfs_xgb_classifier.json rename to draco/pipelines/disabled/dfs_xgb_classifier.json diff --git a/greenguard/pipelines/disabled/normalize_dfs_xgb_classifier.json b/draco/pipelines/disabled/normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/normalize_dfs_xgb_classifier.json rename to draco/pipelines/disabled/normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/disabled/resample_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/resample_dfs_xgb_classifier.json rename to draco/pipelines/disabled/resample_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json rename to draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json rename to draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json b/draco/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json rename to 
draco/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json b/draco/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json rename to draco/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json rename to draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json b/draco/pipelines/probability/normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json rename to draco/pipelines/probability/normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json b/draco/pipelines/probability/unstack_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json rename to draco/pipelines/probability/unstack_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json b/draco/pipelines/probability/unstack_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/probability/unstack_double_lstm_timeseries_classifier.json rename to draco/pipelines/probability/unstack_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json b/draco/pipelines/probability/unstack_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/probability/unstack_lstm_timeseries_classifier.json rename to draco/pipelines/probability/unstack_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json b/draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json rename to draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/unstacked/unstacked_dfs_xgb_classifier.json b/draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked/unstacked_dfs_xgb_classifier.json rename to draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json diff --git a/greenguard/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json b/draco/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json rename to draco/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json diff --git a/greenguard/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json b/draco/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json similarity index 100% rename from greenguard/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json rename to draco/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json diff --git 
a/greenguard/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json b/draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json
similarity index 100%
rename from greenguard/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json
rename to draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json
diff --git a/greenguard/primitives/numpy.take.json b/draco/primitives/numpy.take.json
similarity index 100%
rename from greenguard/primitives/numpy.take.json
rename to draco/primitives/numpy.take.json
diff --git a/greenguard/primitives/xgboost.XGBClassifier:probabilities.json b/draco/primitives/xgboost.XGBClassifier:probabilities.json
similarity index 100%
rename from greenguard/primitives/xgboost.XGBClassifier:probabilities.json
rename to draco/primitives/xgboost.XGBClassifier:probabilities.json
diff --git a/greenguard/results.py b/draco/results.py
similarity index 100%
rename from greenguard/results.py
rename to draco/results.py
diff --git a/greenguard/targets.py b/draco/targets.py
similarity index 100%
rename from greenguard/targets.py
rename to draco/targets.py
diff --git a/greenguard/utils.py b/draco/utils.py
similarity index 100%
rename from greenguard/utils.py
rename to draco/utils.py
diff --git a/greenguard/loaders/__init__.py b/greenguard/loaders/__init__.py
deleted file mode 100644
index 0113f15..0000000
--- a/greenguard/loaders/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from greenguard.loaders.csv import CSVLoader
-
-__all__ = (
-    'CSVLoader',
-)
diff --git a/setup.cfg b/setup.cfg
index ee6b598..d2829c0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.1.dev0
+current_version = 0.0.1.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
@@ -20,7 +20,7 @@ values = search = version='{current_version}' replace = version='{new_version}' -[bumpversion:file:greenguard/__init__.py] +[bumpversion:file:draco/__init__.py] search = __version__ = '{current_version}' replace = __version__ = '{new_version}' diff --git a/setup.py b/setup.py index 79eeb5b..1305aaf 100644 --- a/setup.py +++ b/setup.py @@ -57,6 +57,7 @@ 'nbsphinx>=0.5.0,<0.7', 'Sphinx>=1.7.1,<3', 'sphinx_rtd_theme>=0.2.4,<0.5', + 'docutils>=0.14,<0.18', 'autodocsumm>=0.1.10', # style check @@ -93,8 +94,8 @@ description='AutoML for Renewable Energy Industries.', entry_points={ 'mlblocks': [ - 'pipelines=greenguard:MLBLOCKS_PIPELINES', - 'primitives=greenguard:MLBLOCKS_PRIMITIVES' + 'pipelines=draco:MLBLOCKS_PIPELINES', + 'primitives=draco:MLBLOCKS_PRIMITIVES' ], }, extras_require={ @@ -103,17 +104,17 @@ }, include_package_data=True, install_requires=install_requires, - keywords='wind machine learning greenguard', + keywords='wind machine learning draco', license='MIT license', long_description=readme + '\n\n' + history, long_description_content_type='text/markdown', - name='greenguard', - packages=find_packages(include=['greenguard', 'greenguard.*']), + name='draco-ml', + packages=find_packages(include=['draco', 'draco.*']), python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, - url='https://github.com/D3-AI/GreenGuard', - version='0.3.1.dev0', + url='https://github.com/sintel-dev/Draco', + version='0.0.1.dev0', zip_safe=False, ) diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index ac87cc2..5d6f116 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -1,8 +1,8 @@ -"""Tests for `greenguard.benchmark` module.""" +"""Tests for `draco.benchmark` module.""" import numpy as np -from greenguard.benchmark import evaluate_templates -from greenguard.demo import load_demo +from draco.benchmark import evaluate_templates +from draco.demo import load_demo def test_predict(): diff --git a/tests/test_metrics.py b/tests/test_metrics.py index ce14132..a942669 100644 --- a/tests/test_metrics.py +++ b/tests/test_metrics.py @@ -1,6 +1,6 @@ import numpy as np -from greenguard.metrics import fpr_score +from draco.metrics import fpr_score def test_fpr_score_perfect_scenario(): diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 541ad6f..725d299 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -1,16 +1,16 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -"""Tests for `greenguard.pipeline` module.""" +"""Tests for `draco.pipeline` module.""" from unittest import TestCase from unittest.mock import patch import pandas as pd -from greenguard.pipeline import GreenGuardPipeline +from draco.pipeline import DracoPipeline -class TestGreenGuardPipeline(TestCase): +class TestDracoPipeline(TestCase): def _get_data(self): target_times = pd.DataFrame({ @@ -26,26 +26,26 @@ def _get_data(self): }) return target_times, readings - @patch('greenguard.pipeline.MLPipeline') - @patch('greenguard.pipeline.load_pipeline') + @patch('draco.pipeline.MLPipeline') + @patch('draco.pipeline.load_pipeline') def test_fit(self, load_pipeline_mock, mlpipeline_mock): load_pipeline_mock.return_value = dict() # Run - instance = GreenGuardPipeline('a_pipeline', 'accuracy') + instance = DracoPipeline('a_pipeline', 'accuracy') target_times, readings = self._get_data() instance.fit(target_times, readings) # Asserts assert instance.fitted - @patch('greenguard.pipeline.MLPipeline') - 
@patch('greenguard.pipeline.load_pipeline') + @patch('draco.pipeline.MLPipeline') + @patch('draco.pipeline.load_pipeline') def test_predict(self, load_pipeline_mock, mlpipeline_mock): load_pipeline_mock.return_value = dict() # Run - instance = GreenGuardPipeline('a_pipeline', 'accuracy') + instance = DracoPipeline('a_pipeline', 'accuracy') instance.fitted = True target_times, readings = self._get_data() instance.predict(target_times, readings) diff --git a/tutorials/01_GreenGuard_Machine_Learning.ipynb b/tutorials/01_Draco_Machine_Learning.ipynb similarity index 89% rename from tutorials/01_GreenGuard_Machine_Learning.ipynb rename to tutorials/01_Draco_Machine_Learning.ipynb index 03a2aa0..4a5fde7 100644 --- a/tutorials/01_GreenGuard_Machine_Learning.ipynb +++ b/tutorials/01_Draco_Machine_Learning.ipynb @@ -4,21 +4,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# GreenGuard Machine Learning" + "# Draco Machine Learning" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this tutorial we will show you how to use GreenGuard to solve a Machine Learning problem\n", + "In this tutorial we will show you how to use Draco to solve a Machine Learning problem\n", "defined via a Target Times table.\n", "\n", "During the next steps we will:\n", "\n", "- Load demo target times and readings\n", "- Find available pipelines and load two of them as templates\n", - "- Use GreenGuard AutoML to select the best template and hyperparameters for our problem\n", + "- Use Draco AutoML to select the best template and hyperparameters for our problem\n", "- Build and fit a Machine Learning pipeline based on the found template and hyperparameters\n", "- Make predictions using the fitted pipeline\n", "- Evaluate how good the predictions are" @@ -31,7 +31,7 @@ "## 0. Setup the logging\n", "\n", "This step sets up logging in our environment to increase our visibility over\n", - "the steps that GreenGuard performs." + "the steps that Draco performs." ] }, { @@ -57,7 +57,7 @@ "\n", "The first step is to load the data that we are going to use.\n", "\n", - "In order to use the demo data included in GreenGuard, the `greenguard.demo.load_demo` function can be used." + "In order to use the demo data included in Draco, the `draco.demo.load_demo` function can be used." ] }, { @@ -66,7 +66,7 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.demo import load_demo\n", + "from draco.demo import load_demo\n", "\n", "target_times, readings = load_demo()" ] @@ -75,11 +75,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This will download some demo data from [GreenGuard S3 demo Bucket](\n", - "https://d3-ai-greenguard.s3.amazonaws.com/index.html)%20and%20load%20it%20as/n", + "This will download some demo data from [Draco S3 demo Bucket](\n", + "https://d3-ai-draco.s3.amazonaws.com/index.html)%20and%20load%20it%20as/n", "the necessary `target_times` and `readings` tables.\n", "\n", - "The exact format of these tables is described in the GreenGuard README and docs:" + "The exact format of these tables is described in the Draco README and docs:" ] }, { @@ -399,11 +399,11 @@ "## 3. 
Finding the available Templates\n", "\n", "The next step will be to select a collection of templates from the ones\n", - "available in GreenGuard.\n", + "available in Draco.\n", "\n", - "For this, we can use the `greenguard.get_pipelines` function, which will\n", + "For this, we can use the `draco.get_pipelines` function, which will\n", "return us the list of all the available MLBlocks pipelines found in the\n", - "GreenGuard system." + "Draco system." ] }, { @@ -444,7 +444,7 @@ } ], "source": [ - "from greenguard import get_pipelines\n", + "from draco import get_pipelines\n", "\n", "get_pipelines()" ] @@ -505,20 +505,20 @@ { "data": { "text/plain": [ - "{'unstacked.unstacked_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json',\n", - " 'unstacked.unstacked_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/unstacked/unstacked_dfs_xgb_classifier.json',\n", - " 'classes.unstack_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/classes/unstack_dfs_xgb_classifier.json',\n", - " 'classes.normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/classes/normalize_dfs_xgb_classifier.json',\n", - " 'classes.unstack_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json',\n", - " 'disabled.resample_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json',\n", - " 'disabled.resample_unstack_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json',\n", - " 'disabled.normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/normalize_dfs_xgb_classifier.json',\n", - " 'disabled.resample_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_dfs_xgb_classifier.json',\n", - " 'disabled.resample_unstack_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json',\n", - " 'disabled.dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/disabled/dfs_xgb_classifier.json',\n", - " 'probability.unstack_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/probability/unstack_dfs_xgb_classifier.json',\n", - " 'probability.normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/probability/normalize_dfs_xgb_classifier.json',\n", - " 'probability.unstack_normalize_dfs_xgb_classifier': '/GreenGuard/greenguard/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json'}" + "{'unstacked.unstacked_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json',\n", + " 'unstacked.unstacked_dfs_xgb_classifier': '/Draco/draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json',\n", + " 'classes.unstack_dfs_xgb_classifier': '/Draco/draco/pipelines/classes/unstack_dfs_xgb_classifier.json',\n", + " 'classes.normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/classes/normalize_dfs_xgb_classifier.json',\n", + " 'classes.unstack_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json',\n", + " 'disabled.resample_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json',\n", + " 'disabled.resample_unstack_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json',\n", + " 'disabled.normalize_dfs_xgb_classifier': 
'/Draco/draco/pipelines/disabled/normalize_dfs_xgb_classifier.json',\n", + " 'disabled.resample_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_dfs_xgb_classifier.json',\n", + " 'disabled.resample_unstack_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json',\n", + " 'disabled.dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/dfs_xgb_classifier.json',\n", + " 'probability.unstack_dfs_xgb_classifier': '/Draco/draco/pipelines/probability/unstack_dfs_xgb_classifier.json',\n", + " 'probability.normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/probability/normalize_dfs_xgb_classifier.json',\n", + " 'probability.unstack_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json'}" ] }, "execution_count": 12, @@ -568,7 +568,7 @@ "source": [ "## 4. Finding the best Pipeline\n", "\n", - "Once we have loaded the data, we create a **GreenGuardPipeline** instance by passing:\n", + "Once we have loaded the data, we create a **DracoPipeline** instance by passing:\n", "\n", "* `templates (string or list)`: the name of a template, the path to a template json file or\n", "a list that can combine both of them.\n", @@ -589,9 +589,9 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.pipeline import GreenGuardPipeline\n", + "from draco.pipeline import DracoPipeline\n", "\n", - "pipeline = GreenGuardPipeline(templates, metric='f1', cv_splits=3)" + "pipeline = DracoPipeline(templates, metric='f1', cv_splits=3)" ] }, { @@ -632,7 +632,7 @@ "output_type": "stream", "text": [ "INFO:btb.session:Obtaining default configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: classes.unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", @@ -817,7 +817,7 @@ "text": [ "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: classes.unstack_normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", @@ -986,7 +986,7 @@ "will probably want to save a fitted instance and load it later to analyze new signals\n", "instead of fitting pipelines over and over again.\n", "\n", - "This can be done by using the `save` and `load` methods from the `GreenGuardPipeline`.\n", + "This can be done by using the `save` and `load` methods from the `DracoPipeline`.\n", "\n", "In order to save an instance, call its `save` method passing it the path and filename\n", "where the model should be saved." 
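A minimal sketch of the save/load round trip described above; the file path is arbitrary, and the `test` and `readings` variables are assumed to be the ones prepared earlier in the tutorial:

```python3
# Persist the fitted pipeline to disk...
path = 'my_pipeline.pkl'
pipeline.save(path)

# ...and load it back later as a new DracoPipeline instance, ready to predict.
new_pipeline = DracoPipeline.load(path)
predictions = new_pipeline.predict(test, readings)
```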
@@ -1007,8 +1007,8 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Once the pipeline is saved, it can be loaded back as a new `GreenGuardPipeline` by using the\n",
-    "`GreenGuardPipeline.load` method:"
+    "Once the pipeline is saved, it can be loaded back as a new `DracoPipeline` by using the\n",
+    "`DracoPipeline.load` method:"
    ]
   },
   {
@@ -1017,7 +1017,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "new_pipeline = GreenGuardPipeline.load(path)"
+    "new_pipeline = DracoPipeline.load(path)"
    ]
   },
   {
diff --git a/tutorials/02_Extract_Readings.ipynb b/tutorials/02_Extract_Readings.ipynb
index a454648..1713a07 100644
--- a/tutorials/02_Extract_Readings.ipynb
+++ b/tutorials/02_Extract_Readings.ipynb
@@ -10,7 +10,7 @@
    "from a folder that contains readings in the raw CSV format.\n",
    "\n",
    "The Raw CSV format is briefly explained below, but more details can be found in [the documentation site](\n",
-    "https://signals-dev.github.io/GreenGuard/advanced_usage/csv.html)\n",
+    "https://sintel-dev.github.io/Draco/advanced_usage/csv.html)\n",
    "\n",
    "During the next steps we will:\n",
    "\n",
@@ -29,7 +29,7 @@
    "## 0. Setup the logging\n",
    "\n",
    "This step sets up logging in our environment to increase our visibility over\n",
-    "the steps that GreenGuard performs."
+    "the steps that Draco performs."
   ]
  },
  {
@@ -70,23 +70,23 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "INFO:greenguard.demo:Generating file readings/T001/2013-01.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-02.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-03.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-04.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-05.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-06.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-07.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-08.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-09.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-10.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-11.csv\n",
-      "INFO:greenguard.demo:Generating file readings/T001/2013-12.csv\n"
+      "INFO:draco.demo:Generating file readings/T001/2013-01.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-02.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-03.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-04.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-05.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-06.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-07.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-08.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-09.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-10.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-11.csv\n",
+      "INFO:draco.demo:Generating file readings/T001/2013-12.csv\n"
      ]
     }
    ],
    "source": [
-    "from greenguard.demo import generate_raw_readings\n",
+    "from draco.demo import generate_raw_readings\n",
    "\n",
    "target_times = generate_raw_readings('readings')"
   ]
@@ -393,7 +393,7 @@
    "them into memory all at once.\n",
    "\n",
    "In order to load them in an efficient way so that we can use them to solve Machine Learning\n",
-    "problems, GeenGuard provides the `greenguard.loaders.CVSLoader` class.\n",
+    "problems, Draco provides the `draco.loaders.CSVLoader` class.\n",
    "\n",
    "This class is prepared
to, given a target times table, explore a collection of raw readings\n", "and extract only the information needed to solve that particular problem.\n", @@ -419,7 +419,7 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.loaders import CSVLoader\n", + "from draco.loaders import CSVLoader\n", "\n", "readings_path = 'readings'\n", "\n", @@ -453,9 +453,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.loaders.csv:Loaded 1306052 readings from turbine T001\n", - "INFO:greenguard.loaders.csv:Loaded 1306052 turbine readings\n", - "INFO:greenguard.targets:Dropped 0 targets without enough data. Final target_times size: 353\n" + "INFO:draco.loaders.csv:Loaded 1306052 readings from turbine T001\n", + "INFO:draco.loaders.csv:Loaded 1306052 turbine readings\n", + "INFO:draco.targets:Dropped 0 targets without enough data. Final target_times size: 353\n" ] } ], @@ -646,9 +646,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.loaders.csv:Loaded 1309796 readings from turbine T001\n", - "INFO:greenguard.loaders.csv:Loaded 1309796 turbine readings\n", - "INFO:greenguard.targets:Dropped 28 targets without enough data. Final target_times size: 325\n" + "INFO:draco.loaders.csv:Loaded 1309796 readings from turbine T001\n", + "INFO:draco.loaders.csv:Loaded 1309796 turbine readings\n", + "INFO:draco.targets:Dropped 28 targets without enough data. Final target_times size: 325\n" ] } ], @@ -763,33 +763,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", - "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", - "INFO:greenguard.loaders.csv:115979 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Loaded 55250 readings from turbine T001\n", - "INFO:greenguard.loaders.csv:Loaded 55250 turbine readings\n", - "INFO:greenguard.targets:Dropped 12 targets without enough data. 
Final target_times size: 341\n" + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:81749 readings reduced to 3432\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:110938 readings reduced to 4680\n", + "INFO:draco.loaders.csv:112118 readings reduced to 4680\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:111862 readings reduced to 4680\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:114400 readings reduced to 4836\n", + "INFO:draco.loaders.csv:105321 readings reduced to 4550\n", + "INFO:draco.loaders.csv:108371 readings reduced to 4680\n", + "INFO:draco.loaders.csv:115615 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:115647 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:103319 readings reduced to 4368\n", + "INFO:draco.loaders.csv:115979 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:114477 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Loaded 55250 readings from turbine T001\n", + "INFO:draco.loaders.csv:Loaded 55250 turbine readings\n", + "INFO:draco.targets:Dropped 12 targets without enough data. Final target_times size: 341\n" ] } ], @@ -940,7 +940,7 @@ "source": [ "## 4. Unstacking\n", "\n", - "Some of the pipelines included in **GreenGuard** expect a slightly different input format\n", + "Some of the pipelines included in **Draco** expect a slightly different input format\n", "where the data has been unstacked by `signal_id`, putting the values of each signal in a\n", "different column instead of having all of them in a single one.\n", "\n", @@ -959,33 +959,33 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:108371 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:115647 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:103319 readings reduced to 4368\n", - "INFO:greenguard.loaders.csv:115615 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:114400 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:114477 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:115979 readings reduced to 4836\n", - "INFO:greenguard.loaders.csv:111862 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:81749 readings reduced to 3432\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:105321 readings reduced to 4550\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:Resampling: 4h - mean\n", - "INFO:greenguard.loaders.csv:112118 readings reduced to 4680\n", - 
"INFO:greenguard.loaders.csv:110938 readings reduced to 4680\n", - "INFO:greenguard.loaders.csv:Loaded 2125 readings from turbine T001\n", - "INFO:greenguard.loaders.csv:Loaded 2125 turbine readings\n", - "INFO:greenguard.targets:Dropped 12 targets without enough data. Final target_times size: 341\n" + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:108371 readings reduced to 4680\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:115647 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:103319 readings reduced to 4368\n", + "INFO:draco.loaders.csv:115615 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:114400 readings reduced to 4836\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:114477 readings reduced to 4836\n", + "INFO:draco.loaders.csv:115979 readings reduced to 4836\n", + "INFO:draco.loaders.csv:111862 readings reduced to 4680\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:81749 readings reduced to 3432\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:105321 readings reduced to 4550\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:Resampling: 4h - mean\n", + "INFO:draco.loaders.csv:112118 readings reduced to 4680\n", + "INFO:draco.loaders.csv:110938 readings reduced to 4680\n", + "INFO:draco.loaders.csv:Loaded 2125 readings from turbine T001\n", + "INFO:draco.loaders.csv:Loaded 2125 turbine readings\n", + "INFO:draco.targets:Dropped 12 targets without enough data. Final target_times size: 341\n" ] } ], diff --git a/tutorials/03_Benchmarking.ipynb b/tutorials/03_Benchmarking.ipynb index 56e8701..c7bce62 100644 --- a/tutorials/03_Benchmarking.ipynb +++ b/tutorials/03_Benchmarking.ipynb @@ -14,7 +14,7 @@ "## 0. Setup the logging\n", "\n", "This step sets up logging in our environment to increase our visibility over\n", - "the steps that GreenGuard performs." + "the steps that Draco performs." ] }, { @@ -27,7 +27,7 @@ "\n", "logging.basicConfig(level=logging.INFO)\n", "logging.getLogger().setLevel(level=logging.ERROR)\n", - "logging.getLogger('greenguard').setLevel(level=logging.INFO)\n", + "logging.getLogger('draco').setLevel(level=logging.INFO)\n", "\n", "import warnings\n", "warnings.simplefilter(\"ignore\")" @@ -40,12 +40,12 @@ "\n", "## Running the Benchmarking\n", "\n", - "The user API for the GreenGuard Benchmarking is the `greenguard.benchmark.evaluate_templates` function.\n", + "The user API for the Draco Benchmarking is the `draco.benchmark.evaluate_templates` function.\n", "\n", "The `evaluate_templates` function accepts the following arguments:\n", "* `templates (list)`: List of templates to try.\n", "* `window_size_rule (list)`: List of tupples (int, str or Timedelta object).\n", - "* `metric (function or str)`: Metric to use. If an ``str`` is give it must be one of the metrics defined in the `greenguard.metrics.METRICS` dictionary.\n", + "* `metric (function or str)`: Metric to use. 
If an ``str`` is give it must be one of the metrics defined in the `draco.metrics.METRICS` dictionary.\n", "* `tuning_iterations (int)`: Number of iterations to be used.\n", "* `init_params (dict)`: Initialization parameters for the pipelines.\n", "* `target_times (DataFrame)`: Contains the specefication problem that we are solving, which has three columns:\n", @@ -100,45 +100,45 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:greenguard.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (1d, 1h)\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (1d, 1h)\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 287\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.565737233372491\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 145\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 269\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.5973752345055594\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 412\n", - "INFO:greenguard.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (2d, 2h)\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (2d, 2h)\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.unstack_lstm_timeseries_classifier \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 114\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.016427744327526084\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 224\n", - "INFO:greenguard.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (1d, 1h)\n", - "INFO:greenguard.pipeline:New 
configuration found:\n", + "INFO:draco.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (1d, 1h)\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", @@ -147,7 +147,7 @@ " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 1\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 11\n", @@ -156,7 +156,7 @@ " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.554989010368875\n", " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.909957492053926\n", " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 7\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 61\n", @@ -165,8 +165,8 @@ " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.6840927016151666\n", " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.5480298094360865\n", " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 6\n", - "INFO:greenguard.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (2d, 2h)\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (2d, 2h)\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", @@ -175,7 +175,7 @@ " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 1\n", - "INFO:greenguard.pipeline:New configuration found:\n", + "INFO:draco.pipeline:New configuration found:\n", " Template: probability.normalize_dfs_xgb_classifier \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 99\n", @@ -188,7 +188,7 @@ } ], "source": [ - "from greenguard.benchmark import evaluate_templates\n", + "from draco.benchmark import evaluate_templates\n", "\n", "results = evaluate_templates(\n", " templates=templates,\n", diff --git a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb index 5fc510e..ca00d58 100644 --- a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb +++ b/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb @@ -13,7 +13,7 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.demo import load_demo\n", + "from draco.demo import load_demo\n", "\n", "target_times, readings = load_demo()" ] @@ -33,9 +33,9 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.pipeline import GreenGuardPipeline\n", + "from draco.pipeline import DracoPipeline\n", "\n", - "pipeline = 
GreenGuardPipeline(pipeline_name)" + "pipeline = DracoPipeline(pipeline_name)" ] }, { @@ -256,7 +256,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Data Preparation (part of GreenGuard Pipeline)\n", + "## Data Preparation (part of Draco Pipeline)\n", "\n", "* Input: target_times, readings, turbines\n", "* Output: X, y, readings, turbines\n", diff --git a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb index f44377b..f539e89 100644 --- a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb +++ b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb @@ -13,7 +13,7 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.demo import load_demo\n", + "from draco.demo import load_demo\n", "\n", "target_times, readings = load_demo()" ] @@ -33,9 +33,9 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.pipeline import GreenGuardPipeline\n", + "from draco.pipeline import DracoPipeline\n", "\n", - "pipeline = GreenGuardPipeline(pipeline_name)" + "pipeline = DracoPipeline(pipeline_name)" ] }, { @@ -260,7 +260,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Data Preparation (part of GreenGuard Pipeline)\n", + "## Data Preparation (part of Draco Pipeline)\n", "\n", "* Input: target_times, readings, turbines\n", "* Output: X, y, readings, turbines\n", diff --git a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb index ec68b0e..1a10480 100644 --- a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb +++ b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb @@ -13,7 +13,7 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.demo import load_demo\n", + "from draco.demo import load_demo\n", "\n", "target_times, readings = load_demo()" ] @@ -33,9 +33,9 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.pipeline import GreenGuardPipeline\n", + "from draco.pipeline import DracoPipeline\n", "\n", - "pipeline = GreenGuardPipeline(pipeline_name)" + "pipeline = DracoPipeline(pipeline_name)" ] }, { @@ -260,7 +260,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Data Preparation (part of GreenGuard Pipeline)\n", + "## Data Preparation (part of Draco Pipeline)\n", "\n", "* Input: target_times, readings, turbines\n", "* Output: X, y, readings, turbines\n", diff --git a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb index 8fc6c8b..84530a2 100644 --- a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb +++ b/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb @@ -13,7 +13,7 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.demo import load_demo\n", + "from draco.demo import load_demo\n", "\n", "target_times, readings = load_demo()" ] @@ -33,9 +33,9 @@ "metadata": {}, "outputs": [], "source": [ - "from greenguard.pipeline import GreenGuardPipeline\n", + "from draco.pipeline import DracoPipeline\n", "\n", - "pipeline = GreenGuardPipeline(pipeline_name)" + "pipeline = DracoPipeline(pipeline_name)" ] }, { @@ -256,7 +256,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Data Preparation (part of GreenGuard Pipeline)\n", + "## Data Preparation (part of Draco Pipeline)\n", "\n", "* Input: target_times, readings, turbines\n", "* Output: X, y, readings, turbines\n", From 
d211141373e51d63a23bbbc8d833f5ccd28b7eb5 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Sun, 19 Dec 2021 04:48:36 +0300 Subject: [PATCH 144/171] Remove support python 3.6 (#62) --- .github/workflows/tests.yml | 10 +++++----- setup.py | 3 +-- tox.ini | 4 +--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 97dbb0e..5659465 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -30,7 +30,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -52,7 +52,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] os: [ubuntu-latest, macos-latest] steps: - uses: actions/checkout@v1 @@ -71,7 +71,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -90,7 +90,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.6, 3.7, 3.8] + python-version: [3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 diff --git a/setup.py b/setup.py index 1305aaf..60c095f 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,6 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], @@ -110,7 +109,7 @@ long_description_content_type='text/markdown', name='draco-ml', packages=find_packages(include=['draco', 'draco.*']), - python_requires='>=3.6,<3.9', + python_requires='>=3.7,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index 0068931..88295ee 100644 --- a/tox.ini +++ b/tox.ini @@ -5,19 +5,17 @@ commands = /usr/bin/env make docs [tox] -envlist = py3{6,7,8}-{lint,readme,unit,minimum} +envlist = py3{7,8}-{lint,readme,unit,minimum} [travis] python = 3.8: py38-lint, py38-readme, py38-unit, py38-minimum, py38-tutorials 3.7: py37-lint, py37-readme, py37-unit, py37-minimum, py37-tutorials - 3.6: py36-lint, py36-readme, py36-unit, py36-minimum, py36-tutorials [gh-actions] python = 3.8: py38-lint, py38-readme, py38-unit, py38-minimum, py38-tutorials 3.7: py37-lint, py37-readme, py37-unit, py37-minimum, py37-tutorials - 3.6: py36-lint, py36-readme, py36-unit, py36-minimum, py36-tutorials [testenv] passenv = CI TRAVIS TRAVIS_* From 1ba40d2518f75f5484707bcdf0322bce56454fad Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 11:50:51 +0300 Subject: [PATCH 145/171] =?UTF-8?q?Bump=20version:=200.0.1.dev0=20?= =?UTF-8?q?=E2=86=92=200.0.1.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/draco/__init__.py b/draco/__init__.py index 885fac1..6abe781 100644 --- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.0.1.dev0' +__version__ = '0.0.1.dev1' 
import os diff --git a/setup.cfg b/setup.cfg index d2829c0..1ac6a7a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.1.dev0 +current_version = 0.0.1.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index 60c095f..e7502d8 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/Draco', - version='0.0.1.dev0', + version='0.0.1.dev1', zip_safe=False, ) From ea3499b4b92c2de8eb5d075520d261b2a106c8eb Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 12:15:59 +0300 Subject: [PATCH 146/171] prepare release notes --- HISTORY.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HISTORY.md b/HISTORY.md index e656d1a..c4a54ff 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,11 @@ # History +## 0.1.0 - 2021-01-01 + +Rename ``GreenGuard`` to ``Draco``. + +* First release on ``draco-ml`` PyPI + ## 0.3.0 - 2021-01-22 This release increases the supported version of python to `3.8` and also includes changes From cf88c3b703640674022f79b5003036deea32edbf Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 12:16:20 +0300 Subject: [PATCH 147/171] =?UTF-8?q?Bump=20version:=200.0.1.dev1=20?= =?UTF-8?q?=E2=86=92=200.1.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/draco/__init__.py b/draco/__init__.py index 6abe781..4269cd2 100644 --- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.0.1.dev1' +__version__ = '0.1.0.dev0' import os diff --git a/setup.cfg b/setup.cfg index 1ac6a7a..792c650 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.1.dev1 +current_version = 0.1.0.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index e7502d8..cdfd471 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/Draco', - version='0.0.1.dev1', + version='0.1.0.dev0', zip_safe=False, ) From d904b10d884163097cb90f3f10884ff4e619f7e8 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 12:18:14 +0300 Subject: [PATCH 148/171] =?UTF-8?q?Bump=20version:=200.1.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.1.0.dev1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/draco/__init__.py b/draco/__init__.py index 4269cd2..3caad34 100644 --- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.1.0.dev0' +__version__ = '0.1.0.dev1' import os diff --git a/setup.cfg b/setup.cfg index 792c650..30db171 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0.dev0 +current_version = 0.1.0.dev1 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
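The bumpversion `parse` option in these setup.cfg hunks relies on named groups whose angle-bracketed names are not legible here. A minimal sketch, assuming the conventional `major`/`minor`/`patch`/`release`/`candidate` names, of how such a pattern decomposes the version strings bumped in this series:

    import re

    # Assumed reconstruction of the parse pattern above; the group names are
    # inferred from the version scheme being bumped and are not authoritative.
    PARSE = re.compile(
        r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
        r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
    )

    print(PARSE.match('0.0.1.dev1').groupdict())
    # {'major': '0', 'minor': '0', 'patch': '1', 'release': 'dev', 'candidate': '1'}
    print(PARSE.match('0.1.0').groupdict())
    # {'major': '0', 'minor': '1', 'patch': '0', 'release': None, 'candidate': None}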
diff --git a/setup.py b/setup.py index cdfd471..aa66a24 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/Draco', - version='0.1.0.dev0', + version='0.1.0.dev1', zip_safe=False, ) From 3ac5e0d64a3aa2f2e0fad59bfa9b44c22a6a4ae3 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 22:48:38 +0300 Subject: [PATCH 149/171] prepare draco release --- HISTORY.md | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index c4a54ff..c03e597 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -2,18 +2,18 @@ ## 0.1.0 - 2021-01-01 -Rename ``GreenGuard`` to ``Draco``. - * First release on ``draco-ml`` PyPI -## 0.3.0 - 2021-01-22 +## Previous GreenGuard development + +### 0.3.0 - 2021-01-22 This release increases the supported version of python to `3.8` and also includes changes in the installation requirements, where ``pandas`` and ``scikit-optimize`` packages have been updated to support higher versions. This changes come together with the newer versions of ``MLBlocks`` and ``MLPrimitives``. -### Internal Improvements +#### Internal Improvements * Fix ``run_benchmark`` generating properly the ``init_hyperparameters`` for the pipelines. * New ``FPR`` metric. @@ -21,16 +21,16 @@ of ``MLBlocks`` and ``MLPrimitives``. * Multiple benchmarking metrics allowed. * Multiple ``tpr`` or ``threshold`` values allowed for the benchmark. -## 0.2.6 - 2020-10-23 +### 0.2.6 - 2020-10-23 * Fix ``mkdir`` when exporting to ``csv`` file the benchmark results. * Intermediate steps for the pipelines with demo notebooks for each pipeline. -### Resolved Issues +#### Resolved Issues * Issue #50: Expose partial outputs and executions in the ``GreenGuardPipeline``. -## 0.2.5 - 2020-10-09 +### 0.2.5 - 2020-10-09 With this release we include: @@ -39,50 +39,50 @@ templates against problems with different window size and resample rules. * `summarize_results`: A function that given a `csv` file generates a `xlsx` file with a summary tab and a detailed tab with the results from `run_benchmark`. -## 0.2.4 - 2020-09-25 +### 0.2.4 - 2020-09-25 * Fix dependency errors -## 0.2.3 - 2020-08-10 +### 0.2.3 - 2020-08-10 * Added benchmarking module. -## 0.2.2 - 2020-07-10 +### 0.2.2 - 2020-07-10 -### Internal Improvements +#### Internal Improvements * Added github actions. -### Resolved Issues +#### Resolved Issues * Issue #27: Cache Splits pre-processed data on disk -## 0.2.1 - 2020-06-16 +### 0.2.1 - 2020-06-16 With this release we give the possibility to the user to specify more than one template when creating a GreenGuardPipeline. When the `tune` method of this is called, an instance of BTBSession is returned and it is in charge of selecting the templates and tuning their hyperparameters until achieving the best pipeline. -### Internal Improvements +#### Internal Improvements * Resample by filename inside the `CSVLoader` to avoid oversampling of data that will not be used. * Select targets now allows them to be equal. * Fixed the csv filename format. * Upgraded to BTB. 
-### Bug Fixes +#### Bug Fixes * Issue #33: Wrong default datetime format -### Resolved Issues +#### Resolved Issues * Issue #35: Select targets is too strict * Issue #36: resample by filename inside csvloader * Issue #39: Upgrade BTB * Issue #41: Fix CSV filename format -## 0.2.0 - 2020-02-14 +### 0.2.0 - 2020-02-14 First stable release: @@ -91,6 +91,6 @@ First stable release: * optimized pipeline tuning * documentation and tutorials -## 0.1.0 +### 0.1.0 * First release on PyPI From 76e7b73b075d785d7d635de105b16599a71f1383 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 22:55:21 +0300 Subject: [PATCH 150/171] =?UTF-8?q?Bump=20version:=200.1.0.dev1=20?= =?UTF-8?q?=E2=86=92=200.1.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/draco/__init__.py b/draco/__init__.py index 3caad34..759653d 100644 --- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.1.0.dev1' +__version__ = '0.1.0' import os diff --git a/setup.cfg b/setup.cfg index 30db171..3ea3079 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0.dev1 +current_version = 0.1.0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? diff --git a/setup.py b/setup.py index aa66a24..8e2b494 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/Draco', - version='0.1.0.dev1', + version='0.1.0', zip_safe=False, ) From be822782964ace5e3bfaf19cbce37b770ece7dac Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 23:02:26 +0300 Subject: [PATCH 151/171] change python 3.6 to 3.7 --- docker/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a086f2b..c750e55 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.6 +FROM python:3.7 ARG UID=1000 EXPOSE 8888 From b06a6baf59bc3d0be7f446c264a73105e0da5ecf Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sat, 1 Jan 2022 23:19:28 +0300 Subject: [PATCH 152/171] =?UTF-8?q?Bump=20version:=200.1.0=20=E2=86=92=200?= =?UTF-8?q?.1.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/draco/__init__.py b/draco/__init__.py index 759653d..db53888 100644 --- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab""" __email__ = 'dailabmit@gmail.com' -__version__ = '0.1.0' +__version__ = '0.1.1.dev0' import os diff --git a/setup.cfg b/setup.cfg index 3ea3079..597575d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.0 +current_version = 0.1.1.dev0 commit = True tag = True parse = (?P\d+)\.(?P\d+)\.(?P\d+)(\.(?P[a-z]+)(?P\d+))? 
diff --git a/setup.py b/setup.py index 8e2b494..fefbd90 100644 --- a/setup.py +++ b/setup.py @@ -114,6 +114,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/Draco', - version='0.1.0', + version='0.1.1.dev0', zip_safe=False, ) From 5776afa819b5dc490eda0181c0f6e3217a324d41 Mon Sep 17 00:00:00 2001 From: sarahmish Date: Sun, 2 Jan 2022 19:52:19 +0300 Subject: [PATCH 153/171] amend date --- HISTORY.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HISTORY.md b/HISTORY.md index c03e597..84a28d5 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,6 +1,6 @@ # History -## 0.1.0 - 2021-01-01 +## 0.1.0 - 2022-01-01 * First release on ``draco-ml`` PyPI From 2c120276acf665de03ea949ffb1037ed17e7628d Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 31 Jan 2022 11:16:34 +0000 Subject: [PATCH 154/171] python3.6 tests (#65) * add python3.6 tests --- .github/workflows/tests.yml | 12 ++++++------ setup.py | 3 ++- tox.ini | 4 +++- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5659465..d4c79b9 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -11,7 +11,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.6, 3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -30,7 +30,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.6, 3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -52,8 +52,8 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] - os: [ubuntu-latest, macos-latest] + python-version: [3.6, 3.7, 3.8] + os: [ubuntu-latest, macos-10.15] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} @@ -71,7 +71,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.6, 3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 @@ -90,7 +90,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: [3.7, 3.8] + python-version: [3.6, 3.7, 3.8] os: [ubuntu-latest] steps: - uses: actions/checkout@v1 diff --git a/setup.py b/setup.py index fefbd90..0157b7c 100644 --- a/setup.py +++ b/setup.py @@ -87,6 +87,7 @@ 'License :: OSI Approved :: MIT License', 'Natural Language :: English', 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], @@ -109,7 +110,7 @@ long_description_content_type='text/markdown', name='draco-ml', packages=find_packages(include=['draco', 'draco.*']), - python_requires='>=3.7,<3.9', + python_requires='>=3.6,<3.9', setup_requires=setup_requires, test_suite='tests', tests_require=tests_require, diff --git a/tox.ini b/tox.ini index 88295ee..0068931 100644 --- a/tox.ini +++ b/tox.ini @@ -5,17 +5,19 @@ commands = /usr/bin/env make docs [tox] -envlist = py3{7,8}-{lint,readme,unit,minimum} +envlist = py3{6,7,8}-{lint,readme,unit,minimum} [travis] python = 3.8: py38-lint, py38-readme, py38-unit, py38-minimum, py38-tutorials 3.7: py37-lint, py37-readme, py37-unit, py37-minimum, py37-tutorials + 3.6: py36-lint, py36-readme, py36-unit, py36-minimum, py36-tutorials [gh-actions] python = 3.8: py38-lint, py38-readme, py38-unit, py38-minimum, py38-tutorials 3.7: py37-lint, py37-readme, py37-unit, py37-minimum, 
py37-tutorials + 3.6: py36-lint, py36-readme, py36-unit, py36-minimum, py36-tutorials [testenv] passenv = CI TRAVIS TRAVIS_* From 97ef930a4be356b4aa5cc64374122515c5cf53cb Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 7 Mar 2022 11:25:21 -0500 Subject: [PATCH 155/171] Reorganize pipelines (#64) * change greenguard to draco * update badge to gh workflow * fix logo * draco preprocessing sub-pipelining * add pipelines (wip) * reorganize (wip) * rename pipelines * fix lstm pipeline * fix lint * add regression pipeline * add RUL dataset and notebooks * return to master docker configuration * remove draco pipeline as primitive --- .gitignore | 2 + README.md | 14 +- draco/__init__.py | 6 +- draco/demo.py | 35 +- draco/pipeline.py | 32 +- .../classes/normalize_dfs_xgb_classifier.json | 65 - .../classes/unstack_dfs_xgb_classifier.json | 78 - .../unstack_normalize_dfs_xgb_classifier.json | 69 - draco/pipelines/dfs_xgb/dfs_xgb.json | 29 + ...fs_xgb_prob_with_double_normalization.json | 42 + .../dfs_xgb/dfs_xgb_prob_with_unstack.json | 50 + ...s_xgb_prob_with_unstack_normalization.json | 49 + .../dfs_xgb_with_double_normalization.json | 37 + .../dfs_xgb/dfs_xgb_with_normalization.json | 29 + .../dfs_xgb/dfs_xgb_with_unstack.json | 45 + .../dfs_xgb_with_unstack_normalization.json | 44 + .../disabled/dfs_xgb_classifier.json | 64 - .../normalize_dfs_xgb_classifier.json | 46 - .../disabled/resample_dfs_xgb_classifier.json | 83 - .../resample_unstack_dfs_xgb_classifier.json | 78 - ...ack_double_lstm_timeseries_classifier.json | 123 - ...le_unstack_lstm_timeseries_classifier.json | 123 - ..._unstack_normalize_dfs_xgb_classifier.json | 69 - .../double_lstm.json} | 12 +- .../double_lstm/double_lstm_prob.json | 98 + .../double_lstm_prob_with_unstack.json} | 37 +- .../double_lstm_with_unstack.json} | 37 +- .../lstm.json} | 10 +- draco/pipelines/lstm/lstm_prob.json | 98 + .../lstm_prob_with_unstack.json} | 37 +- .../lstm_with_unstack.json} | 37 +- .../lstm_regressor/lstm_regressor.json | 91 + .../lstm_regressor_with_unstack.json | 106 + .../double_entity_normalization.json} | 16 +- .../entity_dataframe.json} | 24 +- .../preprocessing/entity_normalization.json | 20 + draco/pipelines/preprocessing/unstack.json | 43 + .../normalize_dfs_xgb_classifier.json | 70 - .../unstack_dfs_xgb_classifier.json | 83 - .../unstack_normalize_dfs_xgb_classifier.json | 74 - ...nstacked_normalize_dfs_xgb_classifier.json | 39 - draco/primitives/mlblocks.MLPipeline.json | 37 + tests/test_benchmark.py | 2 +- tests/test_pipeline.py | 20 +- tutorials/01_Draco_Machine_Learning.ipynb | 246 +- tutorials/03_Benchmarking.ipynb | 10 +- tutorials/04_Draco_Regression_Pipeline.ipynb | 793 ++++++ .../Convert NASA CMAPSS to Draco Format.ipynb | 406 +++ ...> dfs_xgb_with_double_normalization.ipynb} | 1270 ++++----- ... 
dfs_xgb_with_unstack_normalization.ipynb} | 946 +++---- .../pipelines/double_lstm_with_unstack.ipynb | 2375 ++++++++++++++++ .../lstm_regressor_with_unstack.ipynb | 2499 +++++++++++++++++ tutorials/pipelines/lstm_with_unstack.ipynb | 2249 +++++++++++++++ ...ck_double_lstm_timeseries_classifier.ipynb | 2481 ---------------- .../unstack_lstm_timeseries_classifier.ipynb | 2355 ---------------- 55 files changed, 10270 insertions(+), 7563 deletions(-) delete mode 100644 draco/pipelines/classes/normalize_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/classes/unstack_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json create mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json delete mode 100644 draco/pipelines/disabled/dfs_xgb_classifier.json delete mode 100644 draco/pipelines/disabled/normalize_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/disabled/resample_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json delete mode 100644 draco/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json delete mode 100644 draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json rename draco/pipelines/{unstacked/unstacked_double_lstm_timeseries_classifier.json => double_lstm/double_lstm.json} (91%) create mode 100644 draco/pipelines/double_lstm/double_lstm_prob.json rename draco/pipelines/{probability/unstack_double_lstm_timeseries_classifier.json => double_lstm/double_lstm_prob_with_unstack.json} (79%) rename draco/pipelines/{classes/unstack_double_lstm_timeseries_classifier.json => double_lstm/double_lstm_with_unstack.json} (77%) rename draco/pipelines/{unstacked/unstacked_lstm_timeseries_classifier.json => lstm/lstm.json} (92%) create mode 100644 draco/pipelines/lstm/lstm_prob.json rename draco/pipelines/{probability/unstack_lstm_timeseries_classifier.json => lstm/lstm_prob_with_unstack.json} (79%) rename draco/pipelines/{classes/unstack_lstm_timeseries_classifier.json => lstm/lstm_with_unstack.json} (77%) create mode 100644 draco/pipelines/lstm_regressor/lstm_regressor.json create mode 100644 draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json rename draco/pipelines/{disabled/resample_normalize_dfs_xgb_classifier.json => preprocessing/double_entity_normalization.json} (73%) rename draco/pipelines/{unstacked/unstacked_dfs_xgb_classifier.json => preprocessing/entity_dataframe.json} (50%) create mode 100644 draco/pipelines/preprocessing/entity_normalization.json create mode 100644 draco/pipelines/preprocessing/unstack.json delete mode 100644 draco/pipelines/probability/normalize_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/probability/unstack_dfs_xgb_classifier.json delete mode 100644 draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json delete mode 100644 
draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json create mode 100644 draco/primitives/mlblocks.MLPipeline.json create mode 100644 tutorials/04_Draco_Regression_Pipeline.ipynb create mode 100644 tutorials/Convert NASA CMAPSS to Draco Format.ipynb rename tutorials/pipelines/{normalize_dfs_xgb_classifier.ipynb => dfs_xgb_with_double_normalization.ipynb} (58%) rename tutorials/pipelines/{unstack_normalize_dfs_xgb_classifier.ipynb => dfs_xgb_with_unstack_normalization.ipynb} (68%) create mode 100644 tutorials/pipelines/double_lstm_with_unstack.ipynb create mode 100644 tutorials/pipelines/lstm_regressor_with_unstack.ipynb create mode 100644 tutorials/pipelines/lstm_with_unstack.ipynb delete mode 100644 tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb delete mode 100644 tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb diff --git a/.gitignore b/.gitignore index fe2c47a..b4e035b 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,5 @@ notebooks-private/ scripts/ dask-worker-space/ tutorials/*.pkl + +*.DS_Store diff --git a/README.md b/README.md index 70eb0fe..2d398e4 100644 --- a/README.md +++ b/README.md @@ -220,18 +220,18 @@ The returned `pipeline` variable will be `list` containing the names of all the available in the Draco system: ``` -['classes.unstack_double_lstm_timeseries_classifier', - 'classes.unstack_lstm_timeseries_classifier', - 'classes.unstack_normalize_dfs_xgb_classifier', - 'classes.unstack_dfs_xgb_classifier', - 'classes.normalize_dfs_xgb_classifier'] +['dfs_xgb', + 'dfs_xgb_with_unstack', + 'dfs_xgb_with_normalization', + 'dfs_xgb_with_unstack_normalization', + 'dfs_xgb_prob_with_unstack_normalization'] ``` For the rest of this tutorial, we will select and use the pipeline -`classes.normalize_dfs_xgb_classifier` as our template. +`dfs_xgb_with_unstack_normalization` as our template. ```python3 -pipeline_name = 'classes.normalize_dfs_xgb_classifier' +pipeline_name = 'dfs_xgb_with_unstack_normalization' ``` ## 3. Fitting the Pipeline diff --git a/draco/__init__.py b/draco/__init__.py index db53888..a1a5e8a 100644 --- a/draco/__init__.py +++ b/draco/__init__.py @@ -11,9 +11,11 @@ from draco.pipeline import DracoPipeline, get_pipelines _BASE_PATH = os.path.abspath(os.path.dirname(__file__)) -MLBLOCKS_PIPELINES = os.path.join(_BASE_PATH, 'pipelines') MLBLOCKS_PRIMITIVES = os.path.join(_BASE_PATH, 'primitives') - +MLBLOCKS_PIPELINES = tuple( + dirname + for dirname, _, _ in os.walk(os.path.join(_BASE_PATH, 'pipelines')) +) __all__ = ( 'DracoPipeline', diff --git a/draco/demo.py b/draco/demo.py index f7848e5..dcd4126 100644 --- a/draco/demo.py +++ b/draco/demo.py @@ -10,6 +10,17 @@ S3_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/' DEMO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'demo') +_FILES = { + 'DEFAULT': [ + ('target_times', 'cutoff_time'), + ('readings', 'timestamp') + ], + 'RUL': [ + ('rul_train_target_times', 'cutoff_time'), + ('rul_test_target_times', 'cutoff_time'), + ('rul_readings', 'timestamp') + ] +} def _load_or_download(filename, dates): filename += '.csv.gz' @@ -27,23 +38,35 @@ def _load_or_download(filename, dates): return data -def load_demo(load_readings=True): +def load_demo(name='default', load_readings=True): """Load the demo included in the Draco project. The first time that this function is executed, the data will be downloaded and cached inside the `draco/demo` folder. Subsequent calls will load the cached data instead of downloading it again. 
+
+    Args:
+        name (str):
+            Name of the dataset to load. If "RUL", load NASA's CMAPSS dataset
+            https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan.
+            If "default", load the default demo.
+        load_readings (bool):
+            Whether to load the ``readings`` table or not.
 
     Returns:
         tuple[pandas.DataFrame]:
             target_times and readings tables
     """
-    target_times = _load_or_download('target_times', 'cutoff_time')
-    if load_readings:
-        readings = _load_or_download('readings', 'timestamp')
-        return target_times, readings
+    files = _FILES[name.upper()]
 
-    return target_times
+    if not load_readings:
+        files = files[:-1]
+
+    output = list()
+    for filename, dates in files:
+        output.append(_load_or_download(filename, dates))
+
+    return tuple(output)
 
 
 def generate_raw_readings(output_path='demo'):
diff --git a/draco/pipeline.py b/draco/pipeline.py
index b50567b..98fb3d7 100644
--- a/draco/pipeline.py
+++ b/draco/pipeline.py
@@ -54,7 +54,7 @@ def __setstate__(self, state):
 Sequential.__setstate__ = __setstate__
 
 
-def get_pipelines(pattern='', path=False, pipeline_type='classes'):
+def get_pipelines(pattern='', path=False, pipeline_type=None):
     """Get the list of available pipelines.
 
     Optionally filter the names using a pattern or obtain
@@ -66,9 +66,8 @@
         path (bool):
             Whether to return a dictionary containing the pipeline paths
             instead of only a list with the names.
-        pipeline_type (str):
-            The pipeline category to filter by (`classes`, `probability` and `unstacked`).
-            Defaults to `classes`.
+        pipeline_type (str or list[str]):
+            The pipeline category to filter. Defaults to `None`.
 
     Return:
         list or dict:
             If `path=True`, return a dict containing the pipeline names as keys
             and their absolute paths as values. 
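A minimal usage sketch of the two entry points reworked above, assuming the layout introduced by this patch (the 'RUL' dataset name comes from the new `_FILES` mapping and the category names from the new `pipelines/` folders; the calls themselves are illustrative):

    from draco import get_pipelines
    from draco.demo import load_demo

    # Default demo: the DEFAULT entry of _FILES yields a (target_times, readings) tuple
    target_times, readings = load_demo()

    # NASA CMAPSS demo: the RUL entry yields train and test target_times plus readings
    train_times, test_times, rul_readings = load_demo('RUL')

    # List every available pipeline, only one category, or name -> JSON path mappings
    all_names = get_pipelines()
    lstm_names = get_pipelines(pipeline_type='lstm')
    dfs_paths = get_pipelines(pattern='dfs_xgb', path=True)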
""" + if isinstance(pipeline_type, str): + pipeline_type = [pipeline_type] + elif pipeline_type is None: + pipeline_type = os.listdir(PIPELINES_DIR) + pipelines = dict() - pipelines_dir = os.path.join(PIPELINES_DIR, pipeline_type) - - for filename in os.listdir(pipelines_dir): - if filename.endswith('.json') and pattern in filename: - name = os.path.basename(filename)[:-len('.json')] - name = f'{pipeline_type}.{name}' - pipeline_path = os.path.join(pipelines_dir, filename) - pipelines[name] = pipeline_path + pipelines_dir = [ + os.path.join(PIPELINES_DIR, ptype) + for ptype in pipeline_type + if ptype != 'preprocessing' + ] + + for pdir in pipelines_dir: + for filename in os.listdir(pdir): + if filename.endswith('.json') and pattern in filename: + name = os.path.basename(filename)[:-len('.json')] + pipeline_path = os.path.join(pdir, filename) + pipelines[name] = pipeline_path if not path: pipelines = list(pipelines) diff --git a/draco/pipelines/classes/normalize_dfs_xgb_classifier.json b/draco/pipelines/classes/normalize_dfs_xgb_classifier.json deleted file mode 100644 index 8039d12..0000000 --- a/draco/pipelines/classes/normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,65 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.EntitySet.normalize_entity#2": { - "base_entity_id": "readings", - "new_entity_id": "signals", - "index": "signal_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/classes/unstack_dfs_xgb_classifier.json b/draco/pipelines/classes/unstack_dfs_xgb_classifier.json deleted file mode 100644 index 60be686..0000000 --- a/draco/pipelines/classes/unstack_dfs_xgb_classifier.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": 
"signal_id", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "entity_id": "turbines", - "index": "turbine_id", - "make_index": false - }, - "featuretools.EntitySet.add_relationship#1": { - "parent": "turbines", - "parent_column": "turbine_id", - "child": "readings", - "child_column": "turbine_id" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "dataframe": "turbines" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json b/draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json deleted file mode 100644 index 5c82d77..0000000 --- a/draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb.json b/draco/pipelines/dfs_xgb/dfs_xgb.json new file mode 100644 index 0000000..4cb3cbf --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb.json @@ -0,0 +1,29 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.entity_dataframe" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": 
"cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "dataframe": "readings" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json new file mode 100644 index 0000000..4231115 --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json @@ -0,0 +1,42 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier:probabilities", + "numpy.take" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.double_entity_normalization", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + } + } + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json new file mode 100644 index 0000000..03ef141 --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json @@ -0,0 +1,50 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier:probabilities", + "numpy.take" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } + }, + "mlblocks.MLPipeline#2": { + "pipeline": "preprocessing.entity_dataframe" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + }, + "mlblocks.MLPipeline#2": { + "dataframe": "readings", + "turbines": "turbines" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json new file mode 100644 index 0000000..ca0c4fa --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json @@ -0,0 +1,49 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier:probabilities", + "numpy.take" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } + }, + "mlblocks.MLPipeline#2": { + "pipeline": "preprocessing.entity_normalization" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + 
"max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + }, + "mlblocks.MLPipeline#2": { + "dataframe": "readings" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json new file mode 100644 index 0000000..82ae325 --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json @@ -0,0 +1,37 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.double_entity_normalization", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "featuretools.EntitySet.entity_from_dataframe#1": { + "dataframe": "readings" + } + } + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json new file mode 100644 index 0000000..d9d7911 --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json @@ -0,0 +1,29 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.entity_normalization" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "dataframe": "readings" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json new file mode 100644 index 0000000..dd01f23 --- /dev/null +++ b/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json @@ -0,0 +1,45 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } + }, + "mlblocks.MLPipeline#2": { + "pipeline": "preprocessing.entity_dataframe" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + }, + "mlblocks.MLPipeline#2": { + "dataframe": "readings", + "turbines": "turbines" + } + } +} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json new file mode 100644 index 0000000..87e6999 --- /dev/null +++ 
b/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json @@ -0,0 +1,44 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "mlblocks.MLPipeline", + "featuretools.dfs", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "xgboost.XGBClassifier" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "preprocessing.unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } + }, + "mlblocks.MLPipeline#2": { + "pipeline": "preprocessing.entity_normalization" + }, + "featuretools.dfs#1": { + "target_entity": "turbines", + "index": "turbine_id", + "time_index": "cutoff_time", + "encode": false, + "max_depth": -1, + "copy": true, + "verbose": false, + "n_jobs": 1, + "training_window": "1d" + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + }, + "mlblocks.MLPipeline#2": { + "dataframe": "readings" + } + } +} diff --git a/draco/pipelines/disabled/dfs_xgb_classifier.json b/draco/pipelines/disabled/dfs_xgb_classifier.json deleted file mode 100644 index f725e3b..0000000 --- a/draco/pipelines/disabled/dfs_xgb_classifier.json +++ /dev/null @@ -1,64 +0,0 @@ -{ - "primitives": [ - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship", - "featuretools.EntitySet.add_relationship", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "entity_id": "turbines", - "index": "turbine_id", - "make_index": false - }, - "featuretools.EntitySet.entity_from_dataframe#3": { - "entity_id": "signals", - "index": "signal_id", - "make_index": false - }, - "featuretools.EntitySet.add_relationship#1": { - "parent": "turbines", - "parent_column": "turbine_id", - "child": "readings", - "child_column": "turbine_id" - }, - "featuretools.EntitySet.add_relationship#2": { - "parent": "signals", - "parent_column": "signal_id", - "child": "readings", - "child_column": "signal_id" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "dataframe": "turbines" - }, - "featuretools.EntitySet.entity_from_dataframe#3": { - "dataframe": "signals" - } - } -} diff --git a/draco/pipelines/disabled/normalize_dfs_xgb_classifier.json b/draco/pipelines/disabled/normalize_dfs_xgb_classifier.json deleted file mode 100644 index 0622163..0000000 --- a/draco/pipelines/disabled/normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,46 +0,0 @@ -{ - "primitives": [ - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - 
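
Taken together, the files above replace the old classes/probability/unstacked template families with a single dfs_xgb family whose preprocessing steps live in reusable nested pipelines. A sketch of how one of these templates would be exercised end to end; this assumes draco.demo.load_demo exists and returns the demo target_times and readings, and that DracoPipeline keeps the fit/predict signature shown in the tutorial notebook further below:

    from draco.demo import load_demo
    from draco.pipeline import DracoPipeline, get_pipelines

    # List the dfs_xgb templates shipped with the package.
    print(get_pipelines(pipeline_type='dfs_xgb'))

    # Demo target_times (turbine_id, cutoff_time, target) and raw readings.
    target_times, readings = load_demo()

    pipeline = DracoPipeline('dfs_xgb_with_unstack_normalization')
    pipeline.fit(target_times, readings)
    predictions = pipeline.predict(target_times, readings)
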
"make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.EntitySet.normalize_entity#2": { - "base_entity_id": "readings", - "new_entity_id": "signals", - "index": "signal_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - } -} diff --git a/draco/pipelines/disabled/resample_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_dfs_xgb_classifier.json deleted file mode 100644 index 390e0b4..0000000 --- a/draco/pipelines/disabled/resample_dfs_xgb_classifier.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship", - "featuretools.EntitySet.add_relationship", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "1h", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "entity_id": "turbines", - "index": "turbine_id", - "make_index": false - }, - "featuretools.EntitySet.entity_from_dataframe#3": { - "entity_id": "signals", - "index": "signal_id", - "make_index": false - }, - "featuretools.EntitySet.add_relationship#1": { - "parent": "turbines", - "parent_column": "turbine_id", - "child": "readings", - "child_column": "turbine_id" - }, - "featuretools.EntitySet.add_relationship#2": { - "parent": "signals", - "parent_column": "signal_id", - "child": "readings", - "child_column": "signal_id" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "dataframe": "turbines" - }, - "featuretools.EntitySet.entity_from_dataframe#3": { - "dataframe": "signals" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json deleted file mode 100644 index 7775208..0000000 --- a/draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json +++ /dev/null @@ -1,78 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship", - 
"featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "1h", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "entity_id": "turbines", - "index": "turbine_id", - "make_index": false - }, - "featuretools.EntitySet.add_relationship#1": { - "parent": "turbines", - "parent_column": "turbine_id", - "child": "readings", - "child_column": "turbine_id" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "dataframe": "turbines" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json b/draco/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json deleted file mode 100644 index 75dadc9..0000000 --- a/draco/pipelines/disabled/resample_unstack_double_lstm_timeseries_classifier.json +++ /dev/null @@ -1,123 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "pandas.DataFrame.pop", - "pandas.DataFrame.pop", - "sklearn.impute.SimpleImputer", - "sklearn.preprocessing.MinMaxScaler", - "pandas.DataFrame", - "pandas.DataFrame.set", - "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", - "keras.Sequential.DoubleLSTMTimeSeriesClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "3600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "feature_range": [ - -1, - 1 - ] - }, - "pandas.DataFrame#1": { - "index": null, - "columns": null - }, - "pandas.DataFrame.set#1": { - "key": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "key": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 72, - "cutoff_time": "cutoff_time", - "time_index": "timestamp" - }, - "keras.Sequential.DoubleLSTMTimeSeriesClassifier": { - "epochs": 20, - "verbose": true, - "input_shape": [ - 72, - 97 - ] - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#2": { - "X": "readings" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - 
"sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - }, - "pandas.DataFrame.set#1": { - "X": "readings", - "value": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "X": "readings", - "value": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "timeseries": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json b/draco/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json deleted file mode 100644 index e33e83b..0000000 --- a/draco/pipelines/disabled/resample_unstack_lstm_timeseries_classifier.json +++ /dev/null @@ -1,123 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "pandas.DataFrame.pop", - "pandas.DataFrame.pop", - "sklearn.impute.SimpleImputer", - "sklearn.preprocessing.MinMaxScaler", - "pandas.DataFrame", - "pandas.DataFrame.set", - "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", - "keras.Sequential.LSTMTimeSeriesClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "feature_range": [ - -1, - 1 - ] - }, - "pandas.DataFrame#1": { - "index": null, - "columns": null - }, - "pandas.DataFrame.set#1": { - "key": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "key": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 144, - "cutoff_time": "cutoff_time", - "time_index": "timestamp" - }, - "keras.Sequential.LSTMTimeSeriesClassifier": { - "epochs": 35, - "verbose": true, - "input_shape": [ - 144, - 26 - ] - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#2": { - "X": "readings" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": "readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - }, - "pandas.DataFrame.set#1": { - "X": "readings", - "value": "turbine_id" - }, - "pandas.DataFrame.set#2": { - "X": "readings", - "value": "timestamp" - }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "timeseries": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "pandas.DataFrame.pop#1": { - "item": "turbine_id" - }, - "pandas.DataFrame.pop#2": { - "item": "timestamp" - }, - "sklearn.impute.SimpleImputer#1": { - "X": "readings" - }, - "sklearn.preprocessing.MinMaxScaler#1": { - "X": 
"readings" - }, - "pandas.DataFrame#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json b/draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json deleted file mode 100644 index 4cbe1df..0000000 --- a/draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,69 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "1h", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json b/draco/pipelines/double_lstm/double_lstm.json similarity index 91% rename from draco/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json rename to draco/pipelines/double_lstm/double_lstm.json index bf3065f..e3be8a5 100644 --- a/draco/pipelines/unstacked/unstacked_double_lstm_timeseries_classifier.json +++ b/draco/pipelines/double_lstm/double_lstm.json @@ -34,17 +34,13 @@ "key": "timestamp" }, "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 72, + "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" }, - "keras.Sequential.DoubleLSTMTimeSeriesClassifier": { - "epochs": 20, - "verbose": true, - "input_shape": [ - 72, - 97 - ] + "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { + "epochs": 35, + "verbose": false } }, "input_names": { diff --git a/draco/pipelines/double_lstm/double_lstm_prob.json b/draco/pipelines/double_lstm/double_lstm_prob.json new file mode 100644 index 0000000..a118af0 --- /dev/null +++ b/draco/pipelines/double_lstm/double_lstm_prob.json @@ -0,0 +1,98 @@ +{ + "primitives": [ + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.DoubleLSTMTimeSeriesClassifier", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": 
"timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.DoubleLSTMTimeSeriesClassifier#1": { + "epochs": 35, + "verbose": false, + "classification": false, + "loss": "keras.losses.binary_crossentropy" + }, + "numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/draco/pipelines/probability/unstack_double_lstm_timeseries_classifier.json b/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json similarity index 79% rename from draco/pipelines/probability/unstack_double_lstm_timeseries_classifier.json rename to draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json index ea48a87..289a794 100644 --- a/draco/pipelines/probability/unstack_double_lstm_timeseries_classifier.json +++ b/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json @@ -1,7 +1,6 @@ { "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", + "mlblocks.MLPipeline", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -14,19 +13,16 @@ "numpy.take" ], "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "3600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true + "mlblocks.MLPipeline#1": { + "pipeline": "unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -67,10 +63,7 @@ } }, "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { + "mlblocks.MLPipeline#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -101,12 +94,6 @@ } }, "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/classes/unstack_double_lstm_timeseries_classifier.json b/draco/pipelines/double_lstm/double_lstm_with_unstack.json similarity index 77% rename from draco/pipelines/classes/unstack_double_lstm_timeseries_classifier.json rename to 
draco/pipelines/double_lstm/double_lstm_with_unstack.json index dede502..1d08259 100644 --- a/draco/pipelines/classes/unstack_double_lstm_timeseries_classifier.json +++ b/draco/pipelines/double_lstm/double_lstm_with_unstack.json @@ -1,7 +1,6 @@ { "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", + "mlblocks.MLPipeline", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -13,19 +12,16 @@ "keras.Sequential.DoubleLSTMTimeSeriesClassifier" ], "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "3600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true + "mlblocks.MLPipeline#1": { + "pipeline": "unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -60,10 +56,7 @@ } }, "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { + "mlblocks.MLPipeline#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -94,12 +87,6 @@ } }, "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json b/draco/pipelines/lstm/lstm.json similarity index 92% rename from draco/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json rename to draco/pipelines/lstm/lstm.json index d2cbed7..c29b1c7 100644 --- a/draco/pipelines/unstacked/unstacked_lstm_timeseries_classifier.json +++ b/draco/pipelines/lstm/lstm.json @@ -34,17 +34,13 @@ "key": "timestamp" }, "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { - "window_size": 72, + "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" }, - "keras.Sequential.LSTMTimeSeriesClassifier": { + "keras.Sequential.LSTMTimeSeriesClassifier#1": { "epochs": 35, - "verbose": true, - "input_shape": [ - 72, - 97 - ] + "verbose": false } }, "input_names": { diff --git a/draco/pipelines/lstm/lstm_prob.json b/draco/pipelines/lstm/lstm_prob.json new file mode 100644 index 0000000..17da404 --- /dev/null +++ b/draco/pipelines/lstm/lstm_prob.json @@ -0,0 +1,98 @@ +{ + "primitives": [ + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.LSTMTimeSeriesClassifier", + "numpy.take" + ], + "init_params": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.LSTMTimeSeriesClassifier#1": { + "epochs": 35, + "verbose": false, + "classification": false, + "loss": "keras.losses.binary_crossentropy" + }, + 
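
All of the lstm/double_lstm templates window the readings with mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences, now configured with window_size 24: for each cutoff_time it takes the window_size readings that precede the cutoff and stacks them into a 3-D array of shape (samples, window_size, features) for the Keras classifiers. A simplified re-implementation, for intuition only (the real primitive additionally matches windows per turbine_id):

    import numpy as np
    import pandas as pd

    readings = pd.DataFrame({
        'timestamp': pd.date_range('2020-01-01', periods=48, freq='h'),
        'signal_a': np.arange(48, dtype=float),
    })
    cutoffs = pd.Series([pd.Timestamp('2020-01-02 12:00')])

    window_size = 24
    sequences = []
    for cutoff in cutoffs:
        # Keep the last `window_size` readings strictly before the cutoff.
        window = readings[readings['timestamp'] < cutoff].tail(window_size)
        sequences.append(window[['signal_a']].to_numpy())

    X = np.stack(sequences)
    print(X.shape)  # (1, 24, 1): samples x window_size x features
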
"numpy.take#1": { + "indices": 1, + "axis": 1 + } + }, + "input_names": { + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/draco/pipelines/probability/unstack_lstm_timeseries_classifier.json b/draco/pipelines/lstm/lstm_prob_with_unstack.json similarity index 79% rename from draco/pipelines/probability/unstack_lstm_timeseries_classifier.json rename to draco/pipelines/lstm/lstm_prob_with_unstack.json index 9272257..1ad69bc 100644 --- a/draco/pipelines/probability/unstack_lstm_timeseries_classifier.json +++ b/draco/pipelines/lstm/lstm_prob_with_unstack.json @@ -1,7 +1,6 @@ { "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", + "mlblocks.MLPipeline", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -14,19 +13,16 @@ "numpy.take" ], "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "3600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true + "mlblocks.MLPipeline#1": { + "pipeline": "unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -67,10 +63,7 @@ } }, "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { + "mlblocks.MLPipeline#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -101,12 +94,6 @@ } }, "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/classes/unstack_lstm_timeseries_classifier.json b/draco/pipelines/lstm/lstm_with_unstack.json similarity index 77% rename from draco/pipelines/classes/unstack_lstm_timeseries_classifier.json rename to draco/pipelines/lstm/lstm_with_unstack.json index ab9dd99..18c486a 100644 --- a/draco/pipelines/classes/unstack_lstm_timeseries_classifier.json +++ b/draco/pipelines/lstm/lstm_with_unstack.json @@ -1,7 +1,6 @@ { "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", + "mlblocks.MLPipeline", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -13,19 +12,16 @@ "keras.Sequential.LSTMTimeSeriesClassifier" ], "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "3600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true + "mlblocks.MLPipeline#1": 
{ + "pipeline": "unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + "X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -60,10 +56,7 @@ } }, "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { + "mlblocks.MLPipeline#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -94,12 +87,6 @@ } }, "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/lstm_regressor/lstm_regressor.json b/draco/pipelines/lstm_regressor/lstm_regressor.json new file mode 100644 index 0000000..77ddb1e --- /dev/null +++ b/draco/pipelines/lstm_regressor/lstm_regressor.json @@ -0,0 +1,91 @@ +{ + "primitives": [ + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.LSTMTimeSeriesRegressor" + ], + "init_params": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.LSTMTimeSeriesRegressor#1": { + "epochs": 35, + "verbose": false + } + }, + "input_names": { + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json b/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json new file mode 100644 index 0000000..d546827 --- /dev/null +++ b/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json @@ -0,0 +1,106 @@ +{ + "primitives": [ + "mlblocks.MLPipeline", + "pandas.DataFrame.pop", + "pandas.DataFrame.pop", + "sklearn.impute.SimpleImputer", + "sklearn.preprocessing.MinMaxScaler", + "pandas.DataFrame", + "pandas.DataFrame.set", + "pandas.DataFrame.set", + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "keras.Sequential.LSTMTimeSeriesRegressor" + ], + "init_params": { + "mlblocks.MLPipeline#1": { + "pipeline": "unstack", + "input_names": { + "pandas.DataFrame.resample#1": { + 
"X": "df" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + } + }, + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "feature_range": [ + -1, + 1 + ] + }, + "pandas.DataFrame#1": { + "index": null, + "columns": null + }, + "pandas.DataFrame.set#1": { + "key": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "key": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "window_size": 24, + "cutoff_time": "cutoff_time", + "time_index": "timestamp" + }, + "keras.Sequential.LSTMTimeSeriesRegressor#1": { + "epochs": 35, + "verbose": false + } + }, + "input_names": { + "mlblocks.MLPipeline#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#1": { + "X": "readings" + }, + "pandas.DataFrame.pop#2": { + "X": "readings" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + }, + "pandas.DataFrame.set#1": { + "X": "readings", + "value": "turbine_id" + }, + "pandas.DataFrame.set#2": { + "X": "readings", + "value": "timestamp" + }, + "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "timeseries": "readings" + } + }, + "output_names": { + "pandas.DataFrame.pop#1": { + "item": "turbine_id" + }, + "pandas.DataFrame.pop#2": { + "item": "timestamp" + }, + "sklearn.impute.SimpleImputer#1": { + "X": "readings" + }, + "sklearn.preprocessing.MinMaxScaler#1": { + "X": "readings" + }, + "pandas.DataFrame#1": { + "X": "readings" + } + } +} diff --git a/draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json b/draco/pipelines/preprocessing/double_entity_normalization.json similarity index 73% rename from draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json rename to draco/pipelines/preprocessing/double_entity_normalization.json index 3d7d4d2..1438bbe 100644 --- a/draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json +++ b/draco/pipelines/preprocessing/double_entity_normalization.json @@ -3,10 +3,7 @@ "pandas.DataFrame.resample", "featuretools.EntitySet.entity_from_dataframe", "featuretools.EntitySet.normalize_entity", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" + "featuretools.EntitySet.normalize_entity" ], "init_params": { "pandas.DataFrame.resample#1": { @@ -36,17 +33,6 @@ "new_entity_id": "signals", "index": "signal_id", "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "1d" } }, "input_names": { diff --git a/draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json b/draco/pipelines/preprocessing/entity_dataframe.json similarity index 50% rename from draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json rename to draco/pipelines/preprocessing/entity_dataframe.json index e26fa00..0bd238a 100644 --- a/draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json +++ b/draco/pipelines/preprocessing/entity_dataframe.json @@ -2,10 +2,7 @@ "primitives": [ "featuretools.EntitySet.entity_from_dataframe", "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship", - "featuretools.dfs", - 
"mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" + "featuretools.EntitySet.add_relationship" ], "init_params": { "featuretools.EntitySet.entity_from_dataframe#1": { @@ -24,25 +21,6 @@ "parent_column": "turbine_id", "child": "readings", "child_column": "turbine_id" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "dataframe": "turbines" } } } diff --git a/draco/pipelines/preprocessing/entity_normalization.json b/draco/pipelines/preprocessing/entity_normalization.json new file mode 100644 index 0000000..9f3f3ab --- /dev/null +++ b/draco/pipelines/preprocessing/entity_normalization.json @@ -0,0 +1,20 @@ +{ + "primitives": [ + "featuretools.EntitySet.entity_from_dataframe", + "featuretools.EntitySet.normalize_entity" + ], + "init_params": { + "featuretools.EntitySet.entity_from_dataframe#1": { + "entity_id": "readings", + "index": "reading_id", + "make_index": true, + "time_index": "timestamp" + }, + "featuretools.EntitySet.normalize_entity#1": { + "base_entity_id": "readings", + "new_entity_id": "turbines", + "index": "turbine_id", + "make_time_index": false + } + } +} diff --git a/draco/pipelines/preprocessing/unstack.json b/draco/pipelines/preprocessing/unstack.json new file mode 100644 index 0000000..1acd833 --- /dev/null +++ b/draco/pipelines/preprocessing/unstack.json @@ -0,0 +1,43 @@ +{ + "primitives": [ + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack" + ], + "init_params": { + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true + } + }, + "input_names": { + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + }, + "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + } + }, + "outputs": { + "default": [ + { + "name": "readings", + "variable": "pandas.DataFrame.unstack#1.readings" + } + ] + } +} diff --git a/draco/pipelines/probability/normalize_dfs_xgb_classifier.json b/draco/pipelines/probability/normalize_dfs_xgb_classifier.json deleted file mode 100644 index 495a5d9..0000000 --- a/draco/pipelines/probability/normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,70 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier:probabilities", - "numpy.take" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false 
- }, - "featuretools.EntitySet.normalize_entity#2": { - "base_entity_id": "readings", - "new_entity_id": "signals", - "index": "signal_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - }, - "numpy.take#1": { - "indices": 1, - "axis": 1 - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/probability/unstack_dfs_xgb_classifier.json b/draco/pipelines/probability/unstack_dfs_xgb_classifier.json deleted file mode 100644 index aedbada..0000000 --- a/draco/pipelines/probability/unstack_dfs_xgb_classifier.json +++ /dev/null @@ -1,83 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier:probabilities", - "numpy.take" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "entity_id": "turbines", - "index": "turbine_id", - "make_index": false - }, - "featuretools.EntitySet.add_relationship#1": { - "parent": "turbines", - "parent_column": "turbine_id", - "child": "readings", - "child_column": "turbine_id" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "1d" - }, - "numpy.take#1": { - "indices": 1, - "axis": 1 - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "dataframe": "turbines" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json b/draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json deleted file mode 100644 index eddddd8..0000000 --- a/draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,74 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier:probabilities", - "numpy.take" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - 
"on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - "level": "signal_id", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - }, - "numpy.take#1": { - "indices": 1, - "axis": 1 - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json b/draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json deleted file mode 100644 index dafefd3..0000000 --- a/draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "primitives": [ - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": true, - "n_jobs": 1, - "training_window": "3d" - } - }, - "input_names": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - } -} diff --git a/draco/primitives/mlblocks.MLPipeline.json b/draco/primitives/mlblocks.MLPipeline.json new file mode 100644 index 0000000..aee1da2 --- /dev/null +++ b/draco/primitives/mlblocks.MLPipeline.json @@ -0,0 +1,37 @@ +{ + "name": "mlblocks.MLPipeline", + "primitive": "mlblocks.MLPipeline", + "fit": { + "method": "fit", + "args": "get_fit_args" + }, + "produce": { + "method": "predict", + "args": "get_predict_args", + "output": "get_outputs" + }, + "hyperparameters": { + "fixed": { + "pipeline": { + "type": "str", + "default": null + }, + "primitives": { + "type": "list", + "default": [] + }, + "init_params": { + "type": "dict", + "default": {} + }, + "input_names": { + "type": "dict", + "default": {} + }, + "output_names": { + "type": "dict", + "default": {} + } + } + } +} \ No newline at end of file diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index 5d6f116..d88425b 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -8,7 +8,7 @@ def test_predict(): # setup templates = [ - 'probability.unstack_lstm_timeseries_classifier' + 'dfs_xgb_prob_with_unstack_normalization' ] 
window_size_rule = [ diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 725d299..f8526c9 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -6,8 +6,26 @@ from unittest.mock import patch import pandas as pd +import pytest -from draco.pipeline import DracoPipeline +from draco.pipeline import DracoPipeline, get_pipelines + + +def test_get_pipelines(): + output = get_pipelines() + assert isinstance(output, list) + + +def test_get_pipelines_type(): + output = get_pipelines(pipeline_type='lstm') + assert isinstance(output, list) + for path in output: + assert 'lstm' in path + + +def test_get_pipelines_type_error(): + with pytest.raises(FileNotFoundError): + get_pipelines(pipeline_type='does-not-exist') class TestDracoPipeline(TestCase): diff --git a/tutorials/01_Draco_Machine_Learning.ipynb b/tutorials/01_Draco_Machine_Learning.ipynb index 4a5fde7..6b1089e 100644 --- a/tutorials/01_Draco_Machine_Learning.ipynb +++ b/tutorials/01_Draco_Machine_Learning.ipynb @@ -414,28 +414,22 @@ { "data": { "text/plain": [ - "['unstacked.unstacked_normalize_dfs_xgb_classifier',\n", - " 'unstacked.unstacked_double_lstm_timeseries_classifier',\n", - " 'unstacked.unstacked_lstm_timeseries_classifier',\n", - " 'unstacked.unstacked_dfs_xgb_classifier',\n", - " 'classes.unstack_dfs_xgb_classifier',\n", - " 'classes.unstack_double_lstm_timeseries_classifier',\n", - " 'classes.normalize_dfs_xgb_classifier',\n", - " 'classes.unstack_lstm_timeseries_classifier',\n", - " 'classes.unstack_normalize_dfs_xgb_classifier',\n", - " 'disabled.resample_normalize_dfs_xgb_classifier',\n", - " 'disabled.resample_unstack_lstm_timeseries_classifier',\n", - " 'disabled.resample_unstack_normalize_dfs_xgb_classifier',\n", - " 'disabled.normalize_dfs_xgb_classifier',\n", - " 'disabled.resample_unstack_double_lstm_timeseries_classifier',\n", - " 'disabled.resample_dfs_xgb_classifier',\n", - " 'disabled.resample_unstack_dfs_xgb_classifier',\n", - " 'disabled.dfs_xgb_classifier',\n", - " 'probability.unstack_dfs_xgb_classifier',\n", - " 'probability.unstack_double_lstm_timeseries_classifier',\n", - " 'probability.normalize_dfs_xgb_classifier',\n", - " 'probability.unstack_lstm_timeseries_classifier',\n", - " 'probability.unstack_normalize_dfs_xgb_classifier']" + "['dfs_xgb_prob_with_unstack',\n", + " 'dfs_xgb_with_normalization',\n", + " 'dfs_xgb',\n", + " 'dfs_xgb_with_unstack',\n", + " 'dfs_xgb_prob_with_unstack_normalization',\n", + " 'dfs_xgb_with_unstack_normalization',\n", + " 'dfs_xgb_prob_with_double_normalization',\n", + " 'dfs_xgb_with_double_normalization',\n", + " 'double_lstm_prob_with_unstack',\n", + " 'double_lstm_prob',\n", + " 'double_lstm',\n", + " 'double_lstm_with_unstack',\n", + " 'lstm_prob_with_unstack',\n", + " 'lstm_with_unstack',\n", + " 'lstm_prob',\n", + " 'lstm']" ] }, "execution_count": 10, @@ -464,20 +458,14 @@ { "data": { "text/plain": [ - "['unstacked.unstacked_normalize_dfs_xgb_classifier',\n", - " 'unstacked.unstacked_dfs_xgb_classifier',\n", - " 'classes.unstack_dfs_xgb_classifier',\n", - " 'classes.normalize_dfs_xgb_classifier',\n", - " 'classes.unstack_normalize_dfs_xgb_classifier',\n", - " 'disabled.resample_normalize_dfs_xgb_classifier',\n", - " 'disabled.resample_unstack_normalize_dfs_xgb_classifier',\n", - " 'disabled.normalize_dfs_xgb_classifier',\n", - " 'disabled.resample_dfs_xgb_classifier',\n", - " 'disabled.resample_unstack_dfs_xgb_classifier',\n", - " 'disabled.dfs_xgb_classifier',\n", - " 'probability.unstack_dfs_xgb_classifier',\n", - " 
'probability.normalize_dfs_xgb_classifier',\n", - " 'probability.unstack_normalize_dfs_xgb_classifier']" + "['dfs_xgb_prob_with_unstack',\n", + " 'dfs_xgb_with_normalization',\n", + " 'dfs_xgb',\n", + " 'dfs_xgb_with_unstack',\n", + " 'dfs_xgb_prob_with_unstack_normalization',\n", + " 'dfs_xgb_with_unstack_normalization',\n", + " 'dfs_xgb_prob_with_double_normalization',\n", + " 'dfs_xgb_with_double_normalization']" ] }, "execution_count": 11, @@ -505,20 +493,14 @@ { "data": { "text/plain": [ - "{'unstacked.unstacked_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/unstacked/unstacked_normalize_dfs_xgb_classifier.json',\n", - " 'unstacked.unstacked_dfs_xgb_classifier': '/Draco/draco/pipelines/unstacked/unstacked_dfs_xgb_classifier.json',\n", - " 'classes.unstack_dfs_xgb_classifier': '/Draco/draco/pipelines/classes/unstack_dfs_xgb_classifier.json',\n", - " 'classes.normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/classes/normalize_dfs_xgb_classifier.json',\n", - " 'classes.unstack_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/classes/unstack_normalize_dfs_xgb_classifier.json',\n", - " 'disabled.resample_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_normalize_dfs_xgb_classifier.json',\n", - " 'disabled.resample_unstack_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_unstack_normalize_dfs_xgb_classifier.json',\n", - " 'disabled.normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/normalize_dfs_xgb_classifier.json',\n", - " 'disabled.resample_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_dfs_xgb_classifier.json',\n", - " 'disabled.resample_unstack_dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/resample_unstack_dfs_xgb_classifier.json',\n", - " 'disabled.dfs_xgb_classifier': '/Draco/draco/pipelines/disabled/dfs_xgb_classifier.json',\n", - " 'probability.unstack_dfs_xgb_classifier': '/Draco/draco/pipelines/probability/unstack_dfs_xgb_classifier.json',\n", - " 'probability.normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/probability/normalize_dfs_xgb_classifier.json',\n", - " 'probability.unstack_normalize_dfs_xgb_classifier': '/Draco/draco/pipelines/probability/unstack_normalize_dfs_xgb_classifier.json'}" + "{'dfs_xgb_prob_with_unstack': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json',\n", + " 'dfs_xgb_with_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json',\n", + " 'dfs_xgb': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb.json',\n", + " 'dfs_xgb_with_unstack': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json',\n", + " 'dfs_xgb_prob_with_unstack_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json',\n", + " 'dfs_xgb_with_unstack_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json',\n", + " 'dfs_xgb_prob_with_double_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json',\n", + " 'dfs_xgb_with_double_normalization': 
'/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json'}" ] }, "execution_count": 12, @@ -557,8 +539,8 @@ "outputs": [], "source": [ "templates = [\n", - " 'classes.unstack_normalize_dfs_xgb_classifier', \n", - " 'classes.normalize_dfs_xgb_classifier'\n", + " 'dfs_xgb_with_unstack_normalization', \n", + " 'dfs_xgb_with_double_normalization'\n", "]" ] }, @@ -631,9 +613,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Obtaining default configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Obtaining default configuration for dfs_xgb_with_unstack_normalization\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: classes.unstack_normalize_dfs_xgb_classifier \n", + " Template: dfs_xgb_with_unstack_normalization \n", " Hyperparameters: \n", " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", @@ -641,26 +623,46 @@ " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "INFO:btb.session:New optimal found: classes.unstack_normalize_dfs_xgb_classifier - 0.611234532127027\n", - "INFO:btb.session:Obtaining default configuration for classes.normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n" + "INFO:btb.session:New optimal found: dfs_xgb_with_unstack_normalization - 0.6117760013143775\n", + "INFO:btb.session:Obtaining default configuration for dfs_xgb_with_double_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", + "INFO:draco.pipeline:New configuration found:\n", + " Template: dfs_xgb_with_unstack_normalization \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 90\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 342\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 6\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.9043352048331922\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.5258350872963311\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9\n", + "INFO:btb.session:New optimal found: dfs_xgb_with_unstack_normalization - 0.6205571445297473\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", + "INFO:draco.pipeline:New configuration found:\n", + " Template: dfs_xgb_with_double_normalization \n", + " Hyperparameters: \n", + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 80\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 66\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", + "INFO:btb.session:New optimal found: dfs_xgb_with_double_normalization - 0.629513025867624\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n" ] }, { "data": { "text/plain": [ - "{'id': 
'afc8e912142bc6c384231600df9874fc',\n", - " 'name': 'classes.unstack_normalize_dfs_xgb_classifier',\n", + "{'id': '452a22a136f67c575aee3341c9dc2395',\n", + " 'name': 'dfs_xgb_with_double_normalization',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.611234532127027}" + " 'max_labels'): 80,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 66,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.629513025867624}" ] }, "execution_count": 16, @@ -688,16 +690,16 @@ { "data": { "text/plain": [ - "{'id': 'afc8e912142bc6c384231600df9874fc',\n", - " 'name': 'classes.unstack_normalize_dfs_xgb_classifier',\n", + "{'id': '452a22a136f67c575aee3341c9dc2395',\n", + " 'name': 'dfs_xgb_with_double_normalization',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1},\n", - " 'score': 0.611234532127027}" + " 'max_labels'): 80,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 66,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.629513025867624}" ] }, "execution_count": 17, @@ -725,12 +727,12 @@ "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 0,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1}" + " 'max_labels'): 80,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 66,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" ] }, "execution_count": 18, @@ -757,7 +759,7 @@ { "data": { "text/plain": [ - "'classes.unstack_normalize_dfs_xgb_classifier'" + "'dfs_xgb_with_double_normalization'" ] }, "execution_count": 19, @@ -785,7 +787,7 @@ { "data": { "text/plain": [ - "0.611234532127027" + "0.629513025867624" ] }, "execution_count": 20, @@ -815,41 +817,41 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", + "INFO:btb.session:Generating new proposal configuration for 
dfs_xgb_with_unstack_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: classes.unstack_normalize_dfs_xgb_classifier \n", + " Template: dfs_xgb_with_unstack_normalization \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 97\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 364\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 7\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6635800510691365\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9852977392614163\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2\n", - "INFO:btb.session:New optimal found: classes.unstack_normalize_dfs_xgb_classifier - 0.6379648413546719\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.unstack_normalize_dfs_xgb_classifier\n", - "INFO:btb.session:Generating new proposal configuration for classes.normalize_dfs_xgb_classifier\n" + " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 48\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 130\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 8\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7437898568465957\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9963350624783064\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", + "INFO:btb.session:New optimal found: dfs_xgb_with_unstack_normalization - 0.651642052400304\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", + "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n" ] }, { "data": { "text/plain": [ - "{'id': '7e6de03286fd71179e2a2f7b3f089ffb',\n", - " 'name': 'classes.unstack_normalize_dfs_xgb_classifier',\n", + "{'id': '22ec731234212508b7b4413ccce34294',\n", + " 'name': 'dfs_xgb_with_unstack_normalization',\n", " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 97,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 364,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 7,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6635800510691365,\n", - " 
('xgboost.XGBClassifier#1', 'gamma'): 0.9852977392614163,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2},\n", - " 'score': 0.6379648413546719}" + " 'max_labels'): 48,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 130,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 8,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7437898568465957,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9963350624783064,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", + " 'score': 0.651642052400304}" ] }, "execution_count": 21, @@ -869,7 +871,7 @@ { "data": { "text/plain": [ - "0.6379648413546719" + "0.651642052400304" ] }, "execution_count": 22, @@ -890,12 +892,12 @@ "data": { "text/plain": [ "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 97,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 364,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 7,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6635800510691365,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9852977392614163,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 2}" + " 'max_labels'): 48,\n", + " ('xgboost.XGBClassifier#1', 'n_estimators'): 130,\n", + " ('xgboost.XGBClassifier#1', 'max_depth'): 8,\n", + " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7437898568465957,\n", + " ('xgboost.XGBClassifier#1', 'gamma'): 0.9963350624783064,\n", + " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" ] }, "execution_count": 23, @@ -962,7 +964,7 @@ { "data": { "text/plain": [ - "0.7346938775510203" + "0.608695652173913" ] }, "execution_count": 26, @@ -1051,7 +1053,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1065,7 +1067,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.9" + "version": "3.7.11" } }, "nbformat": 4, diff --git a/tutorials/03_Benchmarking.ipynb b/tutorials/03_Benchmarking.ipynb index c7bce62..12c5e47 100644 --- a/tutorials/03_Benchmarking.ipynb +++ b/tutorials/03_Benchmarking.ipynb @@ -76,12 +76,12 @@ "outputs": [], "source": [ "templates = [\n", - " 'probability.unstack_lstm_timeseries_classifier',\n", - " 'probability.normalize_dfs_xgb_classifier'\n", + " 'lstm_prob_with_unstack',\n", + " 'dfs_xgb_prob_with_double_normalization'\n", "]\n", "window_size_rule = [('1d', '1h'), ('2d', '2h')]\n", "init_params = {\n", - " 'unstack_lstm_timeseries_classifier': {\n", + " 'lstm_prob_with_unstack': {\n", " 'keras.Sequential.LSTMTimeSeriesClassifier#1': {\n", " 'epochs': 1,\n", " }\n", @@ -389,7 +389,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -403,7 +403,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.11" } }, "nbformat": 4, diff --git a/tutorials/04_Draco_Regression_Pipeline.ipynb b/tutorials/04_Draco_Regression_Pipeline.ipynb new file mode 100644 index 0000000..709c839 --- /dev/null +++ b/tutorials/04_Draco_Regression_Pipeline.ipynb @@ -0,0 +1,793 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Draco Regression Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this tutorial we will show you how to use Draco Regression pipelines to solve a Machine Learning problem\n", + "defined via a Target Times table.\n", + "\n", + 
"During the next steps we will:\n", + "\n", + "- Load demo Remaining Useful Life (dataset) with training and testing target times and readings\n", + "- Find available pipelines and load one of them\n", + "- Build and fit a Machine Learning pipeline\n", + "- Make predictions using the fitted pipeline\n", + "- Evaluate how good the predictions are" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 0. Setup the logging\n", + "\n", + "This step sets up logging in our environment to increase our visibility over\n", + "the steps that Draco performs." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging;\n", + "\n", + "logging.basicConfig(level=logging.INFO)\n", + "logging.getLogger().setLevel(level=logging.INFO)\n", + "\n", + "import warnings\n", + "warnings.simplefilter(\"ignore\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Load the Data\n", + "\n", + "The first step is to load the data that we are going to use.\n", + "\n", + "In order to use the demo data included in Draco, the `draco.demo.load_demo` function can be used." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from draco.demo import load_demo\n", + "\n", + "train_target_times, test_target_times, readings = load_demo(name='rul')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This will download some demo data from [Draco S3 demo Bucket](\n", + "https://d3-ai-draco.s3.amazonaws.com/index.html)%20and%20load%20it%20as/n", + "the necessary `target_times` and `readings` tables.\n", + "\n", + "The exact format of these tables is described in the Draco README and docs:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
012013-01-12 04:20:00166
112013-01-12 04:30:00165
212013-01-12 04:40:00164
312013-01-12 04:50:00163
412013-01-12 05:00:00162
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 1 2013-01-12 04:20:00 166\n", + "1 1 2013-01-12 04:30:00 165\n", + "2 1 2013-01-12 04:40:00 164\n", + "3 1 2013-01-12 04:50:00 163\n", + "4 1 2013-01-12 05:00:00 162" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_target_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(18131, 3)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_target_times.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "turbine_id int64\n", + "cutoff_time datetime64[ns]\n", + "target int64\n", + "dtype: object" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_target_times.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 1 2013-01-13 13:10:00 112.0\n", + "1 2 2013-01-14 08:00:00 98.0\n", + "2 3 2013-01-14 02:50:00 69.0\n", + "3 4 2013-01-14 01:10:00 82.0\n", + "4 5 2013-01-14 13:10:00 91.0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_target_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(100, 3)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_target_times.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "turbine_id int64\n", + "cutoff_time datetime64[ns]\n", + "target float64\n", + "dtype: object" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_target_times.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampsignal_idvalue
012013-01-12 00:10:00operational setting 1-0.0007
112013-01-12 00:20:00operational setting 10.0019
212013-01-12 00:30:00operational setting 1-0.0043
312013-01-12 00:40:00operational setting 10.0007
412013-01-12 00:50:00operational setting 1-0.0019
\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp signal_id value\n", + "0 1 2013-01-12 00:10:00 operational setting 1 -0.0007\n", + "1 1 2013-01-12 00:20:00 operational setting 1 0.0019\n", + "2 1 2013-01-12 00:30:00 operational setting 1 -0.0043\n", + "3 1 2013-01-12 00:40:00 operational setting 1 0.0007\n", + "4 1 2013-01-12 00:50:00 operational setting 1 -0.0019" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(809448, 4)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "turbine_id int64\n", + "timestamp datetime64[ns]\n", + "signal_id object\n", + "value float64\n", + "dtype: object" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.dtypes" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load your own Dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Alternatively, if you want to load your own dataset, all you have to do is load the\n", + "`target_times` and `readings` tables as `pandas.DataFrame` objects.\n", + "\n", + "Make sure to parse the corresponding datetime fields!\n", + "\n", + "```python\n", + "import pandas as pd\n", + "\n", + "target_times = pd.read_csv('path/to/your/target_times.csv', parse_dates=['cutoff_time'])\n", + "readings = pd.read_csv('path/to/your/readings.csv', parse_dates=['timestamp'])\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Finding the available Pipelines\n", + "\n", + "The next step will be to select a collection of templates from the ones\n", + "available in Draco.\n", + "\n", + "For this, we can use the `draco.get_pipelines` function, which will\n", + "return us the list of all the available MLBlocks pipelines found in the\n", + "Draco system." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['dfs_xgb_prob_with_unstack',\n", + " 'dfs_xgb_with_normalization',\n", + " 'dfs_xgb',\n", + " 'dfs_xgb_with_unstack',\n", + " 'dfs_xgb_prob_with_unstack_normalization',\n", + " 'dfs_xgb_with_unstack_normalization',\n", + " 'dfs_xgb_prob_with_double_normalization',\n", + " 'dfs_xgb_with_double_normalization',\n", + " 'lstm_regressor_with_unstack',\n", + " 'lstm_regressor',\n", + " 'double_lstm_prob_with_unstack',\n", + " 'double_lstm_prob',\n", + " 'double_lstm',\n", + " 'double_lstm_with_unstack',\n", + " 'lstm_prob_with_unstack',\n", + " 'lstm_with_unstack',\n", + " 'lstm_prob',\n", + " 'lstm']" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from draco import get_pipelines\n", + "\n", + "get_pipelines()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Optionally, we can pass a string to select the pipelines that contain it:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['lstm_regressor_with_unstack', 'lstm_regressor']" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "get_pipelines('regressor')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will use the regression pipeline `lstm_regressor_with_unstack`\n", + "\n", + "The `lstm_regressor_with_unstack` pipeline contains the following steps:\n", + "\n", + "- Resample the data using a 10 minute average aggregation\n", + "- Unstack the data by signal, so each signal is in a different column\n", + "- Impute missing values in the readings table\n", + "- Normalize (scale) the data between [-1, 1].\n", + "- Create window sequences using target times.\n", + "- Apply an LSTM Regressor" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'lstm_regressor_with_unstack'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Fitting a Draco Pipeline\n", + "\n", + "Once we have loaded the data, we create a **DracoPipeline** instance by passing `pipeline_name` which is the name of a pipeline, the path to a template json file, or a list that can combine both of them." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from draco.pipeline import DracoPipeline\n", + "\n", + "pipeline = DracoPipeline(pipeline_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To train a pipeline we use the `fit` method passing the `target_times` and the `readings` table:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-02-01 15:05:13.365367: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-02-01 15:05:13.379993: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe6a0ec50a0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", + "2022-02-01 15:05:13.380010: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + ] + } + ], + "source": [ + "pipeline.fit(train_target_times, readings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. Use the fitted pipeline\n", + "\n", + "After fitting the pipeline, we are ready to make predictions on new data:" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = pipeline.predict(test_target_times, readings)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And evaluate its prediction performance:" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.6362969806460871" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.metrics import r2_score\n", + "\n", + "r2_score(test_target_times['target'], predictions)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Save and load the pipeline\n", + "\n", + "Since the tuning and fitting process takes time to execute and requires a lot of data, you\n", + "will probably want to save a fitted instance and load it later to analyze new signals\n", + "instead of fitting pipelines over and over again.\n", + "\n", + "This can be done by using the `save` and `load` methods from the `DracoPipeline`.\n", + "\n", + "In order to save an instance, call its `save` method passing it the path and filename\n", + "where the model should be saved." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "path = 'my_pipeline.pkl'\n", + "\n", + "pipeline.save(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once the pipeline is saved, it can be loaded back as a new `DracoPipeline` by using the\n", + "`DracoPipeline.load` method:" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "new_pipeline = DracoPipeline.load(path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once loaded, it can be directly used to make predictions on new data." 
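+ "\n",
+ "For quick reference, the following is a minimal sketch consolidating the workflow covered in this tutorial. It is not part of the notebook's executed cells; it assumes Draco is installed, that the demo RUL dataset is reachable, and it only uses the `DracoPipeline` methods demonstrated above:\n",
+ "\n",
+ "```python\n",
+ "from draco.demo import load_demo\n",
+ "from draco.pipeline import DracoPipeline\n",
+ "\n",
+ "# Load the demo RUL dataset (downloaded and cached on first call).\n",
+ "train_target_times, test_target_times, readings = load_demo(name='rul')\n",
+ "\n",
+ "# Build and fit the regression pipeline used in this tutorial.\n",
+ "pipeline = DracoPipeline('lstm_regressor_with_unstack')\n",
+ "pipeline.fit(train_target_times, readings)\n",
+ "\n",
+ "# Persist the fitted pipeline and restore it later.\n",
+ "pipeline.save('my_pipeline.pkl')\n",
+ "new_pipeline = DracoPipeline.load('my_pipeline.pkl')\n",
+ "\n",
+ "# The restored pipeline predicts directly on new data.\n",
+ "predictions = new_pipeline.predict(test_target_times, readings)\n",
+ "```"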
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[129.89064 ],\n", + " [139.89001 ],\n", + " [ 39.425865],\n", + " [110.67838 ],\n", + " [ 98.52903 ]], dtype=float32)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "predictions = new_pipeline.predict(test_target_times, readings)\n", + "predictions[0:5]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/Convert NASA CMAPSS to Draco Format.ipynb b/tutorials/Convert NASA CMAPSS to Draco Format.ipynb new file mode 100644 index 0000000..bf5af31 --- /dev/null +++ b/tutorials/Convert NASA CMAPSS to Draco Format.ipynb @@ -0,0 +1,406 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2f3d8acf", + "metadata": {}, + "source": [ + "# Convert CMAPSS to Draco Format\n", + "\n", + "In this notebook we download [CMAPSS](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan) data and reformat it as Draco pipelines expect." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "f39b805c", + "metadata": {}, + "outputs": [], + "source": [ + "import datetime\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "id": "626a2da0", + "metadata": {}, + "source": [ + "## 1. Download Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "ff641cff", + "metadata": {}, + "outputs": [], + "source": [ + "import io\n", + "import os\n", + "import urllib\n", + "import zipfile\n", + "\n", + "DATA_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/CMAPSSData.zip'\n", + "\n", + "response = urllib.request.urlopen(DATA_URL)\n", + "bytes_io = io.BytesIO(response.read())\n", + "\n", + "with zipfile.ZipFile(bytes_io) as zf:\n", + " zf.extractall('CMAPSSData')" + ] + }, + { + "cell_type": "markdown", + "id": "9c435699", + "metadata": {}, + "source": [ + "## 2. Read Data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1bb002ac", + "metadata": {}, + "outputs": [], + "source": [ + "# columns\n", + "\n", + "index = ['unit number', 'time, in cycles']\n", + "setting = ['operational setting {}'.format(i + 1) for i in range(0, 3)]\n", + "sensor = ['sensor measurement {}'.format(i + 1) for i in range(0, 21)]\n", + "\n", + "all_columns = index + setting + sensor" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "74478b0f", + "metadata": {}, + "outputs": [], + "source": [ + "train = pd.read_csv('CMAPSSData/train_FD001.txt', sep=' ', header=None)\n", + "train = train.dropna(axis=1)\n", + "train.columns = all_columns\n", + "\n", + "test = pd.read_csv('CMAPSSData/test_FD001.txt', sep=' ', header=None)\n", + "test = test.dropna(axis=1)\n", + "test.columns = all_columns\n", + "\n", + "y_test = pd.read_csv('CMAPSSData/RUL_FD001.txt', sep=' ', header=None)\n", + "y_test = y_test.dropna(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "dd480185", + "metadata": {}, + "source": [ + "## 3. 
Create columns\n", + "\n", + "### 3.a create `RUL` column\n", + "How do we create **Remaining Useful Life (RUL)** column for the training dataset? We can assume that the last entry in the training dataset is the maximum life expectancy for that unit. Then each cycle we have will decrease by that number." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "eb0270ba", + "metadata": {}, + "outputs": [], + "source": [ + "def get_max(x):\n", + " return cycles_max[x]\n", + "\n", + "cycles_max = train.groupby(\"unit number\")[\"time, in cycles\"].max().to_dict()\n", + "cycles_max = train['unit number'].apply(get_max)\n", + "\n", + "train['RUL'] = cycles_max - train[\"time, in cycles\"]" + ] + }, + { + "cell_type": "markdown", + "id": "57fbd3b9", + "metadata": {}, + "source": [ + "### 3.b create `cutoff_time` column\n", + "`cutoff_time` is a datetime column with relation to the `cycle` number. We pick a start date and start incrementing from there." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "3e320356", + "metadata": {}, + "outputs": [], + "source": [ + "def get_timestamp(x):\n", + " return start + datetime.timedelta(minutes=x * 10)\n", + "\n", + "start = datetime.datetime(2013, 1, 12)\n", + "train['timestamp'] = train['time, in cycles'].apply(get_timestamp)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "11f78b71", + "metadata": {}, + "outputs": [], + "source": [ + "def get_timestamp_test(x):\n", + " return last[x['unit number']] + datetime.timedelta(minutes=x['time, in cycles'] * 10)\n", + "\n", + "last = train.groupby('unit number').last()['timestamp'].to_dict()\n", + "test['timestamp'] = test.apply(get_timestamp_test, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "95bec88f", + "metadata": {}, + "source": [ + "### 4. Format Data\n", + "\n", + "make `label_times` have three columns, namely: `['turbine_id', 'cutoff_time', 'target']`." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1ce4320e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
2512013-01-12 04:20:00166
2612013-01-12 04:30:00165
2712013-01-12 04:40:00164
2812013-01-12 04:50:00163
2912013-01-12 05:00:00162
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "25 1 2013-01-12 04:20:00 166\n", + "26 1 2013-01-12 04:30:00 165\n", + "27 1 2013-01-12 04:40:00 164\n", + "28 1 2013-01-12 04:50:00 163\n", + "29 1 2013-01-12 05:00:00 162" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_label_times = train[['unit number', 'timestamp', 'RUL']].copy()\n", + "train_label_times.columns = ['turbine_id', 'cutoff_time', 'target']\n", + "\n", + "# drop first 24 occurances\n", + "train_label_times = train_label_times[train_label_times.groupby('turbine_id').cumcount('turbine_id') > 24]\n", + "train_label_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "f320e753", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 1 2013-01-13 13:10:00 112.0\n", + "1 2 2013-01-14 08:00:00 98.0\n", + "2 3 2013-01-14 02:50:00 69.0\n", + "3 4 2013-01-14 01:10:00 82.0\n", + "4 5 2013-01-14 13:10:00 91.0" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_label_times = test[['unit number', 'timestamp']].groupby('unit number').last().reset_index()\n", + "test_label_times.columns = ['turbine_id', 'cutoff_time']\n", + "test_label_times['target'] = np.array(y_test).astype('float32')\n", + "test_label_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "50be8dc4", + "metadata": {}, + "outputs": [], + "source": [ + "reading_columns = ['unit number', 'timestamp'] + setting + sensor\n", + "readings = pd.concat([train, test])[reading_columns]\n", + "readings = readings.melt(id_vars=['unit number', 'timestamp'])\n", + "readings.columns = ['turbine_id', 'timestamp', 'signal_id', 'value']" + ] + }, + { + "cell_type": "markdown", + "id": "01a77e60", + "metadata": {}, + "source": [ + "## 5. Save Data" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "5f622ff7", + "metadata": {}, + "outputs": [], + "source": [ + "readings.to_csv('rul_readings.csv.gz', compression='gzip', index=False)\n", + "train_label_times.to_csv('rul_train_target_times.csv.gz', compression='gzip', index=False)\n", + "test_label_times.to_csv('rul_test_target_times.csv.gz', compression='gzip', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb similarity index 58% rename from tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb rename to tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb index ca00d58..6fd5f1e 100644 --- a/tutorials/pipelines/normalize_dfs_xgb_classifier.ipynb +++ b/tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# normalize_dfs_xgb_classifier" + "# dfs_xgb_with_double_normalization" ] }, { @@ -24,14 +24,23 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = 'classes.normalize_dfs_xgb_classifier'" + "pipeline_name = 'dfs_xgb_with_double_normalization'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/dask/dataframe/utils.py:14: FutureWarning: pandas.util.testing is deprecated. 
Use the functions in the public API at pandas.testing instead.\n", + " import pandas.util.testing as tm\n" + ] + } + ], "source": [ "from draco.pipeline import DracoPipeline\n", "\n", @@ -46,10 +55,7 @@ { "data": { "text/plain": [ - "['pandas.DataFrame.resample',\n", - " 'featuretools.EntitySet.entity_from_dataframe',\n", - " 'featuretools.EntitySet.normalize_entity',\n", - " 'featuretools.EntitySet.normalize_entity',\n", + "['mlblocks.MLPipeline',\n", " 'featuretools.dfs',\n", " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", " 'xgboost.XGBClassifier']" @@ -267,360 +273,28 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## pandas.DataFrame.resample\n", + "## mlblocks.MLPipeline\n", + "\n", + "### pandas.DataFrame.resample\n", "\n", "* Input: readings\n", "* Output: readings (resampled)\n", "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "step = 0\n", - "context = pipeline.fit(target_times, readings, output_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10 00:00:00323.0
1T001S012013-01-10 00:10:00346.0
2T001S012013-01-10 00:20:00407.0
3T001S012013-01-10 00:30:00257.0
4T001S012013-01-10 00:40:00267.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 00:00:00 323.0\n", - "1 T001 S01 2013-01-10 00:10:00 346.0\n", - "2 T001 S01 2013-01-10 00:20:00 407.0\n", - "3 T001 S01 2013-01-10 00:30:00 257.0\n", - "4 T001 S01 2013-01-10 00:40:00 267.0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## featuretools.EntitySet.entity_from_dataframe\n", + " signal_id and timestamp have been set as a multi-index\n", + "\n", + "### featuretools.EntitySet.entity_from_dataframe\n", "\n", "* Input: readings (resampled)\n", "* Output: entityset\n", - "* Effect: Entityset has been generated from readings" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "step = 1\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: entityset\n", - " Entities:\n", - " readings [Rows: 1329146, Columns: 5]\n", - " Relationships:\n", - " No relationships" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['entityset']" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10 00:00:00323.0
1T001S012013-01-10 00:10:00346.0
2T001S012013-01-10 00:20:00407.0
3T001S012013-01-10 00:30:00257.0
4T001S012013-01-10 00:40:00267.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 00:00:00 323.0\n", - "1 T001 S01 2013-01-10 00:10:00 346.0\n", - "2 T001 S01 2013-01-10 00:20:00 407.0\n", - "3 T001 S01 2013-01-10 00:30:00 257.0\n", - "4 T001 S01 2013-01-10 00:40:00 267.0" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## featuretools.EntitySet.normalize_entity\n", + "* Effect: Entityset has been generated from readings\n", + "\n", + "### featuretools.EntitySet.normalize_entity\n", "\n", "* Input: entityset\n", "* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)\n", - "* Effect: establish relation between readings and turbines" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "step = 2\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: entityset\n", - " Entities:\n", - " readings [Rows: 1329146, Columns: 5]\n", - " turbines [Rows: 1, Columns: 1]\n", - " Relationships:\n", - " readings.turbine_id -> turbines.turbine_id" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['entityset']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## featuretools.EntitySet.normalize_entity\n", + "* Effect: establish relation between readings and turbines\n", + "\n", + "### featuretools.EntitySet.normalize_entity\n", "\n", "* Input: entityset\n", "* Output: entityset with relationship (readings.signal_id with signals.signal_id)\n", @@ -629,26 +303,26 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "step = 3\n", - "context = pipeline.fit(**context, output_=step, start_=step)" + "step = 0\n", + "context = pipeline.fit(target_times, readings, output_=step)" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" ] }, - "execution_count": 18, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -659,7 +333,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -675,7 +349,7 @@ " readings.signal_id -> signals.signal_id" ] }, - "execution_count": 19, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -697,17 +371,17 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "step = 4\n", + "step = 1\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -716,7 +390,7 @@ "dict_keys(['readings', 'turbines', 
'entityset', 'X', 'y'])" ] }, - "execution_count": 21, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -727,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -751,27 +425,27 @@ " \n", " \n", " \n", - " COUNT(readings)\n", + " SUM(readings.value)\n", + " STD(readings.value)\n", " MAX(readings.value)\n", - " MEAN(readings.value)\n", + " SKEW(readings.value)\n", " MIN(readings.value)\n", - " MODE(readings.signal_id)\n", + " MEAN(readings.value)\n", + " COUNT(readings)\n", " NUM_UNIQUE(readings.signal_id)\n", - " SKEW(readings.value)\n", - " STD(readings.value)\n", - " SUM(readings.value)\n", - " MODE(readings.DAY(timestamp))\n", + " MODE(readings.signal_id)\n", + " NUM_UNIQUE(readings.DAY(timestamp))\n", " ...\n", - " SKEW(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", - " SKEW(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", + " MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", + " MEAN(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))\n", + " MODE(readings.signals.MODE(readings.DAY(timestamp)))\n", + " MODE(readings.signals.MODE(readings.YEAR(timestamp)))\n", + " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", + " MODE(readings.signals.MODE(readings.MONTH(timestamp)))\n", " \n", " \n", " turbine_id\n", @@ -801,123 +475,123 @@ " \n", " \n", " T001\n", - " 3744\n", + " 3.433649e+09\n", + " 1.456860e+06\n", " 3448719.0\n", - " 917107.079193\n", - " 0.0\n", - " S01\n", - " 26\n", " 1.019214\n", - " 1.456860e+06\n", - " 3.433649e+09\n", - " 11\n", - " ...\n", - " 0\n", - " 0\n", - " 0.0\n", " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", + " 917107.079193\n", " 3744\n", + " 26\n", + " S01\n", + " 2\n", + " ...\n", + " 2.0\n", + " 1.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 11\n", + " 2013\n", + " 4\n", + " 1\n", " \n", " \n", " T001\n", - " 3744\n", + " 3.441489e+09\n", + " 1.459865e+06\n", " 3453777.0\n", - " 919201.162179\n", - " 0.0\n", - " S01\n", - " 26\n", " 1.018761\n", - " 1.459865e+06\n", - " 3.441489e+09\n", - " 12\n", - " ...\n", - " 0\n", - " 0\n", - " 0.0\n", - " 0.0\n", " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", - " \n", - " \n", - " T001\n", + " 919201.162179\n", " 3744\n", - " 3463880.0\n", - " 922935.352244\n", - " 2.7\n", - " S01\n", " 26\n", - " 1.018192\n", - " 1.465277e+06\n", - " 3.455470e+09\n", - " 13\n", + " S01\n", + " 2\n", " ...\n", - " 0\n", - " 0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 2.0\n", + " 1.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 12\n", + " 2013\n", + " 
5\n", + " 1\n", " \n", " \n", " T001\n", + " 3.455470e+09\n", + " 1.465277e+06\n", + " 3463880.0\n", + " 1.018192\n", + " 2.7\n", + " 922935.352244\n", " 3744\n", - " 3474703.0\n", - " 928248.092869\n", - " -1.0\n", - " S01\n", " 26\n", - " 1.017666\n", - " 1.473337e+06\n", - " 3.475361e+09\n", - " 14\n", + " S01\n", + " 2\n", " ...\n", - " 0\n", - " 0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 2.0\n", + " 1.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 13\n", + " 2013\n", + " 6\n", + " 1\n", " \n", " \n", " T001\n", + " 3.475361e+09\n", + " 1.473337e+06\n", + " 3474703.0\n", + " 1.017666\n", + " -1.0\n", + " 928248.092869\n", " 3744\n", - " 3485019.0\n", - " 924186.531200\n", - " 0.0\n", - " S01\n", " 26\n", - " 1.032002\n", - " 1.477958e+06\n", - " 2.888083e+09\n", - " 15\n", + " S01\n", + " 2\n", " ...\n", + " 2.0\n", + " 1.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 14\n", + " 2013\n", " 0\n", - " 0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", + " 1\n", + " \n", + " \n", + " T001\n", + " 2.888083e+09\n", + " 1.477958e+06\n", + " 3485019.0\n", + " 1.032002\n", " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", + " 924186.531200\n", " 3744\n", + " 26\n", + " S01\n", + " 2\n", + " ...\n", + " 2.0\n", + " 1.0\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 15\n", + " 2013\n", + " 1\n", + " 1\n", " \n", " \n", "\n", @@ -925,130 +599,122 @@ "" ], "text/plain": [ - " COUNT(readings) MAX(readings.value) MEAN(readings.value) \\\n", - "turbine_id \n", - "T001 3744 3448719.0 917107.079193 \n", - "T001 3744 3453777.0 919201.162179 \n", - "T001 3744 3463880.0 922935.352244 \n", - "T001 3744 3474703.0 928248.092869 \n", - "T001 3744 3485019.0 924186.531200 \n", - "\n", - " MIN(readings.value) MODE(readings.signal_id) \\\n", - "turbine_id \n", - "T001 0.0 S01 \n", - "T001 0.0 S01 \n", - "T001 2.7 S01 \n", - "T001 -1.0 S01 \n", - "T001 0.0 S01 \n", + " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", + "turbine_id \n", + "T001 3.433649e+09 1.456860e+06 3448719.0 \n", + "T001 3.441489e+09 1.459865e+06 3453777.0 \n", + "T001 3.455470e+09 1.465277e+06 3463880.0 \n", + "T001 3.475361e+09 1.473337e+06 3474703.0 \n", + "T001 2.888083e+09 1.477958e+06 3485019.0 \n", "\n", - " NUM_UNIQUE(readings.signal_id) SKEW(readings.value) \\\n", - "turbine_id \n", - "T001 26 1.019214 \n", - "T001 26 1.018761 \n", - "T001 26 1.018192 \n", - "T001 26 1.017666 \n", - "T001 26 1.032002 \n", + " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", + "turbine_id \n", + "T001 1.019214 0.0 917107.079193 \n", + "T001 1.018761 0.0 919201.162179 \n", + "T001 1.018192 2.7 922935.352244 \n", + "T001 1.017666 -1.0 928248.092869 \n", + "T001 1.032002 0.0 924186.531200 \n", "\n", - " STD(readings.value) SUM(readings.value) \\\n", - "turbine_id \n", - "T001 1.456860e+06 3.433649e+09 \n", - "T001 1.459865e+06 3.441489e+09 \n", - "T001 1.465277e+06 3.455470e+09 \n", - "T001 1.473337e+06 3.475361e+09 \n", - "T001 1.477958e+06 2.888083e+09 \n", + " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", + "turbine_id \n", + "T001 3744 26 \n", + "T001 3744 26 \n", + "T001 3744 26 \n", + "T001 3744 26 \n", + "T001 3744 26 \n", "\n", - " MODE(readings.DAY(timestamp)) ... \\\n", - "turbine_id ... \n", - "T001 11 ... \n", - "T001 12 ... \n", - "T001 13 ... \n", - "T001 14 ... \n", - "T001 15 ... \n", + " MODE(readings.signal_id) NUM_UNIQUE(readings.DAY(timestamp)) ... \\\n", + "turbine_id ... \n", + "T001 S01 2 ... \n", + "T001 S01 2 ... 
\n", + "T001 S01 2 ... \n", + "T001 S01 2 ... \n", + "T001 S01 2 ... \n", "\n", - " SKEW(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", + " MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", "turbine_id \n", - "T001 0 \n", - "T001 0 \n", - "T001 0 \n", - "T001 0 \n", - "T001 0 \n", + "T001 2.0 \n", + "T001 2.0 \n", + "T001 2.0 \n", + "T001 2.0 \n", + "T001 2.0 \n", "\n", - " SKEW(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", + " MEAN(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", "turbine_id \n", - "T001 0 \n", - "T001 0 \n", - "T001 0 \n", - "T001 0 \n", - "T001 0 \n", + "T001 1.0 \n", + "T001 1.0 \n", + "T001 1.0 \n", + "T001 1.0 \n", + "T001 1.0 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", + " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", + " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 11 \n", + "T001 12 \n", + "T001 13 \n", + "T001 14 \n", + "T001 15 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", + " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", + " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 4 \n", + "T001 5 \n", + "T001 6 \n", + "T001 0 \n", + "T001 1 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \n", - "turbine_id \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", + " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", "[5 rows x 99 columns]" ] }, - "execution_count": 22, + 
"execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -1059,7 +725,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -1068,7 +734,7 @@ "99" ] }, - "execution_count": 23, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -1080,7 +746,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1115,51 +781,51 @@ " 0\n", " T001\n", " S01\n", - " 2013-01-10 00:00:00\n", + " 2013-01-10\n", " 323.0\n", " \n", " \n", " 1\n", " T001\n", - " S01\n", - " 2013-01-10 00:10:00\n", - " 346.0\n", + " S02\n", + " 2013-01-10\n", + " 320.0\n", " \n", " \n", " 2\n", " T001\n", - " S01\n", - " 2013-01-10 00:20:00\n", - " 407.0\n", + " S03\n", + " 2013-01-10\n", + " 284.0\n", " \n", " \n", " 3\n", " T001\n", - " S01\n", - " 2013-01-10 00:30:00\n", - " 257.0\n", + " S04\n", + " 2013-01-10\n", + " 348.0\n", " \n", " \n", " 4\n", " T001\n", - " S01\n", - " 2013-01-10 00:40:00\n", - " 267.0\n", + " S05\n", + " 2013-01-10\n", + " 273.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 00:00:00 323.0\n", - "1 T001 S01 2013-01-10 00:10:00 346.0\n", - "2 T001 S01 2013-01-10 00:20:00 407.0\n", - "3 T001 S01 2013-01-10 00:30:00 257.0\n", - "4 T001 S01 2013-01-10 00:40:00 267.0" + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" ] }, - "execution_count": 24, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1181,17 +847,17 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "step = 5\n", + "step = 2\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1200,7 +866,7 @@ "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" ] }, - "execution_count": 26, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1211,7 +877,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1235,25 +901,25 @@ " \n", " \n", " \n", - " COUNT(readings)\n", + " SUM(readings.value)\n", + " STD(readings.value)\n", " MAX(readings.value)\n", - " MEAN(readings.value)\n", + " SKEW(readings.value)\n", " MIN(readings.value)\n", + " MEAN(readings.value)\n", + " COUNT(readings)\n", " NUM_UNIQUE(readings.signal_id)\n", - " SKEW(readings.value)\n", - " STD(readings.value)\n", - " SUM(readings.value)\n", - " MODE(readings.DAY(timestamp))\n", - " MODE(readings.MONTH(timestamp))\n", + " NUM_UNIQUE(readings.DAY(timestamp))\n", + " NUM_UNIQUE(readings.MONTH(timestamp))\n", " ...\n", - " STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", - " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))\n", - " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))\n", + " 
NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", + " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))\n", + " MODE(readings.signals.MODE(readings.DAY(timestamp)))\n", + " MODE(readings.signals.MODE(readings.YEAR(timestamp)))\n", + " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))\n", + " MODE(readings.signals.MODE(readings.MONTH(timestamp)))\n", " MODE(readings.signal_id)=S01\n", " MODE(readings.signals.MODE(readings.turbine_id))=T001\n", " \n", @@ -1285,121 +951,121 @@ " \n", " \n", " T001\n", - " 3744\n", + " 3.433649e+09\n", + " 1.456860e+06\n", " 3448719.0\n", - " 917107.079193\n", + " 1.019214\n", " 0.0\n", + " 917107.079193\n", + " 3744\n", " 26\n", - " 1.019214\n", - " 1.456860e+06\n", - " 3.433649e+09\n", - " 11\n", + " 2\n", " 1\n", " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 11\n", + " 2013\n", + " 4\n", + " 1\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3744\n", + " 3.441489e+09\n", + " 1.459865e+06\n", " 3453777.0\n", - " 919201.162179\n", + " 1.018761\n", " 0.0\n", + " 919201.162179\n", + " 3744\n", " 26\n", - " 1.018761\n", - " 1.459865e+06\n", - " 3.441489e+09\n", - " 12\n", + " 2\n", " 1\n", " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 12\n", + " 2013\n", + " 5\n", + " 1\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3744\n", + " 3.455470e+09\n", + " 1.465277e+06\n", " 3463880.0\n", - " 922935.352244\n", + " 1.018192\n", " 2.7\n", + " 922935.352244\n", + " 3744\n", " 26\n", - " 1.018192\n", - " 1.465277e+06\n", - " 3.455470e+09\n", - " 13\n", + " 2\n", " 1\n", " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 13\n", + " 2013\n", + " 6\n", + " 1\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3744\n", + " 3.475361e+09\n", + " 1.473337e+06\n", " 3474703.0\n", - " 928248.092869\n", + " 1.017666\n", " -1.0\n", + " 928248.092869\n", + " 3744\n", " 26\n", - " 1.017666\n", - " 1.473337e+06\n", - " 3.475361e+09\n", - " 14\n", + " 2\n", " 1\n", " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 14\n", + " 2013\n", + " 0\n", + " 1\n", " 1\n", " 1\n", " \n", " \n", " T001\n", - " 3744\n", + " 2.888083e+09\n", + " 1.477958e+06\n", " 3485019.0\n", - " 924186.531200\n", + " 1.032002\n", " 0.0\n", + " 924186.531200\n", + " 3744\n", " 26\n", - " 1.032002\n", - " 1.477958e+06\n", - " 2.888083e+09\n", - " 15\n", + " 2\n", " 1\n", " ...\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 0.0\n", - " 7488\n", - " 3744\n", - " 7488\n", - " 3744\n", + " 1\n", + " 1\n", + " 1\n", + " 1\n", + " 15\n", + " 2013\n", + " 1\n", + " 1\n", " 1\n", " 1\n", " \n", @@ -1409,101 +1075,109 @@ "" ], "text/plain": [ - " COUNT(readings) MAX(readings.value) MEAN(readings.value) \\\n", - "turbine_id \n", - "T001 3744 3448719.0 917107.079193 \n", - "T001 3744 3453777.0 919201.162179 \n", - "T001 3744 3463880.0 922935.352244 \n", - "T001 3744 3474703.0 928248.092869 \n", - "T001 3744 3485019.0 924186.531200 \n", + " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", + "turbine_id \n", + "T001 3.433649e+09 
1.456860e+06 3448719.0 \n", + "T001 3.441489e+09 1.459865e+06 3453777.0 \n", + "T001 3.455470e+09 1.465277e+06 3463880.0 \n", + "T001 3.475361e+09 1.473337e+06 3474703.0 \n", + "T001 2.888083e+09 1.477958e+06 3485019.0 \n", "\n", - " MIN(readings.value) NUM_UNIQUE(readings.signal_id) \\\n", - "turbine_id \n", - "T001 0.0 26 \n", - "T001 0.0 26 \n", - "T001 2.7 26 \n", - "T001 -1.0 26 \n", - "T001 0.0 26 \n", + " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", + "turbine_id \n", + "T001 1.019214 0.0 917107.079193 \n", + "T001 1.018761 0.0 919201.162179 \n", + "T001 1.018192 2.7 922935.352244 \n", + "T001 1.017666 -1.0 928248.092869 \n", + "T001 1.032002 0.0 924186.531200 \n", "\n", - " SKEW(readings.value) STD(readings.value) SUM(readings.value) \\\n", - "turbine_id \n", - "T001 1.019214 1.456860e+06 3.433649e+09 \n", - "T001 1.018761 1.459865e+06 3.441489e+09 \n", - "T001 1.018192 1.465277e+06 3.455470e+09 \n", - "T001 1.017666 1.473337e+06 3.475361e+09 \n", - "T001 1.032002 1.477958e+06 2.888083e+09 \n", + " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", + "turbine_id \n", + "T001 3744 26 \n", + "T001 3744 26 \n", + "T001 3744 26 \n", + "T001 3744 26 \n", + "T001 3744 26 \n", "\n", - " MODE(readings.DAY(timestamp)) MODE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 11 1 \n", - "T001 12 1 \n", - "T001 13 1 \n", - "T001 14 1 \n", - "T001 15 1 \n", + " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", + "turbine_id \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "T001 2 \n", + "\n", + " NUM_UNIQUE(readings.MONTH(timestamp)) ... \\\n", + "turbine_id ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "T001 1 ... \n", + "T001 1 ... \n", "\n", - " ... STD(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", - "turbine_id ... \n", - "T001 ... 0.0 \n", - "T001 ... 0.0 \n", - "T001 ... 0.0 \n", - "T001 ... 0.0 \n", - "T001 ... 
0.0 \n", + " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", + " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " STD(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", - "T001 0.0 \n", + " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", + " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", + "turbine_id \n", + "T001 11 \n", + "T001 12 \n", + "T001 13 \n", + "T001 14 \n", + "T001 15 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", + " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", + "turbine_id \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", + "T001 2013 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", - "T001 7488 \n", + " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", + "turbine_id \n", + "T001 4 \n", + "T001 5 \n", + "T001 6 \n", + "T001 0 \n", + "T001 1 \n", "\n", - " SUM(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", - "T001 3744 \n", + " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", " MODE(readings.signal_id)=S01 \\\n", "turbine_id \n", @@ -1524,7 +1198,7 @@ "[5 rows x 99 columns]" ] }, - "execution_count": 27, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1535,7 +1209,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1570,51 +1244,51 @@ " 0\n", " T001\n", " S01\n", - " 2013-01-10 00:00:00\n", + " 2013-01-10\n", " 323.0\n", " \n", " \n", " 1\n", " T001\n", - " S01\n", - " 2013-01-10 00:10:00\n", - " 346.0\n", + " S02\n", + " 2013-01-10\n", + " 320.0\n", " \n", " \n", " 2\n", " T001\n", - " S01\n", - " 2013-01-10 00:20:00\n", - " 407.0\n", + " S03\n", + " 2013-01-10\n", + " 284.0\n", " \n", " \n", " 3\n", " T001\n", - " S01\n", - " 2013-01-10 00:30:00\n", - " 257.0\n", + " S04\n", + " 2013-01-10\n", + " 348.0\n", " \n", " \n", " 4\n", " T001\n", - " S01\n", - " 2013-01-10 00:40:00\n", - " 267.0\n", + " S05\n", + " 
2013-01-10\n", + " 273.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 00:00:00 323.0\n", - "1 T001 S01 2013-01-10 00:10:00 346.0\n", - "2 T001 S01 2013-01-10 00:20:00 407.0\n", - "3 T001 S01 2013-01-10 00:30:00 257.0\n", - "4 T001 S01 2013-01-10 00:40:00 267.0" + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" ] }, - "execution_count": 28, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1636,17 +1310,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ - "step = 6\n", + "step = 3\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1655,7 +1329,7 @@ "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" ] }, - "execution_count": 30, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1667,7 +1341,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1681,7 +1355,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.11" } }, "nbformat": 4, diff --git a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb b/tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb similarity index 68% rename from tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb rename to tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb index 84530a2..5731706 100644 --- a/tutorials/pipelines/unstack_normalize_dfs_xgb_classifier.ipynb +++ b/tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# unstack_normalize_dfs_xgb_classifier" + "# dfs_xgb_with_unstack_normalization" ] }, { @@ -24,14 +24,23 @@ "metadata": {}, "outputs": [], "source": [ - "pipeline_name = 'classes.unstack_normalize_dfs_xgb_classifier'" + "pipeline_name = 'dfs_xgb_with_unstack_normalization'" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/dask/dataframe/utils.py:14: FutureWarning: pandas.util.testing is deprecated. 
Use the functions in the public API at pandas.testing instead.\n", + " import pandas.util.testing as tm\n" + ] + } + ], "source": [ "from draco.pipeline import DracoPipeline\n", "\n", @@ -46,10 +55,8 @@ { "data": { "text/plain": [ - "['pandas.DataFrame.resample',\n", - " 'pandas.DataFrame.unstack',\n", - " 'featuretools.EntitySet.entity_from_dataframe',\n", - " 'featuretools.EntitySet.normalize_entity',\n", + "['mlblocks.MLPipeline',\n", + " 'mlblocks.MLPipeline',\n", " 'featuretools.dfs',\n", " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", " 'xgboost.XGBClassifier']" @@ -267,12 +274,20 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## pandas.DataFrame.resample\n", + "## mlblocks.MLPipeline 1\n", + "\n", + "### pandas.DataFrame.resample\n", "\n", "* Input: readings\n", "* Output: readings (resampled)\n", "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index" + " signal_id and timestamp have been set as a multi-index\n", + " \n", + "### pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" ] }, { @@ -309,130 +324,6 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00323.0
2013-01-10 00:10:00346.0
2013-01-10 00:20:00407.0
2013-01-10 00:30:00257.0
2013-01-10 00:40:00267.0
\n", - "
" - ], - "text/plain": [ - " value\n", - "turbine_id signal_id timestamp \n", - "T001 S01 2013-01-10 00:00:00 323.0\n", - " 2013-01-10 00:10:00 346.0\n", - " 2013-01-10 00:20:00 407.0\n", - " 2013-01-10 00:30:00 257.0\n", - " 2013-01-10 00:40:00 267.0" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.unstack\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: readings (unstacked)\n", - "* Effect: readings have been unstacked" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "step = 1\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, "outputs": [ { "data": { @@ -636,7 +527,7 @@ "[5 rows x 28 columns]" ] }, - "execution_count": 12, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -649,72 +540,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## featuretools.EntitySet.entity_from_dataframe\n", + "## mlblocks.MLPipeline 2\n", + "\n", + "### featuretools.EntitySet.entity_from_dataframe\n", "\n", "* Input: readings (resampled)\n", "* Output: entityset\n", - "* Effect: Entityset has been generated from readings" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "step = 2\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: entityset\n", - " Entities:\n", - " readings [Rows: 51121, Columns: 29]\n", - " Relationships:\n", - " No relationships" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['entityset']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## featuretools.EntitySet.normalize_entity\n", + "* Effect: Entityset has been generated from readings\n", + "\n", + "### featuretools.EntitySet.normalize_entity\n", "\n", "* Input: entityset\n", "* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)\n", @@ -723,26 +557,26 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ - "step = 3\n", + "step = 1\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" + "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" ] }, - "execution_count": 17, + "execution_count": 11, "metadata": 
{}, "output_type": "execute_result" } @@ -753,7 +587,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -767,7 +601,7 @@ " readings.turbine_id -> turbines.turbine_id" ] }, - "execution_count": 18, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -789,17 +623,17 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "step = 4\n", + "step = 2\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -808,7 +642,7 @@ "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" ] }, - "execution_count": 20, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -819,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -843,27 +677,27 @@ " \n", " \n", " \n", - " COUNT(readings)\n", - " MAX(readings.value_S01)\n", - " MAX(readings.value_S02)\n", - " MAX(readings.value_S03)\n", - " MAX(readings.value_S04)\n", - " MAX(readings.value_S05)\n", - " MAX(readings.value_S06)\n", - " MAX(readings.value_S07)\n", - " MAX(readings.value_S08)\n", - " MAX(readings.value_S09)\n", - " ...\n", + " SUM(readings.value_S14)\n", + " SUM(readings.value_S11)\n", " SUM(readings.value_S25)\n", - " SUM(readings.value_S26)\n", - " MODE(readings.DAY(timestamp))\n", - " MODE(readings.MONTH(timestamp))\n", - " MODE(readings.WEEKDAY(timestamp))\n", - " MODE(readings.YEAR(timestamp))\n", - " NUM_UNIQUE(readings.DAY(timestamp))\n", + " SUM(readings.value_S23)\n", + " SUM(readings.value_S17)\n", + " SUM(readings.value_S19)\n", + " SUM(readings.value_S04)\n", + " SUM(readings.value_S05)\n", + " SUM(readings.value_S21)\n", + " SUM(readings.value_S16)\n", + " ...\n", + " MEAN(readings.value_S20)\n", + " COUNT(readings)\n", + " NUM_UNIQUE(readings.YEAR(timestamp))\n", " NUM_UNIQUE(readings.MONTH(timestamp))\n", + " NUM_UNIQUE(readings.DAY(timestamp))\n", " NUM_UNIQUE(readings.WEEKDAY(timestamp))\n", - " NUM_UNIQUE(readings.YEAR(timestamp))\n", + " MODE(readings.YEAR(timestamp))\n", + " MODE(readings.MONTH(timestamp))\n", + " MODE(readings.DAY(timestamp))\n", + " MODE(readings.WEEKDAY(timestamp))\n", " \n", " \n", " turbine_id\n", @@ -893,122 +727,122 @@ " \n", " \n", " T001\n", - " 144\n", - " 369.0\n", - " 376.0\n", - " 378.0\n", - " 401.0\n", - " 317.0\n", - " 324.0\n", - " 301.0\n", - " 3209069.0\n", - " 706654.0\n", - " ...\n", + " 465421817.0\n", + " 496362516.0\n", " 2743.0\n", - " 20569.0\n", - " 11\n", + " 2780.0\n", + " 994.6\n", + " 3174.0\n", + " 19412.0\n", + " 17083.0\n", + " 3061.0\n", + " 550.4\n", + " ...\n", + " 22.326389\n", + " 144\n", " 1\n", - " 4\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 11\n", + " 4\n", " \n", " \n", " T001\n", - " 144\n", - " 505.0\n", - " 426.0\n", - " 393.0\n", - " 517.0\n", - " 469.0\n", - " 407.0\n", - " 459.0\n", - " 3214181.0\n", - " 711718.0\n", - " ...\n", + " 465897578.0\n", + " 496952628.0\n", " 4237.0\n", - " 32991.0\n", - " 12\n", + " 4640.0\n", + " 1166.7\n", + " 5112.0\n", + " 38289.0\n", + " 34344.0\n", + " 4919.0\n", + " 713.7\n", + " ...\n", + " 35.166667\n", + " 144\n", " 1\n", - " 5\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 12\n", + " 5\n", " \n", " \n", " T001\n", - " 144\n", - " 827.0\n", - " 794.0\n", - " 839.0\n", - " 848.0\n", - " 
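The entityset printed above gains its readings.turbine_id -> turbines.turbine_id relationship from the second grouped block. A minimal sketch of those two featuretools steps, again assuming the pre-1.0 API and invented values; `make_index` is needed because the unstacked readings have no natural index:

```python
import pandas as pd
import featuretools as ft

unstacked = pd.DataFrame({
    'turbine_id': ['T001', 'T001'],
    'timestamp': pd.to_datetime(['2013-01-10 00:00', '2013-01-10 00:10']),
    'value_S01': [323.0, 346.0],
    'value_S02': [320.0, 384.0],
})

es = ft.EntitySet('entityset')
# entity_from_dataframe: register the unstacked readings, creating a
# surrogate index since the unstacked frame has no natural one
es = es.entity_from_dataframe('readings', unstacked, make_index=True,
                              index='reading_id', time_index='timestamp')
# normalize_entity: split turbine_id into its own entity, which creates the
# readings.turbine_id -> turbines.turbine_id relationship shown above
es = es.normalize_entity('readings', 'turbines', 'turbine_id')
```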
843.0\n", - " 843.0\n", - " 844.0\n", - " 3223315.0\n", - " 719405.0\n", - " ...\n", + " 466806830.0\n", + " 498019072.0\n", " 9008.0\n", - " 63463.0\n", - " 13\n", + " 9179.0\n", + " 1581.7\n", + " 9134.0\n", + " 86707.0\n", + " 78749.0\n", + " 9863.0\n", + " 916.3\n", + " ...\n", + " 53.381944\n", + " 144\n", " 1\n", - " 6\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 13\n", + " 6\n", " \n", " \n", " T001\n", - " 144\n", - " 848.0\n", - " 841.0\n", - " 838.0\n", - " 849.0\n", - " 850.0\n", - " 848.0\n", - " 850.0\n", - " 3233989.0\n", - " 728250.0\n", - " ...\n", + " 468250434.0\n", + " 499530451.0\n", " 10073.0\n", - " 70393.0\n", - " 14\n", + " 10310.0\n", + " 1690.9\n", + " 10674.0\n", + " 87907.0\n", + " 83264.0\n", + " 10638.0\n", + " 970.6\n", + " ...\n", + " 61.423611\n", + " 144\n", " 1\n", - " 0\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 14\n", + " 0\n", " \n", " \n", " T001\n", - " 144\n", - " 825.0\n", - " 840.0\n", - " 840.0\n", - " 844.0\n", - " 844.0\n", - " 830.0\n", - " 839.0\n", - " 3242820.0\n", - " 738155.0\n", - " ...\n", + " 371675934.0\n", + " 400196323.0\n", " 7381.0\n", - " 59954.0\n", - " 15\n", + " 8228.0\n", + " 1666.0\n", + " 8831.0\n", + " 68811.0\n", + " 64088.0\n", + " 8629.0\n", + " 948.8\n", + " ...\n", + " 87.575221\n", + " 144\n", " 1\n", " 1\n", - " 2013\n", " 2\n", - " 1\n", " 2\n", + " 2013\n", + " 1\n", + " 15\n", " 1\n", " \n", " \n", @@ -1017,69 +851,69 @@ "" ], "text/plain": [ - " COUNT(readings) MAX(readings.value_S01) MAX(readings.value_S02) \\\n", - "turbine_id \n", - "T001 144 369.0 376.0 \n", - "T001 144 505.0 426.0 \n", - "T001 144 827.0 794.0 \n", - "T001 144 848.0 841.0 \n", - "T001 144 825.0 840.0 \n", + " SUM(readings.value_S14) SUM(readings.value_S11) \\\n", + "turbine_id \n", + "T001 465421817.0 496362516.0 \n", + "T001 465897578.0 496952628.0 \n", + "T001 466806830.0 498019072.0 \n", + "T001 468250434.0 499530451.0 \n", + "T001 371675934.0 400196323.0 \n", "\n", - " MAX(readings.value_S03) MAX(readings.value_S04) \\\n", + " SUM(readings.value_S25) SUM(readings.value_S23) \\\n", "turbine_id \n", - "T001 378.0 401.0 \n", - "T001 393.0 517.0 \n", - "T001 839.0 848.0 \n", - "T001 838.0 849.0 \n", - "T001 840.0 844.0 \n", + "T001 2743.0 2780.0 \n", + "T001 4237.0 4640.0 \n", + "T001 9008.0 9179.0 \n", + "T001 10073.0 10310.0 \n", + "T001 7381.0 8228.0 \n", "\n", - " MAX(readings.value_S05) MAX(readings.value_S06) \\\n", + " SUM(readings.value_S17) SUM(readings.value_S19) \\\n", "turbine_id \n", - "T001 317.0 324.0 \n", - "T001 469.0 407.0 \n", - "T001 843.0 843.0 \n", - "T001 850.0 848.0 \n", - "T001 844.0 830.0 \n", + "T001 994.6 3174.0 \n", + "T001 1166.7 5112.0 \n", + "T001 1581.7 9134.0 \n", + "T001 1690.9 10674.0 \n", + "T001 1666.0 8831.0 \n", "\n", - " MAX(readings.value_S07) MAX(readings.value_S08) \\\n", + " SUM(readings.value_S04) SUM(readings.value_S05) \\\n", "turbine_id \n", - "T001 301.0 3209069.0 \n", - "T001 459.0 3214181.0 \n", - "T001 844.0 3223315.0 \n", - "T001 850.0 3233989.0 \n", - "T001 839.0 3242820.0 \n", + "T001 19412.0 17083.0 \n", + "T001 38289.0 34344.0 \n", + "T001 86707.0 78749.0 \n", + "T001 87907.0 83264.0 \n", + "T001 68811.0 64088.0 \n", "\n", - " MAX(readings.value_S09) ... SUM(readings.value_S25) \\\n", - "turbine_id ... \n", - "T001 706654.0 ... 2743.0 \n", - "T001 711718.0 ... 4237.0 \n", - "T001 719405.0 ... 9008.0 \n", - "T001 728250.0 ... 10073.0 \n", - "T001 738155.0 ... 
7381.0 \n", + " SUM(readings.value_S21) SUM(readings.value_S16) ... \\\n", + "turbine_id ... \n", + "T001 3061.0 550.4 ... \n", + "T001 4919.0 713.7 ... \n", + "T001 9863.0 916.3 ... \n", + "T001 10638.0 970.6 ... \n", + "T001 8629.0 948.8 ... \n", "\n", - " SUM(readings.value_S26) MODE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 20569.0 11 \n", - "T001 32991.0 12 \n", - "T001 63463.0 13 \n", - "T001 70393.0 14 \n", - "T001 59954.0 15 \n", + " MEAN(readings.value_S20) COUNT(readings) \\\n", + "turbine_id \n", + "T001 22.326389 144 \n", + "T001 35.166667 144 \n", + "T001 53.381944 144 \n", + "T001 61.423611 144 \n", + "T001 87.575221 144 \n", "\n", - " MODE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " MODE(readings.WEEKDAY(timestamp)) MODE(readings.YEAR(timestamp)) \\\n", - "turbine_id \n", - "T001 4 2013 \n", - "T001 5 2013 \n", - "T001 6 2013 \n", - "T001 0 2013 \n", - "T001 1 2013 \n", + " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", "turbine_id \n", @@ -1089,14 +923,6 @@ "T001 2 \n", "T001 2 \n", "\n", - " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", "turbine_id \n", "T001 2 \n", @@ -1105,18 +931,26 @@ "T001 2 \n", "T001 2 \n", "\n", - " NUM_UNIQUE(readings.YEAR(timestamp)) \n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "\n", + " MODE(readings.DAY(timestamp)) MODE(readings.WEEKDAY(timestamp)) \n", + "turbine_id \n", + "T001 11 4 \n", + "T001 12 5 \n", + "T001 13 6 \n", + "T001 14 0 \n", + "T001 15 1 \n", "\n", "[5 rows x 165 columns]" ] }, - "execution_count": 21, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -1127,7 +961,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1136,7 +970,7 @@ "165" ] }, - "execution_count": 22, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1159,17 +993,17 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "step = 5\n", + "step = 3\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -1178,7 +1012,7 @@ "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" ] }, - "execution_count": 24, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1189,7 +1023,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -1213,27 +1047,27 @@ " \n", " \n", " \n", - " COUNT(readings)\n", - " MAX(readings.value_S01)\n", - " MAX(readings.value_S02)\n", - " MAX(readings.value_S03)\n", - " MAX(readings.value_S04)\n", - " MAX(readings.value_S05)\n", - " MAX(readings.value_S06)\n", - " 
MAX(readings.value_S07)\n", - " MAX(readings.value_S08)\n", - " MAX(readings.value_S09)\n", - " ...\n", + " SUM(readings.value_S14)\n", + " SUM(readings.value_S11)\n", " SUM(readings.value_S25)\n", - " SUM(readings.value_S26)\n", - " MODE(readings.DAY(timestamp))\n", - " MODE(readings.MONTH(timestamp))\n", - " MODE(readings.WEEKDAY(timestamp))\n", - " MODE(readings.YEAR(timestamp))\n", - " NUM_UNIQUE(readings.DAY(timestamp))\n", + " SUM(readings.value_S23)\n", + " SUM(readings.value_S17)\n", + " SUM(readings.value_S19)\n", + " SUM(readings.value_S04)\n", + " SUM(readings.value_S05)\n", + " SUM(readings.value_S21)\n", + " SUM(readings.value_S16)\n", + " ...\n", + " MEAN(readings.value_S20)\n", + " COUNT(readings)\n", + " NUM_UNIQUE(readings.YEAR(timestamp))\n", " NUM_UNIQUE(readings.MONTH(timestamp))\n", + " NUM_UNIQUE(readings.DAY(timestamp))\n", " NUM_UNIQUE(readings.WEEKDAY(timestamp))\n", - " NUM_UNIQUE(readings.YEAR(timestamp))\n", + " MODE(readings.YEAR(timestamp))\n", + " MODE(readings.MONTH(timestamp))\n", + " MODE(readings.DAY(timestamp))\n", + " MODE(readings.WEEKDAY(timestamp))\n", " \n", " \n", " turbine_id\n", @@ -1263,122 +1097,122 @@ " \n", " \n", " T001\n", - " 144\n", - " 369.0\n", - " 376.0\n", - " 378.0\n", - " 401.0\n", - " 317.0\n", - " 324.0\n", - " 301.0\n", - " 3209069.0\n", - " 706654.0\n", - " ...\n", + " 465421817.0\n", + " 496362516.0\n", " 2743.0\n", - " 20569.0\n", - " 11\n", + " 2780.0\n", + " 994.6\n", + " 3174.0\n", + " 19412.0\n", + " 17083.0\n", + " 3061.0\n", + " 550.4\n", + " ...\n", + " 22.326389\n", + " 144\n", " 1\n", - " 4\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 11\n", + " 4\n", " \n", " \n", " T001\n", - " 144\n", - " 505.0\n", - " 426.0\n", - " 393.0\n", - " 517.0\n", - " 469.0\n", - " 407.0\n", - " 459.0\n", - " 3214181.0\n", - " 711718.0\n", - " ...\n", + " 465897578.0\n", + " 496952628.0\n", " 4237.0\n", - " 32991.0\n", - " 12\n", + " 4640.0\n", + " 1166.7\n", + " 5112.0\n", + " 38289.0\n", + " 34344.0\n", + " 4919.0\n", + " 713.7\n", + " ...\n", + " 35.166667\n", + " 144\n", " 1\n", - " 5\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 12\n", + " 5\n", " \n", " \n", " T001\n", - " 144\n", - " 827.0\n", - " 794.0\n", - " 839.0\n", - " 848.0\n", - " 843.0\n", - " 843.0\n", - " 844.0\n", - " 3223315.0\n", - " 719405.0\n", - " ...\n", + " 466806830.0\n", + " 498019072.0\n", " 9008.0\n", - " 63463.0\n", - " 13\n", + " 9179.0\n", + " 1581.7\n", + " 9134.0\n", + " 86707.0\n", + " 78749.0\n", + " 9863.0\n", + " 916.3\n", + " ...\n", + " 53.381944\n", + " 144\n", " 1\n", - " 6\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 13\n", + " 6\n", " \n", " \n", " T001\n", - " 144\n", - " 848.0\n", - " 841.0\n", - " 838.0\n", - " 849.0\n", - " 850.0\n", - " 848.0\n", - " 850.0\n", - " 3233989.0\n", - " 728250.0\n", - " ...\n", + " 468250434.0\n", + " 499530451.0\n", " 10073.0\n", - " 70393.0\n", - " 14\n", + " 10310.0\n", + " 1690.9\n", + " 10674.0\n", + " 87907.0\n", + " 83264.0\n", + " 10638.0\n", + " 970.6\n", + " ...\n", + " 61.423611\n", + " 144\n", " 1\n", - " 0\n", - " 2013\n", - " 2\n", " 1\n", " 2\n", + " 2\n", + " 2013\n", " 1\n", + " 14\n", + " 0\n", " \n", " \n", " T001\n", - " 144\n", - " 825.0\n", - " 840.0\n", - " 840.0\n", - " 844.0\n", - " 844.0\n", - " 830.0\n", - " 839.0\n", - " 3242820.0\n", - " 738155.0\n", - " ...\n", + " 371675934.0\n", + " 400196323.0\n", " 7381.0\n", - " 59954.0\n", - " 15\n", + " 8228.0\n", + " 1666.0\n", + " 8831.0\n", + 
" 68811.0\n", + " 64088.0\n", + " 8629.0\n", + " 948.8\n", + " ...\n", + " 87.575221\n", + " 144\n", " 1\n", " 1\n", - " 2013\n", " 2\n", - " 1\n", " 2\n", + " 2013\n", + " 1\n", + " 15\n", " 1\n", " \n", " \n", @@ -1387,69 +1221,69 @@ "" ], "text/plain": [ - " COUNT(readings) MAX(readings.value_S01) MAX(readings.value_S02) \\\n", - "turbine_id \n", - "T001 144 369.0 376.0 \n", - "T001 144 505.0 426.0 \n", - "T001 144 827.0 794.0 \n", - "T001 144 848.0 841.0 \n", - "T001 144 825.0 840.0 \n", + " SUM(readings.value_S14) SUM(readings.value_S11) \\\n", + "turbine_id \n", + "T001 465421817.0 496362516.0 \n", + "T001 465897578.0 496952628.0 \n", + "T001 466806830.0 498019072.0 \n", + "T001 468250434.0 499530451.0 \n", + "T001 371675934.0 400196323.0 \n", "\n", - " MAX(readings.value_S03) MAX(readings.value_S04) \\\n", + " SUM(readings.value_S25) SUM(readings.value_S23) \\\n", "turbine_id \n", - "T001 378.0 401.0 \n", - "T001 393.0 517.0 \n", - "T001 839.0 848.0 \n", - "T001 838.0 849.0 \n", - "T001 840.0 844.0 \n", + "T001 2743.0 2780.0 \n", + "T001 4237.0 4640.0 \n", + "T001 9008.0 9179.0 \n", + "T001 10073.0 10310.0 \n", + "T001 7381.0 8228.0 \n", "\n", - " MAX(readings.value_S05) MAX(readings.value_S06) \\\n", + " SUM(readings.value_S17) SUM(readings.value_S19) \\\n", "turbine_id \n", - "T001 317.0 324.0 \n", - "T001 469.0 407.0 \n", - "T001 843.0 843.0 \n", - "T001 850.0 848.0 \n", - "T001 844.0 830.0 \n", + "T001 994.6 3174.0 \n", + "T001 1166.7 5112.0 \n", + "T001 1581.7 9134.0 \n", + "T001 1690.9 10674.0 \n", + "T001 1666.0 8831.0 \n", "\n", - " MAX(readings.value_S07) MAX(readings.value_S08) \\\n", + " SUM(readings.value_S04) SUM(readings.value_S05) \\\n", "turbine_id \n", - "T001 301.0 3209069.0 \n", - "T001 459.0 3214181.0 \n", - "T001 844.0 3223315.0 \n", - "T001 850.0 3233989.0 \n", - "T001 839.0 3242820.0 \n", + "T001 19412.0 17083.0 \n", + "T001 38289.0 34344.0 \n", + "T001 86707.0 78749.0 \n", + "T001 87907.0 83264.0 \n", + "T001 68811.0 64088.0 \n", "\n", - " MAX(readings.value_S09) ... SUM(readings.value_S25) \\\n", - "turbine_id ... \n", - "T001 706654.0 ... 2743.0 \n", - "T001 711718.0 ... 4237.0 \n", - "T001 719405.0 ... 9008.0 \n", - "T001 728250.0 ... 10073.0 \n", - "T001 738155.0 ... 7381.0 \n", + " SUM(readings.value_S21) SUM(readings.value_S16) ... \\\n", + "turbine_id ... \n", + "T001 3061.0 550.4 ... \n", + "T001 4919.0 713.7 ... \n", + "T001 9863.0 916.3 ... \n", + "T001 10638.0 970.6 ... \n", + "T001 8629.0 948.8 ... 
\n", "\n", - " SUM(readings.value_S26) MODE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 20569.0 11 \n", - "T001 32991.0 12 \n", - "T001 63463.0 13 \n", - "T001 70393.0 14 \n", - "T001 59954.0 15 \n", + " MEAN(readings.value_S20) COUNT(readings) \\\n", + "turbine_id \n", + "T001 22.326389 144 \n", + "T001 35.166667 144 \n", + "T001 53.381944 144 \n", + "T001 61.423611 144 \n", + "T001 87.575221 144 \n", "\n", - " MODE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", - " MODE(readings.WEEKDAY(timestamp)) MODE(readings.YEAR(timestamp)) \\\n", - "turbine_id \n", - "T001 4 2013 \n", - "T001 5 2013 \n", - "T001 6 2013 \n", - "T001 0 2013 \n", - "T001 1 2013 \n", + " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", + "T001 1 \n", "\n", " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", "turbine_id \n", @@ -1459,14 +1293,6 @@ "T001 2 \n", "T001 2 \n", "\n", - " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", "turbine_id \n", "T001 2 \n", @@ -1475,18 +1301,26 @@ "T001 2 \n", "T001 2 \n", "\n", - " NUM_UNIQUE(readings.YEAR(timestamp)) \n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", + " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \\\n", + "turbine_id \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "T001 2013 1 \n", + "\n", + " MODE(readings.DAY(timestamp)) MODE(readings.WEEKDAY(timestamp)) \n", + "turbine_id \n", + "T001 11 4 \n", + "T001 12 5 \n", + "T001 13 6 \n", + "T001 14 0 \n", + "T001 15 1 \n", "\n", "[5 rows x 165 columns]" ] }, - "execution_count": 25, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1497,7 +1331,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -1702,7 +1536,7 @@ "[5 rows x 28 columns]" ] }, - "execution_count": 26, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1724,17 +1558,17 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ - "step = 6\n", + "step = 4\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1743,7 +1577,7 @@ "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" ] }, - "execution_count": 28, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1755,7 +1589,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1769,7 +1603,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.12" + "version": "3.7.11" } }, "nbformat": 4, diff --git a/tutorials/pipelines/double_lstm_with_unstack.ipynb b/tutorials/pipelines/double_lstm_with_unstack.ipynb new file mode 100644 index 0000000..4bc7d0f --- /dev/null +++ b/tutorials/pipelines/double_lstm_with_unstack.ipynb @@ -0,0 +1,2375 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "# double_lstm_with_unstack" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from draco.demo import load_demo\n", + "\n", + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'double_lstm_with_unstack'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from draco.pipeline import DracoPipeline\n", + "\n", + "pipeline = DracoPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mlblocks.MLPipeline',\n", + " 'pandas.DataFrame.pop',\n", + " 'pandas.DataFrame.pop',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'sklearn.preprocessing.MinMaxScaler',\n", + " 'pandas.DataFrame',\n", + " 'pandas.DataFrame.set',\n", + " 'pandas.DataFrame.set',\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'keras.Sequential.DoubleLSTMTimeSeriesClassifier']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation (part of Draco Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlblocks.MLPipeline\n", + "\n", + "### pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index\n", + " \n", + "### pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline.fit(target_times, readings, output_=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0...11.73131020.055.055.047.058.045.058.047.0356.0
1T0012013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0...10.23131420.058.063.062.067.055.061.042.0400.0
2T0012013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0...9.53131822.068.061.067.066.046.055.045.0402.0
3T0012013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0...10.53132179.043.051.053.062.053.060.045.0357.0
4T0012013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", + "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", + "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", + "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", + "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", + "\n", + " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", + "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", + "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", + "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", + "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", + "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (unstacked)\n", + "* Output: readings (without turbine_id), turbine_id\n", + "* Effect: turbine_id has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 T001\n", + "1 T001\n", + "2 T001\n", + "3 T001\n", + "4 T001\n", + "Name: turbine_id, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['turbine_id'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
02013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0695000.0...11.73131020.055.055.047.058.045.058.047.0356.0
12013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0695063.0...10.23131420.058.063.062.067.055.061.042.0400.0
22013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0695124.0...9.53131822.068.061.067.066.046.055.045.0402.0
32013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0695175.0...10.53132179.043.051.053.062.053.060.045.0357.0
42013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0695226.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " timestamp value_S01 value_S02 value_S03 value_S04 value_S05 \\\n", + "0 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 273.0 \n", + "1 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 331.0 \n", + "2 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 275.0 \n", + "3 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 317.0 \n", + "4 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 262.0 \n", + "\n", + " value_S06 value_S07 value_S08 value_S09 ... value_S17 value_S18 \\\n", + "0 342.0 280.0 3197842.0 695000.0 ... 11.7 3131020.0 \n", + "1 360.0 249.0 3197900.0 695063.0 ... 10.2 3131420.0 \n", + "2 335.0 270.0 3197968.0 695124.0 ... 9.5 3131822.0 \n", + "3 354.0 271.0 3198011.0 695175.0 ... 10.5 3132179.0 \n", + "4 246.0 212.0 3198056.0 695226.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (without turbine_id)\n", + "* Output: readings (without timestamp), timestamp\n", + "* Effect: timestamp has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2013-01-10 00:00:00\n", + "1 2013-01-10 00:10:00\n", + "2 2013-01-10 00:20:00\n", + "3 2013-01-10 00:30:00\n", + "4 2013-01-10 00:40:00\n", + "Name: timestamp, dtype: datetime64[ns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['timestamp'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09value_S10...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0323.0320.0284.0348.0273.0342.0280.03197842.0695000.03348234.0...11.73131020.055.055.047.058.045.058.047.0356.0
1346.0384.0367.0411.0331.0360.0249.03197900.0695063.03348296.0...10.23131420.058.063.062.067.055.061.042.0400.0
2407.0363.0407.0393.0275.0335.0270.03197968.0695124.03348363.0...9.53131822.068.061.067.066.046.055.045.0402.0
3257.0307.0315.0361.0317.0354.0271.03198011.0695175.03348416.0...10.53132179.043.051.053.062.053.060.045.0357.0
4267.0309.0314.0355.0262.0246.0212.03198056.0695226.03348470.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 323.0 320.0 284.0 348.0 273.0 342.0 \n", + "1 346.0 384.0 367.0 411.0 331.0 360.0 \n", + "2 407.0 363.0 407.0 393.0 275.0 335.0 \n", + "3 257.0 307.0 315.0 361.0 317.0 354.0 \n", + "4 267.0 309.0 314.0 355.0 262.0 246.0 \n", + "\n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 value_S18 \\\n", + "0 280.0 3197842.0 695000.0 3348234.0 ... 11.7 3131020.0 \n", + "1 249.0 3197900.0 695063.0 3348296.0 ... 10.2 3131420.0 \n", + "2 270.0 3197968.0 695124.0 3348363.0 ... 9.5 3131822.0 \n", + "3 271.0 3198011.0 695175.0 3348416.0 ... 10.5 3132179.0 \n", + "4 212.0 3198056.0 695226.0 3348470.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.impute.SimpleImputer\n", + "\n", + "* Input: readings (unstacked, no turbine_id, no timestamp)\n", + "* Output: readings (imputed, numpy array)\n", + "* Effect: readings have been imputed and converted to numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[3.230000e+02, 3.200000e+02, 2.840000e+02, 3.480000e+02,\n", + " 2.730000e+02, 3.420000e+02, 2.800000e+02, 3.197842e+06,\n", + " 6.950000e+05, 3.348234e+06, 3.436762e+06, 3.322362e+06,\n", + " 3.357952e+06, 3.223797e+06, 8.300000e+00, 6.000000e+00,\n", + " 1.170000e+01, 3.131020e+06, 5.500000e+01, 5.500000e+01,\n", + " 4.700000e+01, 5.800000e+01, 4.500000e+01, 5.800000e+01,\n", + " 4.700000e+01, 3.560000e+02],\n", + " [3.460000e+02, 3.840000e+02, 3.670000e+02, 4.110000e+02,\n", + " 3.310000e+02, 3.600000e+02, 2.490000e+02, 3.197900e+06,\n", + " 6.950630e+05, 3.348296e+06, 3.436829e+06, 3.322417e+06,\n", + " 3.358013e+06, 3.223839e+06, 7.600000e+00, 5.000000e+00,\n", + " 1.020000e+01, 3.131420e+06, 5.800000e+01, 6.300000e+01,\n", + " 6.200000e+01, 6.700000e+01, 5.500000e+01, 6.100000e+01,\n", + " 4.200000e+01, 4.000000e+02],\n", + " [4.070000e+02, 3.630000e+02, 4.070000e+02, 3.930000e+02,\n", + " 2.750000e+02, 3.350000e+02, 2.700000e+02, 3.197968e+06,\n", + " 6.951240e+05, 3.348363e+06, 3.436895e+06, 3.322463e+06,\n", + " 3.358068e+06, 3.223884e+06, 7.800000e+00, 5.700000e+00,\n", + " 9.500000e+00, 3.131822e+06, 6.800000e+01, 6.100000e+01,\n", + " 6.700000e+01, 6.600000e+01, 4.600000e+01, 5.500000e+01,\n", + " 4.500000e+01, 4.020000e+02],\n", + " 
[2.570000e+02, 3.070000e+02, 3.150000e+02, 3.610000e+02,\n", + " 3.170000e+02, 3.540000e+02, 2.710000e+02, 3.198011e+06,\n", + " 6.951750e+05, 3.348416e+06, 3.436957e+06, 3.322516e+06,\n", + " 3.358128e+06, 3.223929e+06, 8.600000e+00, 6.600000e+00,\n", + " 1.050000e+01, 3.132179e+06, 4.300000e+01, 5.100000e+01,\n", + " 5.300000e+01, 6.200000e+01, 5.300000e+01, 6.000000e+01,\n", + " 4.500000e+01, 3.570000e+02],\n", + " [2.670000e+02, 3.090000e+02, 3.140000e+02, 3.550000e+02,\n", + " 2.620000e+02, 2.460000e+02, 2.120000e+02, 3.198056e+06,\n", + " 6.952260e+05, 3.348470e+06, 3.437016e+06, 3.322559e+06,\n", + " 3.358169e+06, 3.223965e+06, 7.500000e+00, 5.900000e+00,\n", + " 9.600000e+00, 3.132501e+06, 4.500000e+01, 5.100000e+01,\n", + " 5.400000e+01, 5.900000e+01, 4.300000e+01, 4.100000e+01,\n", + " 3.600000e+01, 3.220000e+02]])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.preprocessing.MinMaxScaler\n", + "\n", + "* Input: (imputed, array)\n", + "* Output: readings (scaled, array)\n", + "* Effect: readings have been scaled to [-1, 1] range" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.23563892, -0.24267292, -0.3286385 , -0.17702227, -0.35287222,\n", + " -0.19248826, -0.3317757 , -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. 
, -0.11702128,\n", + " -0.24050633, -0.25714286, -0.37378787, -0.22758621, -0.22758621,\n", + " -0.31972789, -0.1862069 , -0.36986301, -0.1862069 , -0.33793103,\n", + " -0.26141079],\n", + " [-0.18171161, -0.0926143 , -0.13380282, -0.02930832, -0.21688159,\n", + " -0.15023474, -0.40420561, -0.99995911, -0.99995779, -0.99995941,\n", + " -0.99995718, -0.99996326, -0.99996042, -0.99997164, -0.19148936,\n", + " -0.36708861, -0.35238095, -0.37370786, -0.1862069 , -0.11724138,\n", + " -0.11564626, -0.06206897, -0.23287671, -0.14482759, -0.40689655,\n", + " -0.17012448],\n", + " [-0.03868699, -0.14185229, -0.0399061 , -0.07151231, -0.34818288,\n", + " -0.20892019, -0.35514019, -0.99991116, -0.99991693, -0.99991555,\n", + " -0.999915 , -0.99993254, -0.99992474, -0.99994125, -0.17021277,\n", + " -0.27848101, -0.3968254 , -0.37362746, -0.04827586, -0.14482759,\n", + " -0.04761905, -0.07586207, -0.35616438, -0.22758621, -0.36551724,\n", + " -0.1659751 ],\n", + " [-0.39038687, -0.27315358, -0.25586854, -0.14654162, -0.24970692,\n", + " -0.16431925, -0.35280374, -0.99988085, -0.99988276, -0.99988086,\n", + " -0.99987538, -0.99989714, -0.99988581, -0.99991086, -0.08510638,\n", + " -0.16455696, -0.33333333, -0.37355606, -0.39310345, -0.28275862,\n", + " -0.23809524, -0.13103448, -0.26027397, -0.15862069, -0.36551724,\n", + " -0.2593361 ],\n", + " [-0.36694021, -0.26846424, -0.25821596, -0.16060961, -0.37866354,\n", + " -0.41784038, -0.49065421, -0.99984912, -0.99984859, -0.99984551,\n", + " -0.99983767, -0.99986841, -0.99985921, -0.99988655, -0.20212766,\n", + " -0.25316456, -0.39047619, -0.37349166, -0.36551724, -0.28275862,\n", + " -0.2244898 , -0.17241379, -0.39726027, -0.42068966, -0.48965517,\n", + " -0.33195021]])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame\n", + "\n", + "* Input: readings (scaled, array)\n", + "* Output: readings (dataframe)\n", + "* Effect: readings have been converted into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...16171819202122232425
0-0.235639-0.242673-0.328638-0.177022-0.352872-0.192488-0.331776-1.000000-1.000000-1.000000...-0.257143-0.373788-0.227586-0.227586-0.319728-0.186207-0.369863-0.186207-0.337931-0.261411
1-0.181712-0.092614-0.133803-0.029308-0.216882-0.150235-0.404206-0.999959-0.999958-0.999959...-0.352381-0.373708-0.186207-0.117241-0.115646-0.062069-0.232877-0.144828-0.406897-0.170124
2-0.038687-0.141852-0.039906-0.071512-0.348183-0.208920-0.355140-0.999911-0.999917-0.999916...-0.396825-0.373627-0.048276-0.144828-0.047619-0.075862-0.356164-0.227586-0.365517-0.165975
3-0.390387-0.273154-0.255869-0.146542-0.249707-0.164319-0.352804-0.999881-0.999883-0.999881...-0.333333-0.373556-0.393103-0.282759-0.238095-0.131034-0.260274-0.158621-0.365517-0.259336
4-0.366940-0.268464-0.258216-0.160610-0.378664-0.417840-0.490654-0.999849-0.999849-0.999846...-0.390476-0.373492-0.365517-0.282759-0.224490-0.172414-0.397260-0.420690-0.489655-0.331950
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "\n", + " 7 8 9 ... 16 17 18 19 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.257143 -0.373788 -0.227586 -0.227586 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.352381 -0.373708 -0.186207 -0.117241 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.396825 -0.373627 -0.048276 -0.144828 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.333333 -0.373556 -0.393103 -0.282759 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.390476 -0.373492 -0.365517 -0.282759 \n", + "\n", + " 20 21 22 23 24 25 \n", + "0 -0.319728 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 \n", + "1 -0.115646 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 \n", + "2 -0.047619 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 \n", + "3 -0.238095 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 \n", + "4 -0.224490 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe)\n", + "* Output: readings (dataframe with turbine_id)\n", + "* Effect: turbine_id has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...171819202122232425turbine_id
0-0.235639-0.242673-0.328638-0.177022-0.352872-0.192488-0.331776-1.000000-1.000000-1.000000...-0.373788-0.227586-0.227586-0.319728-0.186207-0.369863-0.186207-0.337931-0.261411T001
1-0.181712-0.092614-0.133803-0.029308-0.216882-0.150235-0.404206-0.999959-0.999958-0.999959...-0.373708-0.186207-0.117241-0.115646-0.062069-0.232877-0.144828-0.406897-0.170124T001
2-0.038687-0.141852-0.039906-0.071512-0.348183-0.208920-0.355140-0.999911-0.999917-0.999916...-0.373627-0.048276-0.144828-0.047619-0.075862-0.356164-0.227586-0.365517-0.165975T001
3-0.390387-0.273154-0.255869-0.146542-0.249707-0.164319-0.352804-0.999881-0.999883-0.999881...-0.373556-0.393103-0.282759-0.238095-0.131034-0.260274-0.158621-0.365517-0.259336T001
4-0.366940-0.268464-0.258216-0.160610-0.378664-0.417840-0.490654-0.999849-0.999849-0.999846...-0.373492-0.365517-0.282759-0.224490-0.172414-0.397260-0.420690-0.489655-0.331950T001
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "\n", + " 7 8 9 ... 17 18 19 20 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373788 -0.227586 -0.227586 -0.319728 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.373708 -0.186207 -0.117241 -0.115646 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.373627 -0.048276 -0.144828 -0.047619 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.373556 -0.393103 -0.282759 -0.238095 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.373492 -0.365517 -0.282759 -0.224490 \n", + "\n", + " 21 22 23 24 25 turbine_id \n", + "0 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 T001 \n", + "1 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 T001 \n", + "2 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 T001 \n", + "3 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 T001 \n", + "4 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 T001 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe with turbine_id)\n", + "* Output: readings (dataframe with turbine_id and timestamp)\n", + "* Effect: timestamp has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "step = 7\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...1819202122232425turbine_idtimestamp
0-0.235639-0.242673-0.328638-0.177022-0.352872-0.192488-0.331776-1.000000-1.000000-1.000000...-0.227586-0.227586-0.319728-0.186207-0.369863-0.186207-0.337931-0.261411T0012013-01-10 00:00:00
1-0.181712-0.092614-0.133803-0.029308-0.216882-0.150235-0.404206-0.999959-0.999958-0.999959...-0.186207-0.117241-0.115646-0.062069-0.232877-0.144828-0.406897-0.170124T0012013-01-10 00:10:00
2-0.038687-0.141852-0.039906-0.071512-0.348183-0.208920-0.355140-0.999911-0.999917-0.999916...-0.048276-0.144828-0.047619-0.075862-0.356164-0.227586-0.365517-0.165975T0012013-01-10 00:20:00
3-0.390387-0.273154-0.255869-0.146542-0.249707-0.164319-0.352804-0.999881-0.999883-0.999881...-0.393103-0.282759-0.238095-0.131034-0.260274-0.158621-0.365517-0.259336T0012013-01-10 00:30:00
4-0.366940-0.268464-0.258216-0.160610-0.378664-0.417840-0.490654-0.999849-0.999849-0.999846...-0.365517-0.282759-0.224490-0.172414-0.397260-0.420690-0.489655-0.331950T0012013-01-10 00:40:00
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "\n", + " 7 8 9 ... 18 19 20 21 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.227586 -0.227586 -0.319728 -0.186207 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.186207 -0.117241 -0.115646 -0.062069 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.048276 -0.144828 -0.047619 -0.075862 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.393103 -0.282759 -0.238095 -0.131034 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.365517 -0.282759 -0.224490 -0.172414 \n", + "\n", + " 22 23 24 25 turbine_id timestamp \n", + "0 -0.369863 -0.186207 -0.337931 -0.261411 T001 2013-01-10 00:00:00 \n", + "1 -0.232877 -0.144828 -0.406897 -0.170124 T001 2013-01-10 00:10:00 \n", + "2 -0.356164 -0.227586 -0.365517 -0.165975 T001 2013-01-10 00:20:00 \n", + "3 -0.260274 -0.158621 -0.365517 -0.259336 T001 2013-01-10 00:30:00 \n", + "4 -0.397260 -0.420690 -0.489655 -0.331950 T001 2013-01-10 00:40:00 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", + "\n", + "* Input: X, readings (dataframe with turbine_id and timestamp)\n", + "* Output: X\n", + "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", + " (window_size x num_signals) for each one of the target times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline._pipeline.get_hyperparameters()[\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "step = 8\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(51121, 28)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353,)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['y'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353, 24, 26)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.66002345, -0.57327081, -0.64084507, -0.57796014, -0.6014068 ,\n", + " -0.56103286, -0.55140187, -0.9928135 , -0.99291267, -0.99315058,\n", + " -0.99304288, -0.99346346, -0.99352632, -0.99395333, -0.42553191,\n", + " -0.41772152, -0.58730159, -0.35996294, -0.66896552, -0.57241379,\n", + " -0.61904762, -0.5862069 , -0.60273973, -0.55862069, -0.55862069,\n", + " -0.59751037],\n", + " [-0.2989449 , -0.38569754, -0.48591549, -0.47713951, -0.66705744,\n", + " -0.5915493 , -0.77336449, -0.99278389, -0.9928852 , -0.99312701,\n", + " -0.99301988, -0.9934481 , -0.9935075 , -0.9939459 , -0.39361702,\n", + " -0.40506329, -0.54285714, -0.35992014, -0.40689655, -0.42068966,\n", + " -0.46938776, -0.48965517, -0.67123288, -0.5862069 , -0.83448276,\n", + " -0.5560166 ],\n", + " [-0.33645955, -0.40679953, -0.39906103, -0.38569754, -0.56154748,\n", + " -0.43192488, -0.45560748, -0.99275498, -0.9928584 , -0.99310017,\n", + " -0.99299431, -0.99342739, -0.99348349, -0.99392294, -0.29787234,\n", + " -0.3164557 , -0.49206349, -0.35986854, -0.42068966, -0.43448276,\n", + " -0.40136054, -0.43448276, -0.56164384, -0.47586207, -0.51724138,\n", + " -0.46473029],\n", + " [-0.36928488, -0.41148886, -0.51173709, -0.47010551, -0.54982415,\n", + " -0.48122066, -0.51869159, -0.99272467, -0.9928316 , -0.99307791,\n", + " -0.99297067, -0.99340669, -0.99346079, -0.99390066, -0.29787234,\n", + " -0.35443038, -0.49206349, -0.35981854, -0.39310345, -0.43448276,\n", + " -0.49659864, -0.47586207, -0.56164384, -0.50344828, -0.53103448,\n", + " -0.4813278 ],\n", + " [-0.20750293, -0.35287222, -0.37089202, 
-0.2989449 , -0.32473623,\n", + " -0.37793427, -0.45794393, -0.99269435, -0.99280347, -0.99305173,\n", + " -0.99294447, -0.9933793 , -0.99343419, -0.9938777 , -0.32978723,\n", + " -0.39240506, -0.49206349, -0.35976314, -0.39310345, -0.40689655,\n", + " -0.41496599, -0.42068966, -0.42465753, -0.42068966, -0.51724138,\n", + " -0.4253112 ],\n", + " [-0.55685815, -0.60375147, -0.64084507, -0.54513482, -0.55685815,\n", + " -0.58215962, -0.63785047, -0.99267179, -0.99278404, -0.99303471,\n", + " -0.99292338, -0.99335793, -0.99341472, -0.99386014, -0.29787234,\n", + " -0.34177215, -0.51746032, -0.35972353, -0.54482759, -0.5862069 ,\n", + " -0.60544218, -0.53103448, -0.54794521, -0.57241379, -0.62758621,\n", + " -0.58921162],\n", + " [-0.66705744, -0.67643611, -0.69014085, -0.64361079, -0.74443142,\n", + " -0.7370892 , -0.7546729 , -0.99265487, -0.99276863, -0.99302096,\n", + " -0.9929074 , -0.99334657, -0.9934024 , -0.99384934, -0.39361702,\n", + " -0.48101266, -0.51746032, -0.35969533, -0.65517241, -0.66896552,\n", + " -0.67346939, -0.64137931, -0.75342466, -0.72413793, -0.76551724,\n", + " -0.70746888],\n", + " [-0.53341149, -0.60375147, -0.63849765, -0.61547479, -0.71395076,\n", + " -0.70187793, -0.72897196, -0.99263231, -0.99275054, -0.99300394,\n", + " -0.99289014, -0.99333255, -0.99338877, -0.9938365 , -0.38297872,\n", + " -0.37974684, -0.54920635, -0.35966173, -0.54482759, -0.6137931 ,\n", + " -0.60544218, -0.6137931 , -0.69863014, -0.69655172, -0.72413793,\n", + " -0.65145228],\n", + " [-0.44196952, -0.4021102 , -0.49295775, -0.49355217, -0.62719812,\n", + " -0.62676056, -0.71728972, -0.99260481, -0.99272173, -0.99298103,\n", + " -0.99286777, -0.99331518, -0.9933719 , -0.99382367, -0.38297872,\n", + " -0.4556962 , -0.54285714, -0.35961793, -0.44827586, -0.39310345,\n", + " -0.4829932 , -0.50344828, -0.63013699, -0.62758621, -0.72413793,\n", + " -0.54564315],\n", + " [-0.46307151, -0.38100821, -0.35446009, -0.44900352, -0.50293083,\n", + " -0.4741784 , -0.63317757, -0.99257731, -0.99269226, -0.99295157,\n", + " -0.99284285, -0.99329247, -0.9933479 , -0.99380612, -0.28723404,\n", + " -0.3164557 , -0.47301587, -0.35956633, -0.44827586, -0.37931034,\n", + " -0.34693878, -0.44827586, -0.52054795, -0.47586207, -0.62758621,\n", + " -0.46473029],\n", + " [-0.26611958, -0.26611958, -0.29107981, -0.34349355, -0.3950762 ,\n", + " -0.29577465, -0.43925234, -0.9925477 , -0.99266278, -0.99292211,\n", + " -0.99281601, -0.99326575, -0.99331805, -0.99378316, -0.28723404,\n", + " -0.39240506, -0.46031746, -0.35950873, -0.40689655, -0.37931034,\n", + " -0.34693878, -0.40689655, -0.43835616, -0.35172414, -0.51724138,\n", + " -0.40248963],\n", + " [-0.46307151, -0.35990621, -0.43192488, -0.36928488, -0.47245018,\n", + " -0.44600939, -0.41121495, -0.99252091, -0.9926333 , -0.99289592,\n", + " -0.99278789, -0.99324104, -0.99329275, -0.99375547, -0.28723404,\n", + " -0.43037975, -0.46666667, -0.35945292, -0.46206897, -0.37931034,\n", + " -0.41496599, -0.37931034, -0.47945205, -0.44827586, -0.42068966,\n", + " -0.42116183],\n", + " [-0.44431419, -0.4021102 , -0.38732394, -0.3059789 , -0.35990621,\n", + " -0.28403756, -0.40420561, -0.99249341, -0.99260583, -0.99287039,\n", + " -0.99275913, -0.99321298, -0.99326226, -0.99372846, -0.30851064,\n", + " -0.4556962 , -0.46031746, -0.35939572, -0.44827586, -0.42068966,\n", + " -0.42857143, -0.36551724, -0.4109589 , -0.33793103, -0.43448276,\n", + " -0.406639 ],\n", + " [-0.43962485, -0.36459555, -0.35211268, -0.35052755, -0.44665885,\n", + " -0.34741784, -0.44859813, 
-0.99246592, -0.99257703, -0.99284028,\n", + " -0.99273037, -0.99318693, -0.99323176, -0.99370279, -0.28723404,\n", + " -0.36708861, -0.48571429, -0.35933712, -0.44827586, -0.39310345,\n", + " -0.33333333, -0.36551724, -0.45205479, -0.33793103, -0.46206897,\n", + " -0.39211618],\n", + " [-0.2028136 , -0.25439625, -0.30751174, -0.3130129 , -0.37631887,\n", + " -0.3685446 , -0.46495327, -0.99243067, -0.99254152, -0.9928082 ,\n", + " -0.99269906, -0.99315821, -0.99320322, -0.99367781, -0.27659574,\n", + " -0.32911392, -0.47301587, -0.35927332, -0.29655172, -0.25517241,\n", + " -0.29251701, -0.31034483, -0.39726027, -0.37931034, -0.47586207,\n", + " -0.33817427],\n", + " [-0.23329426, -0.27080891, -0.31924883, -0.24736225, -0.35521688,\n", + " -0.33098592, -0.4182243 , -0.99239753, -0.99250668, -0.99277743,\n", + " -0.99266518, -0.99312815, -0.99317272, -0.99365012, -0.26595745,\n", + " -0.40506329, -0.46666667, -0.35920811, -0.33793103, -0.26896552,\n", + " -0.31972789, -0.25517241, -0.36986301, -0.33793103, -0.42068966,\n", + " -0.32365145],\n", + " [-0.12778429, -0.11137163, -0.10798122, -0.05275498, -0.25439625,\n", + " -0.23474178, -0.28271028, -0.99236228, -0.99247117, -0.99274143,\n", + " -0.99263131, -0.99309876, -0.99314028, -0.99362108, -0.24468085,\n", + " -0.32911392, -0.43492063, -0.35914011, -0.29655172, -0.25517241,\n", + " -0.21088435, -0.25517241, -0.38356164, -0.29655172, -0.39310345,\n", + " -0.29460581],\n", + " [-0.14185229, -0.2028136 , -0.31690141, -0.17467761, -0.24970692,\n", + " -0.25117371, -0.37383178, -0.9923242 , -0.99243567, -0.99271066,\n", + " -0.9925968 , -0.9930667 , -0.99310849, -0.99359204, -0.22340426,\n", + " -0.3164557 , -0.41587302, -0.35907171, -0.24137931, -0.25517241,\n", + " -0.31972789, -0.24137931, -0.32876712, -0.31034483, -0.39310345,\n", + " -0.29045643],\n", + " [-0.4021102 , -0.32708089, -0.33802817, -0.28018757, -0.3950762 ,\n", + " -0.40140845, -0.48364486, -0.99229459, -0.99240284, -0.99268055,\n", + " -0.99256421, -0.99303731, -0.99308059, -0.99356773, -0.25531915,\n", + " -0.29113924, -0.40952381, -0.35901131, -0.40689655, -0.31034483,\n", + " -0.33333333, -0.28275862, -0.38356164, -0.39310345, -0.48965517,\n", + " -0.37344398],\n", + " [-0.27549824, -0.3059789 , -0.37089202, -0.20046893, -0.34818288,\n", + " -0.33802817, -0.42056075, -0.99225863, -0.99237068, -0.99265109,\n", + " -0.99252778, -0.99300725, -0.99305075, -0.99354072, -0.28723404,\n", + " -0.41772152, -0.48571429, -0.3589459 , -0.28275862, -0.32413793,\n", + " -0.34693878, -0.2 , -0.36986301, -0.35172414, -0.43448276,\n", + " -0.32157676],\n", + " [-0.30832356, -0.3059789 , -0.3286385 , -0.31066823, -0.32473623,\n", + " -0.34741784, -0.38785047, -0.99222479, -0.99233786, -0.99262032,\n", + " -0.9924971 , -0.99297519, -0.9930209 , -0.99351168, -0.28723404,\n", + " -0.3164557 , -0.47936508, -0.3588813 , -0.32413793, -0.31034483,\n", + " -0.31972789, -0.32413793, -0.32876712, -0.35172414, -0.39310345,\n", + " -0.32987552],\n", + " [-0.33645955, -0.2098476 , -0.24413146, -0.2919109 , -0.41383353,\n", + " -0.41079812, -0.46495327, -0.99219025, -0.99230168, -0.99258563,\n", + " -0.99246579, -0.99294781, -0.99299365, -0.9934867 , -0.24468085,\n", + " -0.29113924, -0.42857143, -0.3588177 , -0.31034483, -0.24137931,\n", + " -0.23809524, -0.31034483, -0.42465753, -0.40689655, -0.47586207,\n", + " -0.34024896],\n", + " [-0.24267292, -0.15357562, -0.19248826, -0.13950762, -0.35052755,\n", + " -0.30046948, -0.37616822, -0.99215358, -0.99226215, -0.99254831,\n", + " -0.99242872, 
-0.99291708, -0.99296121, -0.99345766, -0.22340426,\n", + " -0.25316456, -0.42857143, -0.3587457 , -0.26896552, -0.17241379,\n", + " -0.18367347, -0.1862069 , -0.35616438, -0.29655172, -0.39310345,\n", + " -0.25311203],\n", + " [-0.2989449 , -0.26377491, -0.27699531, -0.15592028, -0.34583822,\n", + " -0.34976526, -0.48831776, -0.99211763, -0.99222731, -0.99251493,\n", + " -0.99239038, -0.99288636, -0.99293072, -0.99343267, -0.20212766,\n", + " -0.24050633, -0.3968254 , -0.35867929, -0.28275862, -0.26896552,\n", + " -0.26530612, -0.15862069, -0.35616438, -0.33793103, -0.47586207,\n", + " -0.31120332]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'][0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## keras.Sequential.DoubleLSTMTimeSeriesClassifier\n", + "\n", + "* Input: X, y\n", + "* Output: \n", + "* Effect: DoubleLSTM has been fitted." + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-01-18 05:32:48.464559: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-01-18 05:32:48.495873: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fba31d9b0c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", + "2022-01-18 05:32:48.495892: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + ] + } + ], + "source": [ + "step = 9\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/pipelines/lstm_regressor_with_unstack.ipynb b/tutorials/pipelines/lstm_regressor_with_unstack.ipynb new file mode 100644 index 0000000..516c6da --- /dev/null +++ b/tutorials/pipelines/lstm_regressor_with_unstack.ipynb @@ -0,0 +1,2499 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "488d2ccc", + "metadata": {}, + "source": [ + "# lstm_regressor_with_unstack" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "caf9a9ef", + "metadata": {}, + "outputs": [], + "source": [ + "from draco.demo import load_demo\n", + "\n", + "train_target_times, test_target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80315927", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'lstm_regressor_with_unstack'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "1073a88a", + "metadata": {}, + "outputs": [], + "source": [ + "from draco import DracoPipeline\n", + "\n", + "pipeline = DracoPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c6cb15d", + "metadata": {}, + "outputs": [], + "source": [ + 
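"# List the primitives that make up this pipeline template\n", +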
"pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "id": "26bbb52d", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + "cell_type": "markdown", + "id": "3f12ee16", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a2396b1c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampsignal_idvalue
012013-01-12 00:10:00operational setting 1-0.0007
112013-01-12 00:20:00operational setting 10.0019
212013-01-12 00:30:00operational setting 1-0.0043
312013-01-12 00:40:00operational setting 10.0007
412013-01-12 00:50:00operational setting 1-0.0019
\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp signal_id value\n", + "0 1 2013-01-12 00:10:00 operational setting 1 -0.0007\n", + "1 1 2013-01-12 00:20:00 operational setting 1 0.0019\n", + "2 1 2013-01-12 00:30:00 operational setting 1 -0.0043\n", + "3 1 2013-01-12 00:40:00 operational setting 1 0.0007\n", + "4 1 2013-01-12 00:50:00 operational setting 1 -0.0019" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3cd80f1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
012013-01-12 04:20:00166
112013-01-12 04:30:00165
212013-01-12 04:40:00164
312013-01-12 04:50:00163
412013-01-12 05:00:00162
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 1 2013-01-12 04:20:00 166\n", + "1 1 2013-01-12 04:30:00 165\n", + "2 1 2013-01-12 04:40:00 164\n", + "3 1 2013-01-12 04:50:00 163\n", + "4 1 2013-01-12 05:00:00 162" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_target_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "6a759b57", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
012013-01-13 13:10:00112.0
122013-01-14 08:00:0098.0
232013-01-14 02:50:0069.0
342013-01-14 01:10:0082.0
452013-01-14 13:10:0091.0
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 1 2013-01-13 13:10:00 112.0\n", + "1 2 2013-01-14 08:00:00 98.0\n", + "2 3 2013-01-14 02:50:00 69.0\n", + "3 4 2013-01-14 01:10:00 82.0\n", + "4 5 2013-01-14 13:10:00 91.0" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test_target_times.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "feb3daa6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "training shape (18131, 3)\n", + "testing shape (100, 3)\n" + ] + } + ], + "source": [ + "print(\"training shape\", train_target_times.shape)\n", + "print(\"testing shape\", test_target_times.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "a956f746", + "metadata": {}, + "source": [ + "## Data Preparation (part of Draco Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "id": "a813a966", + "metadata": {}, + "source": [ + "## mlblocks.MLPipeline\n", + "\n", + "### pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index\n", + " \n", + "### pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bb00b3b8", + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline.fit(train_target_times, readings, output_=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "381e361d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b41f13c1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_operational setting 1value_operational setting 2value_operational setting 3value_sensor measurement 1value_sensor measurement 10value_sensor measurement 11value_sensor measurement 12value_sensor measurement 13...value_sensor measurement 2value_sensor measurement 20value_sensor measurement 21value_sensor measurement 3value_sensor measurement 4value_sensor measurement 5value_sensor measurement 6value_sensor measurement 7value_sensor measurement 8value_sensor measurement 9
012013-01-12 00:10:00-0.0007-0.0004100.0518.671.347.47521.662388.02...641.8239.0623.41901589.701400.6014.6221.61554.362388.069046.19
112013-01-12 00:20:000.0019-0.0003100.0518.671.347.49522.282388.07...642.1539.0023.42361591.821403.1414.6221.61553.752388.049044.07
212013-01-12 00:30:00-0.00430.0003100.0518.671.347.27522.422388.03...642.3538.9523.34421587.991404.2014.6221.61554.262388.089052.94
312013-01-12 00:40:000.00070.0000100.0518.671.347.13522.862388.08...642.3538.8823.37391582.791401.8714.6221.61554.452388.119049.48
412013-01-12 00:50:00-0.0019-0.0002100.0518.671.347.28522.192388.04...642.3738.9023.40441582.851406.2214.6221.61554.002388.069055.15
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_operational setting 1 \\\n", + "0 1 2013-01-12 00:10:00 -0.0007 \n", + "1 1 2013-01-12 00:20:00 0.0019 \n", + "2 1 2013-01-12 00:30:00 -0.0043 \n", + "3 1 2013-01-12 00:40:00 0.0007 \n", + "4 1 2013-01-12 00:50:00 -0.0019 \n", + "\n", + " value_operational setting 2 value_operational setting 3 \\\n", + "0 -0.0004 100.0 \n", + "1 -0.0003 100.0 \n", + "2 0.0003 100.0 \n", + "3 0.0000 100.0 \n", + "4 -0.0002 100.0 \n", + "\n", + " value_sensor measurement 1 value_sensor measurement 10 \\\n", + "0 518.67 1.3 \n", + "1 518.67 1.3 \n", + "2 518.67 1.3 \n", + "3 518.67 1.3 \n", + "4 518.67 1.3 \n", + "\n", + " value_sensor measurement 11 value_sensor measurement 12 \\\n", + "0 47.47 521.66 \n", + "1 47.49 522.28 \n", + "2 47.27 522.42 \n", + "3 47.13 522.86 \n", + "4 47.28 522.19 \n", + "\n", + " value_sensor measurement 13 ... value_sensor measurement 2 \\\n", + "0 2388.02 ... 641.82 \n", + "1 2388.07 ... 642.15 \n", + "2 2388.03 ... 642.35 \n", + "3 2388.08 ... 642.35 \n", + "4 2388.04 ... 642.37 \n", + "\n", + " value_sensor measurement 20 value_sensor measurement 21 \\\n", + "0 39.06 23.4190 \n", + "1 39.00 23.4236 \n", + "2 38.95 23.3442 \n", + "3 38.88 23.3739 \n", + "4 38.90 23.4044 \n", + "\n", + " value_sensor measurement 3 value_sensor measurement 4 \\\n", + "0 1589.70 1400.60 \n", + "1 1591.82 1403.14 \n", + "2 1587.99 1404.20 \n", + "3 1582.79 1401.87 \n", + "4 1582.85 1406.22 \n", + "\n", + " value_sensor measurement 5 value_sensor measurement 6 \\\n", + "0 14.62 21.61 \n", + "1 14.62 21.61 \n", + "2 14.62 21.61 \n", + "3 14.62 21.61 \n", + "4 14.62 21.61 \n", + "\n", + " value_sensor measurement 7 value_sensor measurement 8 \\\n", + "0 554.36 2388.06 \n", + "1 553.75 2388.04 \n", + "2 554.26 2388.08 \n", + "3 554.45 2388.11 \n", + "4 554.00 2388.06 \n", + "\n", + " value_sensor measurement 9 \n", + "0 9046.19 \n", + "1 9044.07 \n", + "2 9052.94 \n", + "3 9049.48 \n", + "4 9055.15 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "5f521fd3", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (unstacked)\n", + "* Output: readings (without turbine_id), turbine_id\n", + "* Effect: turbine_id has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bb0bac75", + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "1009407e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "93104c3b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 1\n", + "1 1\n", + "2 1\n", + "3 1\n", + "4 1\n", + "Name: turbine_id, dtype: int64" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['turbine_id'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "83855579", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampvalue_operational setting 1value_operational setting 2value_operational setting 3value_sensor measurement 1value_sensor measurement 10value_sensor measurement 11value_sensor measurement 12value_sensor measurement 13value_sensor measurement 14...value_sensor measurement 2value_sensor measurement 20value_sensor measurement 21value_sensor measurement 3value_sensor measurement 4value_sensor measurement 5value_sensor measurement 6value_sensor measurement 7value_sensor measurement 8value_sensor measurement 9
02013-01-12 00:10:00-0.0007-0.0004100.0518.671.347.47521.662388.028138.62...641.8239.0623.41901589.701400.6014.6221.61554.362388.069046.19
12013-01-12 00:20:000.0019-0.0003100.0518.671.347.49522.282388.078131.49...642.1539.0023.42361591.821403.1414.6221.61553.752388.049044.07
22013-01-12 00:30:00-0.00430.0003100.0518.671.347.27522.422388.038133.23...642.3538.9523.34421587.991404.2014.6221.61554.262388.089052.94
32013-01-12 00:40:000.00070.0000100.0518.671.347.13522.862388.088133.83...642.3538.8823.37391582.791401.8714.6221.61554.452388.119049.48
42013-01-12 00:50:00-0.0019-0.0002100.0518.671.347.28522.192388.048133.80...642.3738.9023.40441582.851406.2214.6221.61554.002388.069055.15
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " timestamp value_operational setting 1 \\\n", + "0 2013-01-12 00:10:00 -0.0007 \n", + "1 2013-01-12 00:20:00 0.0019 \n", + "2 2013-01-12 00:30:00 -0.0043 \n", + "3 2013-01-12 00:40:00 0.0007 \n", + "4 2013-01-12 00:50:00 -0.0019 \n", + "\n", + " value_operational setting 2 value_operational setting 3 \\\n", + "0 -0.0004 100.0 \n", + "1 -0.0003 100.0 \n", + "2 0.0003 100.0 \n", + "3 0.0000 100.0 \n", + "4 -0.0002 100.0 \n", + "\n", + " value_sensor measurement 1 value_sensor measurement 10 \\\n", + "0 518.67 1.3 \n", + "1 518.67 1.3 \n", + "2 518.67 1.3 \n", + "3 518.67 1.3 \n", + "4 518.67 1.3 \n", + "\n", + " value_sensor measurement 11 value_sensor measurement 12 \\\n", + "0 47.47 521.66 \n", + "1 47.49 522.28 \n", + "2 47.27 522.42 \n", + "3 47.13 522.86 \n", + "4 47.28 522.19 \n", + "\n", + " value_sensor measurement 13 value_sensor measurement 14 ... \\\n", + "0 2388.02 8138.62 ... \n", + "1 2388.07 8131.49 ... \n", + "2 2388.03 8133.23 ... \n", + "3 2388.08 8133.83 ... \n", + "4 2388.04 8133.80 ... \n", + "\n", + " value_sensor measurement 2 value_sensor measurement 20 \\\n", + "0 641.82 39.06 \n", + "1 642.15 39.00 \n", + "2 642.35 38.95 \n", + "3 642.35 38.88 \n", + "4 642.37 38.90 \n", + "\n", + " value_sensor measurement 21 value_sensor measurement 3 \\\n", + "0 23.4190 1589.70 \n", + "1 23.4236 1591.82 \n", + "2 23.3442 1587.99 \n", + "3 23.3739 1582.79 \n", + "4 23.4044 1582.85 \n", + "\n", + " value_sensor measurement 4 value_sensor measurement 5 \\\n", + "0 1400.60 14.62 \n", + "1 1403.14 14.62 \n", + "2 1404.20 14.62 \n", + "3 1401.87 14.62 \n", + "4 1406.22 14.62 \n", + "\n", + " value_sensor measurement 6 value_sensor measurement 7 \\\n", + "0 21.61 554.36 \n", + "1 21.61 553.75 \n", + "2 21.61 554.26 \n", + "3 21.61 554.45 \n", + "4 21.61 554.00 \n", + "\n", + " value_sensor measurement 8 value_sensor measurement 9 \n", + "0 2388.06 9046.19 \n", + "1 2388.04 9044.07 \n", + "2 2388.08 9052.94 \n", + "3 2388.11 9049.48 \n", + "4 2388.06 9055.15 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "a43ffbb1", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (without turbine_id)\n", + "* Output: readings (without timestamp), timestamp\n", + "* Effect: timestamp has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "ebcad5cd", + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d497ab07", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2c3bfa0b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2013-01-12 00:10:00\n", + "1 2013-01-12 00:20:00\n", + "2 2013-01-12 00:30:00\n", + "3 2013-01-12 00:40:00\n", + "4 2013-01-12 00:50:00\n", + "Name: timestamp, dtype: datetime64[ns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['timestamp'].head()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 15, + "id": "3c837b44", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_operational setting 1value_operational setting 2value_operational setting 3value_sensor measurement 1value_sensor measurement 10value_sensor measurement 11value_sensor measurement 12value_sensor measurement 13value_sensor measurement 14value_sensor measurement 15...value_sensor measurement 2value_sensor measurement 20value_sensor measurement 21value_sensor measurement 3value_sensor measurement 4value_sensor measurement 5value_sensor measurement 6value_sensor measurement 7value_sensor measurement 8value_sensor measurement 9
0-0.0007-0.0004100.0518.671.347.47521.662388.028138.628.4195...641.8239.0623.41901589.701400.6014.6221.61554.362388.069046.19
10.0019-0.0003100.0518.671.347.49522.282388.078131.498.4318...642.1539.0023.42361591.821403.1414.6221.61553.752388.049044.07
2-0.00430.0003100.0518.671.347.27522.422388.038133.238.4178...642.3538.9523.34421587.991404.2014.6221.61554.262388.089052.94
30.00070.0000100.0518.671.347.13522.862388.088133.838.3682...642.3538.8823.37391582.791401.8714.6221.61554.452388.119049.48
4-0.0019-0.0002100.0518.671.347.28522.192388.048133.808.4294...642.3738.9023.40441582.851406.2214.6221.61554.002388.069055.15
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " value_operational setting 1 value_operational setting 2 \\\n", + "0 -0.0007 -0.0004 \n", + "1 0.0019 -0.0003 \n", + "2 -0.0043 0.0003 \n", + "3 0.0007 0.0000 \n", + "4 -0.0019 -0.0002 \n", + "\n", + " value_operational setting 3 value_sensor measurement 1 \\\n", + "0 100.0 518.67 \n", + "1 100.0 518.67 \n", + "2 100.0 518.67 \n", + "3 100.0 518.67 \n", + "4 100.0 518.67 \n", + "\n", + " value_sensor measurement 10 value_sensor measurement 11 \\\n", + "0 1.3 47.47 \n", + "1 1.3 47.49 \n", + "2 1.3 47.27 \n", + "3 1.3 47.13 \n", + "4 1.3 47.28 \n", + "\n", + " value_sensor measurement 12 value_sensor measurement 13 \\\n", + "0 521.66 2388.02 \n", + "1 522.28 2388.07 \n", + "2 522.42 2388.03 \n", + "3 522.86 2388.08 \n", + "4 522.19 2388.04 \n", + "\n", + " value_sensor measurement 14 value_sensor measurement 15 ... \\\n", + "0 8138.62 8.4195 ... \n", + "1 8131.49 8.4318 ... \n", + "2 8133.23 8.4178 ... \n", + "3 8133.83 8.3682 ... \n", + "4 8133.80 8.4294 ... \n", + "\n", + " value_sensor measurement 2 value_sensor measurement 20 \\\n", + "0 641.82 39.06 \n", + "1 642.15 39.00 \n", + "2 642.35 38.95 \n", + "3 642.35 38.88 \n", + "4 642.37 38.90 \n", + "\n", + " value_sensor measurement 21 value_sensor measurement 3 \\\n", + "0 23.4190 1589.70 \n", + "1 23.4236 1591.82 \n", + "2 23.3442 1587.99 \n", + "3 23.3739 1582.79 \n", + "4 23.4044 1582.85 \n", + "\n", + " value_sensor measurement 4 value_sensor measurement 5 \\\n", + "0 1400.60 14.62 \n", + "1 1403.14 14.62 \n", + "2 1404.20 14.62 \n", + "3 1401.87 14.62 \n", + "4 1406.22 14.62 \n", + "\n", + " value_sensor measurement 6 value_sensor measurement 7 \\\n", + "0 21.61 554.36 \n", + "1 21.61 553.75 \n", + "2 21.61 554.26 \n", + "3 21.61 554.45 \n", + "4 21.61 554.00 \n", + "\n", + " value_sensor measurement 8 value_sensor measurement 9 \n", + "0 2388.06 9046.19 \n", + "1 2388.04 9044.07 \n", + "2 2388.08 9052.94 \n", + "3 2388.11 9049.48 \n", + "4 2388.06 9055.15 \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "78cc4b36", + "metadata": {}, + "source": [ + "## sklearn.impute.SimpleImputer\n", + "\n", + "* Input: readings (unstacked, no turbine_id, no timestamp)\n", + "* Output: readings (imputed, numpy array)\n", + "* Effect: readings have been imputed and converted to numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "3ad08e01", + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "19c4ee50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "af5f9dc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-7.00000e-04, -4.00000e-04, 1.00000e+02, 5.18670e+02,\n", + " 1.30000e+00, 4.74700e+01, 5.21660e+02, 2.38802e+03,\n", + " 8.13862e+03, 8.41950e+00, 3.00000e-02, 3.92000e+02,\n", + " 2.38800e+03, 1.00000e+02, 6.41820e+02, 3.90600e+01,\n", + " 2.34190e+01, 1.58970e+03, 1.40060e+03, 1.46200e+01,\n", + " 2.16100e+01, 5.54360e+02, 2.38806e+03, 
9.04619e+03],\n", + " [ 1.90000e-03, -3.00000e-04, 1.00000e+02, 5.18670e+02,\n", + " 1.30000e+00, 4.74900e+01, 5.22280e+02, 2.38807e+03,\n", + " 8.13149e+03, 8.43180e+00, 3.00000e-02, 3.92000e+02,\n", + " 2.38800e+03, 1.00000e+02, 6.42150e+02, 3.90000e+01,\n", + " 2.34236e+01, 1.59182e+03, 1.40314e+03, 1.46200e+01,\n", + " 2.16100e+01, 5.53750e+02, 2.38804e+03, 9.04407e+03],\n", + " [-4.30000e-03, 3.00000e-04, 1.00000e+02, 5.18670e+02,\n", + " 1.30000e+00, 4.72700e+01, 5.22420e+02, 2.38803e+03,\n", + " 8.13323e+03, 8.41780e+00, 3.00000e-02, 3.90000e+02,\n", + " 2.38800e+03, 1.00000e+02, 6.42350e+02, 3.89500e+01,\n", + " 2.33442e+01, 1.58799e+03, 1.40420e+03, 1.46200e+01,\n", + " 2.16100e+01, 5.54260e+02, 2.38808e+03, 9.05294e+03],\n", + " [ 7.00000e-04, 0.00000e+00, 1.00000e+02, 5.18670e+02,\n", + " 1.30000e+00, 4.71300e+01, 5.22860e+02, 2.38808e+03,\n", + " 8.13383e+03, 8.36820e+00, 3.00000e-02, 3.92000e+02,\n", + " 2.38800e+03, 1.00000e+02, 6.42350e+02, 3.88800e+01,\n", + " 2.33739e+01, 1.58279e+03, 1.40187e+03, 1.46200e+01,\n", + " 2.16100e+01, 5.54450e+02, 2.38811e+03, 9.04948e+03],\n", + " [-1.90000e-03, -2.00000e-04, 1.00000e+02, 5.18670e+02,\n", + " 1.30000e+00, 4.72800e+01, 5.22190e+02, 2.38804e+03,\n", + " 8.13380e+03, 8.42940e+00, 3.00000e-02, 3.93000e+02,\n", + " 2.38800e+03, 1.00000e+02, 6.42370e+02, 3.89000e+01,\n", + " 2.34044e+01, 1.58285e+03, 1.40622e+03, 1.46200e+01,\n", + " 2.16100e+01, 5.54000e+02, 2.38806e+03, 9.05515e+03]])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "id": "1e0df4b2", + "metadata": {}, + "source": [ + "## sklearn.preprocessing.MinMaxScaler\n", + "\n", + "* Input: readings (imputed, array)\n", + "* Output: readings (scaled, array)\n", + "* Effect: readings have been scaled to [-1, 1] range" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "f50662d2", + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "37bf8d65", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "73c5d941", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.08045977, -0.69230769, -1. , -1. , -1. ,\n", + " -0.22543353, 0.17159763, -0.58823529, -0.60078439, -0.2720277 ,\n", + " -1. , -0.33333333, -1. , -1. , -0.59411765,\n", + " 0.42635659, 0.40377157, -0.13682891, -0.38048616, -1. ,\n", + " 1. , 0.45249597, -0.49253731, -0.78048999],\n", + " [ 0.2183908 , -0.53846154, -1. , -1. , -1. ,\n", + " -0.20231214, 0.41617357, -0.44117647, -0.674373 , -0.17737591,\n", + " -1. , -0.33333333, -1. , -1. , -0.4 ,\n", + " 0.33333333, 0.41607597, -0.04825569, -0.29473329, -1. ,\n", + " 1. , 0.25603865, -0.55223881, -0.79951539],\n", + " [-0.49425287, 0.38461538, -1. , -1. , -1. ,\n", + " -0.4566474 , 0.47140039, -0.55882353, -0.65641449, -0.28510966,\n", + " -1. , -0.66666667, -1. , -1. , -0.28235294,\n", + " 0.25581395, 0.20369132, -0.2082724 , -0.25894666, -1. ,\n", + " 1. , 0.42028986, -0.43283582, -0.71991385],\n", + " [ 0.08045977, -0.07692308, -1. , -1. , -1. 
,\n", + " -0.61849711, 0.64497041, -0.41176471, -0.6502219 , -0.66679492,\n", + " -1. , -0.33333333, -1. , -1. , -0.28235294,\n", + " 0.14728682, 0.28313495, -0.42552747, -0.33760972, -1. ,\n", + " 1. , 0.48148148, -0.34328358, -0.75096473],\n", + " [-0.2183908 , -0.38461538, -1. , -1. , -1. ,\n", + " -0.44508671, 0.38067061, -0.52941176, -0.65053153, -0.19584456,\n", + " -1. , -0.16666667, -1. , -1. , -0.27058824,\n", + " 0.17829457, 0.36471847, -0.42302068, -0.19074949, -1. ,\n", + " 1. , 0.33655395, -0.49253731, -0.70008077]])" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "id": "e483b0ae", + "metadata": {}, + "source": [ + "## pandas.DataFrame\n", + "\n", + "* Input: readings (scaled, array)\n", + "* Output: readings (dataframe)\n", + "* Effect: readings have been converted into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "4722001e", + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "34b5d2ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "011b9c51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...14151617181920212223
0-0.080460-0.692308-1.0-1.0-1.0-0.2254340.171598-0.588235-0.600784-0.272028...-0.5941180.4263570.403772-0.136829-0.380486-1.01.00.452496-0.492537-0.780490
10.218391-0.538462-1.0-1.0-1.0-0.2023120.416174-0.441176-0.674373-0.177376...-0.4000000.3333330.416076-0.048256-0.294733-1.01.00.256039-0.552239-0.799515
2-0.4942530.384615-1.0-1.0-1.0-0.4566470.471400-0.558824-0.656414-0.285110...-0.2823530.2558140.203691-0.208272-0.258947-1.01.00.420290-0.432836-0.719914
30.080460-0.076923-1.0-1.0-1.0-0.6184970.644970-0.411765-0.650222-0.666795...-0.2823530.1472870.283135-0.425527-0.337610-1.01.00.481481-0.343284-0.750965
4-0.218391-0.384615-1.0-1.0-1.0-0.4450870.380671-0.529412-0.650532-0.195845...-0.2705880.1782950.364718-0.423021-0.190749-1.01.00.336554-0.492537-0.700081
\n", + "

5 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 -0.080460 -0.692308 -1.0 -1.0 -1.0 -0.225434 0.171598 -0.588235 -0.600784 \n", + "1 0.218391 -0.538462 -1.0 -1.0 -1.0 -0.202312 0.416174 -0.441176 -0.674373 \n", + "2 -0.494253 0.384615 -1.0 -1.0 -1.0 -0.456647 0.471400 -0.558824 -0.656414 \n", + "3 0.080460 -0.076923 -1.0 -1.0 -1.0 -0.618497 0.644970 -0.411765 -0.650222 \n", + "4 -0.218391 -0.384615 -1.0 -1.0 -1.0 -0.445087 0.380671 -0.529412 -0.650532 \n", + "\n", + " 9 ... 14 15 16 17 18 19 20 \\\n", + "0 -0.272028 ... -0.594118 0.426357 0.403772 -0.136829 -0.380486 -1.0 1.0 \n", + "1 -0.177376 ... -0.400000 0.333333 0.416076 -0.048256 -0.294733 -1.0 1.0 \n", + "2 -0.285110 ... -0.282353 0.255814 0.203691 -0.208272 -0.258947 -1.0 1.0 \n", + "3 -0.666795 ... -0.282353 0.147287 0.283135 -0.425527 -0.337610 -1.0 1.0 \n", + "4 -0.195845 ... -0.270588 0.178295 0.364718 -0.423021 -0.190749 -1.0 1.0 \n", + "\n", + " 21 22 23 \n", + "0 0.452496 -0.492537 -0.780490 \n", + "1 0.256039 -0.552239 -0.799515 \n", + "2 0.420290 -0.432836 -0.719914 \n", + "3 0.481481 -0.343284 -0.750965 \n", + "4 0.336554 -0.492537 -0.700081 \n", + "\n", + "[5 rows x 24 columns]" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "9539c0e6", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe)\n", + "* Output: readings (dataframe with turbine_id)\n", + "* Effect: turbine_id has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d58c17c1", + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "b5b62c52", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "8bedb44e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...151617181920212223turbine_id
0-0.080460-0.692308-1.0-1.0-1.0-0.2254340.171598-0.588235-0.600784-0.272028...0.4263570.403772-0.136829-0.380486-1.01.00.452496-0.492537-0.7804901
10.218391-0.538462-1.0-1.0-1.0-0.2023120.416174-0.441176-0.674373-0.177376...0.3333330.416076-0.048256-0.294733-1.01.00.256039-0.552239-0.7995151
2-0.4942530.384615-1.0-1.0-1.0-0.4566470.471400-0.558824-0.656414-0.285110...0.2558140.203691-0.208272-0.258947-1.01.00.420290-0.432836-0.7199141
30.080460-0.076923-1.0-1.0-1.0-0.6184970.644970-0.411765-0.650222-0.666795...0.1472870.283135-0.425527-0.337610-1.01.00.481481-0.343284-0.7509651
4-0.218391-0.384615-1.0-1.0-1.0-0.4450870.380671-0.529412-0.650532-0.195845...0.1782950.364718-0.423021-0.190749-1.01.00.336554-0.492537-0.7000811
\n", + "

5 rows × 25 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 -0.080460 -0.692308 -1.0 -1.0 -1.0 -0.225434 0.171598 -0.588235 -0.600784 \n", + "1 0.218391 -0.538462 -1.0 -1.0 -1.0 -0.202312 0.416174 -0.441176 -0.674373 \n", + "2 -0.494253 0.384615 -1.0 -1.0 -1.0 -0.456647 0.471400 -0.558824 -0.656414 \n", + "3 0.080460 -0.076923 -1.0 -1.0 -1.0 -0.618497 0.644970 -0.411765 -0.650222 \n", + "4 -0.218391 -0.384615 -1.0 -1.0 -1.0 -0.445087 0.380671 -0.529412 -0.650532 \n", + "\n", + " 9 ... 15 16 17 18 19 20 21 \\\n", + "0 -0.272028 ... 0.426357 0.403772 -0.136829 -0.380486 -1.0 1.0 0.452496 \n", + "1 -0.177376 ... 0.333333 0.416076 -0.048256 -0.294733 -1.0 1.0 0.256039 \n", + "2 -0.285110 ... 0.255814 0.203691 -0.208272 -0.258947 -1.0 1.0 0.420290 \n", + "3 -0.666795 ... 0.147287 0.283135 -0.425527 -0.337610 -1.0 1.0 0.481481 \n", + "4 -0.195845 ... 0.178295 0.364718 -0.423021 -0.190749 -1.0 1.0 0.336554 \n", + "\n", + " 22 23 turbine_id \n", + "0 -0.492537 -0.780490 1 \n", + "1 -0.552239 -0.799515 1 \n", + "2 -0.432836 -0.719914 1 \n", + "3 -0.343284 -0.750965 1 \n", + "4 -0.492537 -0.700081 1 \n", + "\n", + "[5 rows x 25 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "f2849d45", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe with turbine_id)\n", + "* Output: readings (dataframe with turbine_id and timestamp)\n", + "* Effect: timestamp has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "9896ef19", + "metadata": {}, + "outputs": [], + "source": [ + "step = 7\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "384e4e91", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "7dcc2b2c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...1617181920212223turbine_idtimestamp
0-0.080460-0.692308-1.0-1.0-1.0-0.2254340.171598-0.588235-0.600784-0.272028...0.403772-0.136829-0.380486-1.01.00.452496-0.492537-0.78049012013-01-12 00:10:00
10.218391-0.538462-1.0-1.0-1.0-0.2023120.416174-0.441176-0.674373-0.177376...0.416076-0.048256-0.294733-1.01.00.256039-0.552239-0.79951512013-01-12 00:20:00
2-0.4942530.384615-1.0-1.0-1.0-0.4566470.471400-0.558824-0.656414-0.285110...0.203691-0.208272-0.258947-1.01.00.420290-0.432836-0.71991412013-01-12 00:30:00
30.080460-0.076923-1.0-1.0-1.0-0.6184970.644970-0.411765-0.650222-0.666795...0.283135-0.425527-0.337610-1.01.00.481481-0.343284-0.75096512013-01-12 00:40:00
4-0.218391-0.384615-1.0-1.0-1.0-0.4450870.380671-0.529412-0.650532-0.195845...0.364718-0.423021-0.190749-1.01.00.336554-0.492537-0.70008112013-01-12 00:50:00
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 7 8 \\\n", + "0 -0.080460 -0.692308 -1.0 -1.0 -1.0 -0.225434 0.171598 -0.588235 -0.600784 \n", + "1 0.218391 -0.538462 -1.0 -1.0 -1.0 -0.202312 0.416174 -0.441176 -0.674373 \n", + "2 -0.494253 0.384615 -1.0 -1.0 -1.0 -0.456647 0.471400 -0.558824 -0.656414 \n", + "3 0.080460 -0.076923 -1.0 -1.0 -1.0 -0.618497 0.644970 -0.411765 -0.650222 \n", + "4 -0.218391 -0.384615 -1.0 -1.0 -1.0 -0.445087 0.380671 -0.529412 -0.650532 \n", + "\n", + " 9 ... 16 17 18 19 20 21 22 \\\n", + "0 -0.272028 ... 0.403772 -0.136829 -0.380486 -1.0 1.0 0.452496 -0.492537 \n", + "1 -0.177376 ... 0.416076 -0.048256 -0.294733 -1.0 1.0 0.256039 -0.552239 \n", + "2 -0.285110 ... 0.203691 -0.208272 -0.258947 -1.0 1.0 0.420290 -0.432836 \n", + "3 -0.666795 ... 0.283135 -0.425527 -0.337610 -1.0 1.0 0.481481 -0.343284 \n", + "4 -0.195845 ... 0.364718 -0.423021 -0.190749 -1.0 1.0 0.336554 -0.492537 \n", + "\n", + " 23 turbine_id timestamp \n", + "0 -0.780490 1 2013-01-12 00:10:00 \n", + "1 -0.799515 1 2013-01-12 00:20:00 \n", + "2 -0.719914 1 2013-01-12 00:30:00 \n", + "3 -0.750965 1 2013-01-12 00:40:00 \n", + "4 -0.700081 1 2013-01-12 00:50:00 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "id": "087b270d", + "metadata": {}, + "source": [ + "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", + "\n", + "* Input: X, readings (dataframe with turbine_id and timestamp)\n", + "* Output: X\n", + "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", + " (window_size x num_signals) for each one of the target times." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b4ff2d0a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline._pipeline.get_hyperparameters()[\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "2c8fd174", + "metadata": {}, + "outputs": [], + "source": [ + "step = 8\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "b051da01", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a802d22b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(33727, 26)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "cc53012b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(18131,)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['y'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "b1212aaf", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"(18131, 24, 24)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "87abb56d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0.2183908 , -0.53846154, -1. , -1. , -1. ,\n", + " -0.20231214, 0.41617357, -0.44117647, -0.674373 , -0.17737591,\n", + " -1. , -0.33333333, -1. , -1. , -0.4 ,\n", + " 0.33333333, 0.41607597, -0.04825569, -0.29473329, -1. ,\n", + " 1. , 0.25603865, -0.55223881, -0.79951539],\n", + " [-0.49425287, 0.38461538, -1. , -1. , -1. ,\n", + " -0.4566474 , 0.47140039, -0.55882353, -0.65641449, -0.28510966,\n", + " -1. , -0.66666667, -1. , -1. , -0.28235294,\n", + " 0.25581395, 0.20369132, -0.2082724 , -0.25894666, -1. ,\n", + " 1. , 0.42028986, -0.43283582, -0.71991385],\n", + " [ 0.08045977, -0.07692308, -1. , -1. , -1. ,\n", + " -0.61849711, 0.64497041, -0.41176471, -0.6502219 , -0.66679492,\n", + " -1. , -0.33333333, -1. , -1. , -0.28235294,\n", + " 0.14728682, 0.28313495, -0.42552747, -0.33760972, -1. ,\n", + " 1. , 0.48148148, -0.34328358, -0.75096473]])" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'][0][:3]" + ] + }, + { + "cell_type": "markdown", + "id": "8876f20e", + "metadata": {}, + "source": [ + "## keras.Sequential.LSTMTimeSeriesRegressor\n", + "\n", + "* Input: X, y\n", + "* Output: \n", + "* Effect: LSTM has been fitted." + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "561c3e09", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-02-01 10:08:21.044547: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-02-01 10:08:21.080727: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f8579596430 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", + "2022-02-01 10:08:21.080742: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + ] + } + ], + "source": [ + "step = 9\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/pipelines/lstm_with_unstack.ipynb b/tutorials/pipelines/lstm_with_unstack.ipynb new file mode 100644 index 0000000..799b90e --- /dev/null +++ b/tutorials/pipelines/lstm_with_unstack.ipynb @@ -0,0 +1,2249 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# lstm_with_unstack" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from draco.demo import load_demo\n", + "\n", + "target_times, readings = load_demo()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_name = 'lstm_with_unstack'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from draco.pipeline import DracoPipeline\n", + "\n", + "pipeline = DracoPipeline(pipeline_name)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['mlblocks.MLPipeline',\n", + " 'pandas.DataFrame.pop',\n", + " 'pandas.DataFrame.pop',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'sklearn.preprocessing.MinMaxScaler',\n", + " 'pandas.DataFrame',\n", + " 'pandas.DataFrame.set',\n", + " 'pandas.DataFrame.set',\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'keras.Sequential.LSTMTimeSeriesClassifier']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline.template['primitives']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Step by Step execution" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Input Data" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", + "
" + ], + "text/plain": [ + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readings.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", + "
" + ], + "text/plain": [ + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation (part of Draco Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlblocks.MLPipeline\n", + "\n", + "### pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index\n", + " \n", + "### pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline.fit(target_times, readings, output_=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0...11.73131020.055.055.047.058.045.058.047.0356.0
1T0012013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0...10.23131420.058.063.062.067.055.061.042.0400.0
2T0012013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0...9.53131822.068.061.067.066.046.055.045.0402.0
3T0012013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0...10.53132179.043.051.053.062.053.060.045.0357.0
4T0012013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", + "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", + "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", + "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", + "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", + "\n", + " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", + "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", + "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", + "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", + "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", + "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (unstacked)\n", + "* Output: readings (without turbine_id), turbine_id\n", + "* Effect: turbine_id has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 T001\n", + "1 T001\n", + "2 T001\n", + "3 T001\n", + "4 T001\n", + "Name: turbine_id, dtype: object" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['turbine_id'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
timestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
02013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0695000.0...11.73131020.055.055.047.058.045.058.047.0356.0
12013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0695063.0...10.23131420.058.063.062.067.055.061.042.0400.0
22013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0695124.0...9.53131822.068.061.067.066.046.055.045.0402.0
32013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0695175.0...10.53132179.043.051.053.062.053.060.045.0357.0
42013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0695226.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " timestamp value_S01 value_S02 value_S03 value_S04 value_S05 \\\n", + "0 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 273.0 \n", + "1 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 331.0 \n", + "2 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 275.0 \n", + "3 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 317.0 \n", + "4 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 262.0 \n", + "\n", + " value_S06 value_S07 value_S08 value_S09 ... value_S17 value_S18 \\\n", + "0 342.0 280.0 3197842.0 695000.0 ... 11.7 3131020.0 \n", + "1 360.0 249.0 3197900.0 695063.0 ... 10.2 3131420.0 \n", + "2 335.0 270.0 3197968.0 695124.0 ... 9.5 3131822.0 \n", + "3 354.0 271.0 3198011.0 695175.0 ... 10.5 3132179.0 \n", + "4 246.0 212.0 3198056.0 695226.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.pop\n", + "\n", + "* Input: readings (without turbine_id)\n", + "* Output: readings (without timestamp), timestamp\n", + "* Effect: timestamp has been popped from readings" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "step = 2\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 2013-01-10 00:00:00\n", + "1 2013-01-10 00:10:00\n", + "2 2013-01-10 00:20:00\n", + "3 2013-01-10 00:30:00\n", + "4 2013-01-10 00:40:00\n", + "Name: timestamp, dtype: datetime64[ns]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['timestamp'].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09value_S10...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0323.0320.0284.0348.0273.0342.0280.03197842.0695000.03348234.0...11.73131020.055.055.047.058.045.058.047.0356.0
1346.0384.0367.0411.0331.0360.0249.03197900.0695063.03348296.0...10.23131420.058.063.062.067.055.061.042.0400.0
2407.0363.0407.0393.0275.0335.0270.03197968.0695124.03348363.0...9.53131822.068.061.067.066.046.055.045.0402.0
3257.0307.0315.0361.0317.0354.0271.03198011.0695175.03348416.0...10.53132179.043.051.053.062.053.060.045.0357.0
4267.0309.0314.0355.0262.0246.0212.03198056.0695226.03348470.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 323.0 320.0 284.0 348.0 273.0 342.0 \n", + "1 346.0 384.0 367.0 411.0 331.0 360.0 \n", + "2 407.0 363.0 407.0 393.0 275.0 335.0 \n", + "3 257.0 307.0 315.0 361.0 317.0 354.0 \n", + "4 267.0 309.0 314.0 355.0 262.0 246.0 \n", + "\n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 value_S18 \\\n", + "0 280.0 3197842.0 695000.0 3348234.0 ... 11.7 3131020.0 \n", + "1 249.0 3197900.0 695063.0 3348296.0 ... 10.2 3131420.0 \n", + "2 270.0 3197968.0 695124.0 3348363.0 ... 9.5 3131822.0 \n", + "3 271.0 3198011.0 695175.0 3348416.0 ... 10.5 3132179.0 \n", + "4 212.0 3198056.0 695226.0 3348470.0 ... 9.6 3132501.0 \n", + "\n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + "\n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.impute.SimpleImputer\n", + "\n", + "* Input: readings (unstacked, no turbine_id, no timestamp)\n", + "* Output: readings (imputed, numpy array)\n", + "* Effect: readings have been imputed and converted to numpy array" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "step = 3\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[3.230000e+02, 3.200000e+02, 2.840000e+02, 3.480000e+02,\n", + " 2.730000e+02, 3.420000e+02, 2.800000e+02, 3.197842e+06,\n", + " 6.950000e+05, 3.348234e+06, 3.436762e+06, 3.322362e+06,\n", + " 3.357952e+06, 3.223797e+06, 8.300000e+00, 6.000000e+00,\n", + " 1.170000e+01, 3.131020e+06, 5.500000e+01, 5.500000e+01,\n", + " 4.700000e+01, 5.800000e+01, 4.500000e+01, 5.800000e+01,\n", + " 4.700000e+01, 3.560000e+02],\n", + " [3.460000e+02, 3.840000e+02, 3.670000e+02, 4.110000e+02,\n", + " 3.310000e+02, 3.600000e+02, 2.490000e+02, 3.197900e+06,\n", + " 6.950630e+05, 3.348296e+06, 3.436829e+06, 3.322417e+06,\n", + " 3.358013e+06, 3.223839e+06, 7.600000e+00, 5.000000e+00,\n", + " 1.020000e+01, 3.131420e+06, 5.800000e+01, 6.300000e+01,\n", + " 6.200000e+01, 6.700000e+01, 5.500000e+01, 6.100000e+01,\n", + " 4.200000e+01, 4.000000e+02],\n", + " [4.070000e+02, 3.630000e+02, 4.070000e+02, 3.930000e+02,\n", + " 2.750000e+02, 3.350000e+02, 2.700000e+02, 3.197968e+06,\n", + " 6.951240e+05, 3.348363e+06, 3.436895e+06, 3.322463e+06,\n", + " 3.358068e+06, 3.223884e+06, 7.800000e+00, 5.700000e+00,\n", + " 9.500000e+00, 3.131822e+06, 6.800000e+01, 6.100000e+01,\n", + " 6.700000e+01, 6.600000e+01, 4.600000e+01, 5.500000e+01,\n", + " 4.500000e+01, 4.020000e+02],\n", + " 
[2.570000e+02, 3.070000e+02, 3.150000e+02, 3.610000e+02,\n", + " 3.170000e+02, 3.540000e+02, 2.710000e+02, 3.198011e+06,\n", + " 6.951750e+05, 3.348416e+06, 3.436957e+06, 3.322516e+06,\n", + " 3.358128e+06, 3.223929e+06, 8.600000e+00, 6.600000e+00,\n", + " 1.050000e+01, 3.132179e+06, 4.300000e+01, 5.100000e+01,\n", + " 5.300000e+01, 6.200000e+01, 5.300000e+01, 6.000000e+01,\n", + " 4.500000e+01, 3.570000e+02],\n", + " [2.670000e+02, 3.090000e+02, 3.140000e+02, 3.550000e+02,\n", + " 2.620000e+02, 2.460000e+02, 2.120000e+02, 3.198056e+06,\n", + " 6.952260e+05, 3.348470e+06, 3.437016e+06, 3.322559e+06,\n", + " 3.358169e+06, 3.223965e+06, 7.500000e+00, 5.900000e+00,\n", + " 9.600000e+00, 3.132501e+06, 4.500000e+01, 5.100000e+01,\n", + " 5.400000e+01, 5.900000e+01, 4.300000e+01, 4.100000e+01,\n", + " 3.600000e+01, 3.220000e+02]])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## sklearn.preprocessing.MinMaxScaler\n", + "\n", + "* Input: readings (imputed, array)\n", + "* Output: readings (scaled, array)\n", + "* Effect: readings have been scaled to [-1, 1] range" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "step = 4\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.23563892, -0.24267292, -0.3286385 , -0.17702227, -0.35287222,\n", + " -0.19248826, -0.3317757 , -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. 
, -0.11702128,\n", + " -0.24050633, -0.25714286, -0.37378787, -0.22758621, -0.22758621,\n", + " -0.31972789, -0.1862069 , -0.36986301, -0.1862069 , -0.33793103,\n", + " -0.26141079],\n", + " [-0.18171161, -0.0926143 , -0.13380282, -0.02930832, -0.21688159,\n", + " -0.15023474, -0.40420561, -0.99995911, -0.99995779, -0.99995941,\n", + " -0.99995718, -0.99996326, -0.99996042, -0.99997164, -0.19148936,\n", + " -0.36708861, -0.35238095, -0.37370786, -0.1862069 , -0.11724138,\n", + " -0.11564626, -0.06206897, -0.23287671, -0.14482759, -0.40689655,\n", + " -0.17012448],\n", + " [-0.03868699, -0.14185229, -0.0399061 , -0.07151231, -0.34818288,\n", + " -0.20892019, -0.35514019, -0.99991116, -0.99991693, -0.99991555,\n", + " -0.999915 , -0.99993254, -0.99992474, -0.99994125, -0.17021277,\n", + " -0.27848101, -0.3968254 , -0.37362746, -0.04827586, -0.14482759,\n", + " -0.04761905, -0.07586207, -0.35616438, -0.22758621, -0.36551724,\n", + " -0.1659751 ],\n", + " [-0.39038687, -0.27315358, -0.25586854, -0.14654162, -0.24970692,\n", + " -0.16431925, -0.35280374, -0.99988085, -0.99988276, -0.99988086,\n", + " -0.99987538, -0.99989714, -0.99988581, -0.99991086, -0.08510638,\n", + " -0.16455696, -0.33333333, -0.37355606, -0.39310345, -0.28275862,\n", + " -0.23809524, -0.13103448, -0.26027397, -0.15862069, -0.36551724,\n", + " -0.2593361 ],\n", + " [-0.36694021, -0.26846424, -0.25821596, -0.16060961, -0.37866354,\n", + " -0.41784038, -0.49065421, -0.99984912, -0.99984859, -0.99984551,\n", + " -0.99983767, -0.99986841, -0.99985921, -0.99988655, -0.20212766,\n", + " -0.25316456, -0.39047619, -0.37349166, -0.36551724, -0.28275862,\n", + " -0.2244898 , -0.17241379, -0.39726027, -0.42068966, -0.48965517,\n", + " -0.33195021]])" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'][0:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame\n", + "\n", + "* Input: readings (scaled, array)\n", + "* Output: readings (dataframe)\n", + "* Effect: readings have been converted into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "step = 5\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...16171819202122232425
0-0.235639-0.242673-0.328638-0.177022-0.352872-0.192488-0.331776-1.000000-1.000000-1.000000...-0.257143-0.373788-0.227586-0.227586-0.319728-0.186207-0.369863-0.186207-0.337931-0.261411
1-0.181712-0.092614-0.133803-0.029308-0.216882-0.150235-0.404206-0.999959-0.999958-0.999959...-0.352381-0.373708-0.186207-0.117241-0.115646-0.062069-0.232877-0.144828-0.406897-0.170124
2-0.038687-0.141852-0.039906-0.071512-0.348183-0.208920-0.355140-0.999911-0.999917-0.999916...-0.396825-0.373627-0.048276-0.144828-0.047619-0.075862-0.356164-0.227586-0.365517-0.165975
3-0.390387-0.273154-0.255869-0.146542-0.249707-0.164319-0.352804-0.999881-0.999883-0.999881...-0.333333-0.373556-0.393103-0.282759-0.238095-0.131034-0.260274-0.158621-0.365517-0.259336
4-0.366940-0.268464-0.258216-0.160610-0.378664-0.417840-0.490654-0.999849-0.999849-0.999846...-0.390476-0.373492-0.365517-0.282759-0.224490-0.172414-0.397260-0.420690-0.489655-0.331950
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "\n", + " 7 8 9 ... 16 17 18 19 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.257143 -0.373788 -0.227586 -0.227586 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.352381 -0.373708 -0.186207 -0.117241 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.396825 -0.373627 -0.048276 -0.144828 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.333333 -0.373556 -0.393103 -0.282759 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.390476 -0.373492 -0.365517 -0.282759 \n", + "\n", + " 20 21 22 23 24 25 \n", + "0 -0.319728 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 \n", + "1 -0.115646 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 \n", + "2 -0.047619 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 \n", + "3 -0.238095 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 \n", + "4 -0.224490 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe)\n", + "* Output: readings (dataframe with turbine_id)\n", + "* Effect: turbine_id has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "step = 6\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...171819202122232425turbine_id
0-0.235639-0.242673-0.328638-0.177022-0.352872-0.192488-0.331776-1.000000-1.000000-1.000000...-0.373788-0.227586-0.227586-0.319728-0.186207-0.369863-0.186207-0.337931-0.261411T001
1-0.181712-0.092614-0.133803-0.029308-0.216882-0.150235-0.404206-0.999959-0.999958-0.999959...-0.373708-0.186207-0.117241-0.115646-0.062069-0.232877-0.144828-0.406897-0.170124T001
2-0.038687-0.141852-0.039906-0.071512-0.348183-0.208920-0.355140-0.999911-0.999917-0.999916...-0.373627-0.048276-0.144828-0.047619-0.075862-0.356164-0.227586-0.365517-0.165975T001
3-0.390387-0.273154-0.255869-0.146542-0.249707-0.164319-0.352804-0.999881-0.999883-0.999881...-0.373556-0.393103-0.282759-0.238095-0.131034-0.260274-0.158621-0.365517-0.259336T001
4-0.366940-0.268464-0.258216-0.160610-0.378664-0.417840-0.490654-0.999849-0.999849-0.999846...-0.373492-0.365517-0.282759-0.224490-0.172414-0.397260-0.420690-0.489655-0.331950T001
\n", + "

5 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "\n", + " 7 8 9 ... 17 18 19 20 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373788 -0.227586 -0.227586 -0.319728 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.373708 -0.186207 -0.117241 -0.115646 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.373627 -0.048276 -0.144828 -0.047619 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.373556 -0.393103 -0.282759 -0.238095 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.373492 -0.365517 -0.282759 -0.224490 \n", + "\n", + " 21 22 23 24 25 turbine_id \n", + "0 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 T001 \n", + "1 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 T001 \n", + "2 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 T001 \n", + "3 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 T001 \n", + "4 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 T001 \n", + "\n", + "[5 rows x 27 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.set\n", + "\n", + "* Input: readings (dataframe with turbine_id)\n", + "* Output: readings (dataframe with turbine_id and timestamp)\n", + "* Effect: timestamp has been set as a readings column" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "step = 7\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
0123456789...1819202122232425turbine_idtimestamp
0-0.235639-0.242673-0.328638-0.177022-0.352872-0.192488-0.331776-1.000000-1.000000-1.000000...-0.227586-0.227586-0.319728-0.186207-0.369863-0.186207-0.337931-0.261411T0012013-01-10 00:00:00
1-0.181712-0.092614-0.133803-0.029308-0.216882-0.150235-0.404206-0.999959-0.999958-0.999959...-0.186207-0.117241-0.115646-0.062069-0.232877-0.144828-0.406897-0.170124T0012013-01-10 00:10:00
2-0.038687-0.141852-0.039906-0.071512-0.348183-0.208920-0.355140-0.999911-0.999917-0.999916...-0.048276-0.144828-0.047619-0.075862-0.356164-0.227586-0.365517-0.165975T0012013-01-10 00:20:00
3-0.390387-0.273154-0.255869-0.146542-0.249707-0.164319-0.352804-0.999881-0.999883-0.999881...-0.393103-0.282759-0.238095-0.131034-0.260274-0.158621-0.365517-0.259336T0012013-01-10 00:30:00
4-0.366940-0.268464-0.258216-0.160610-0.378664-0.417840-0.490654-0.999849-0.999849-0.999846...-0.365517-0.282759-0.224490-0.172414-0.397260-0.420690-0.489655-0.331950T0012013-01-10 00:40:00
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "\n", + " 7 8 9 ... 18 19 20 21 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.227586 -0.227586 -0.319728 -0.186207 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.186207 -0.117241 -0.115646 -0.062069 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.048276 -0.144828 -0.047619 -0.075862 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.393103 -0.282759 -0.238095 -0.131034 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.365517 -0.282759 -0.224490 -0.172414 \n", + "\n", + " 22 23 24 25 turbine_id timestamp \n", + "0 -0.369863 -0.186207 -0.337931 -0.261411 T001 2013-01-10 00:00:00 \n", + "1 -0.232877 -0.144828 -0.406897 -0.170124 T001 2013-01-10 00:10:00 \n", + "2 -0.356164 -0.227586 -0.365517 -0.165975 T001 2013-01-10 00:20:00 \n", + "3 -0.260274 -0.158621 -0.365517 -0.259336 T001 2013-01-10 00:30:00 \n", + "4 -0.397260 -0.420690 -0.489655 -0.331950 T001 2013-01-10 00:40:00 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", + "\n", + "* Input: X, readings (dataframe with turbine_id and timestamp)\n", + "* Output: X\n", + "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", + " (window_size x num_signals) for each one of the target times." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pipeline._pipeline.get_hyperparameters()[\n", + " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "step = 8\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(51121, 28)" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353,)" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['y'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(353, 24, 26)" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'].shape" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[-0.66002345, -0.57327081, -0.64084507, -0.57796014, -0.6014068 ,\n", + " -0.56103286, -0.55140187, -0.9928135 , -0.99291267, -0.99315058,\n", + " -0.99304288, -0.99346346, -0.99352632, -0.99395333, -0.42553191,\n", + " -0.41772152, -0.58730159, -0.35996294, -0.66896552, -0.57241379,\n", + " -0.61904762, -0.5862069 , -0.60273973, -0.55862069, -0.55862069,\n", + " -0.59751037],\n", + " [-0.2989449 , -0.38569754, -0.48591549, -0.47713951, -0.66705744,\n", + " -0.5915493 , -0.77336449, -0.99278389, -0.9928852 , -0.99312701,\n", + " -0.99301988, -0.9934481 , -0.9935075 , -0.9939459 , -0.39361702,\n", + " -0.40506329, -0.54285714, -0.35992014, -0.40689655, -0.42068966,\n", + " -0.46938776, -0.48965517, -0.67123288, -0.5862069 , -0.83448276,\n", + " -0.5560166 ],\n", + " [-0.33645955, -0.40679953, -0.39906103, -0.38569754, -0.56154748,\n", + " -0.43192488, -0.45560748, -0.99275498, -0.9928584 , -0.99310017,\n", + " -0.99299431, -0.99342739, -0.99348349, -0.99392294, -0.29787234,\n", + " -0.3164557 , -0.49206349, -0.35986854, -0.42068966, -0.43448276,\n", + " -0.40136054, -0.43448276, -0.56164384, -0.47586207, -0.51724138,\n", + " -0.46473029]])" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['X'][0][:3]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## keras.Sequential.LSTMTimeSeriesClassifier\n", + "\n", + "* Input: X, y\n", + "* Output: \n", + "* Effect: LSTM has been fitted." 
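The primitive assembles and fits its network internally from hyperparameters, and the exact architecture is not shown in this notebook. As a rough, hand-rolled approximation, a minimal equivalent might look like the sketch below; the layer size and dropout rate are illustrative assumptions, not the primitive's defaults.

```python
# Minimal stand-in for keras.Sequential.LSTMTimeSeriesClassifier: one LSTM
# layer over the (window_size, n_signals) sequences, followed by a sigmoid
# unit for the binary target. Sizes here are assumptions for illustration.
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

window_size, n_signals = 24, 26

model = Sequential([
    LSTM(80, input_shape=(window_size, n_signals)),
    Dropout(0.3),
    Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy',
              metrics=['accuracy'])

# Random stand-in data with the same shapes as the context built above:
# X -> (353, 24, 26), y -> (353,).
X = np.random.random((353, window_size, n_signals)).astype('float32')
y = np.random.randint(0, 2, size=353)
model.fit(X, y, epochs=1, batch_size=32, verbose=0)
```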
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-01-18 07:34:41.001707: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", + "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2022-01-18 07:34:41.024991: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fac7ea34260 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", + "2022-01-18 07:34:41.025038: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + ] + } + ], + "source": [ + "step = 9\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb deleted file mode 100644 index f539e89..0000000 --- a/tutorials/pipelines/unstack_double_lstm_timeseries_classifier.ipynb +++ /dev/null @@ -1,2481 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# unstack_double_lstm_timeseries_classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from draco.demo import load_demo\n", - "\n", - "target_times, readings = load_demo()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_name = 'classes.unstack_double_lstm_timeseries_classifier'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from draco.pipeline import DracoPipeline\n", - "\n", - "pipeline = DracoPipeline(pipeline_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['pandas.DataFrame.resample',\n", - " 'pandas.DataFrame.unstack',\n", - " 'pandas.DataFrame.pop',\n", - " 'pandas.DataFrame.pop',\n", - " 'sklearn.impute.SimpleImputer',\n", - " 'sklearn.preprocessing.MinMaxScaler',\n", - " 'pandas.DataFrame',\n", - " 'pandas.DataFrame.set',\n", - " 'pandas.DataFrame.set',\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", - " 'keras.Sequential.DoubleLSTMTimeSeriesClassifier']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.template['primitives']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Step by Step execution" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input Data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 323.0\n", - "1 T001 S02 2013-01-10 320.0\n", - "2 T001 S03 2013-01-10 284.0\n", - "3 T001 S04 2013-01-10 348.0\n", - "4 T001 S05 2013-01-10 273.0" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", - "
" - ], - "text/plain": [ - " turbine_id cutoff_time target\n", - "0 T001 2013-01-12 0\n", - "1 T001 2013-01-13 0\n", - "2 T001 2013-01-14 0\n", - "3 T001 2013-01-15 1\n", - "4 T001 2013-01-16 0" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation (part of Draco Pipeline)\n", - "\n", - "* Input: target_times, readings, turbines\n", - "* Output: X, y, readings, turbines\n", - "* Effect: target_times has been split into X and y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.resample\n", - "\n", - "* Input: readings\n", - "* Output: readings (resampled)\n", - "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "context = pipeline.fit(target_times, readings, output_=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00313.333333
2013-01-10 01:00:00197.500000
2013-01-10 02:00:00248.166667
2013-01-10 03:00:00253.166667
2013-01-10 04:00:00305.000000
\n", - "
" - ], - "text/plain": [ - " value\n", - "turbine_id signal_id timestamp \n", - "T001 S01 2013-01-10 00:00:00 313.333333\n", - " 2013-01-10 01:00:00 197.500000\n", - " 2013-01-10 02:00:00 248.166667\n", - " 2013-01-10 03:00:00 253.166667\n", - " 2013-01-10 04:00:00 305.000000" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.unstack\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: readings (unstacked)\n", - "* Effect: readings have been unstacked" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "step = 1\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1T0012013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2T0012013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3T0012013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4T0012013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", - "0 T001 2013-01-10 00:00:00 313.333333 323.833333 336.000000 \n", - "1 T001 2013-01-10 01:00:00 197.500000 221.333333 216.000000 \n", - "2 T001 2013-01-10 02:00:00 248.166667 271.666667 277.500000 \n", - "3 T001 2013-01-10 03:00:00 253.166667 256.166667 242.666667 \n", - "4 T001 2013-01-10 04:00:00 305.000000 312.333333 346.166667 \n", - "\n", - " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", - "0 364.666667 286.500000 314.000000 243.166667 3.197980e+06 ... \n", - "1 260.666667 206.833333 235.833333 186.666667 3.198221e+06 ... \n", - "2 298.000000 233.666667 271.166667 216.333333 3.198448e+06 ... \n", - "3 265.333333 211.666667 226.666667 181.000000 3.198691e+06 ... \n", - "4 329.833333 280.666667 308.833333 271.833333 3.198978e+06 ... \n", - "\n", - " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", - "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", - "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", - "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", - "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", - "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", - "\n", - " value_S23 value_S24 value_S25 value_S26 \n", - "0 47.666667 52.666667 40.833333 357.333333 \n", - "1 34.500000 39.333333 31.166667 249.666667 \n", - "2 39.333333 45.500000 36.166667 297.666667 \n", - "3 35.333333 37.833333 30.333333 268.000000 \n", - "4 46.166667 49.666667 41.166667 341.833333 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.pop\n", - "\n", - "* Input: readings (unstacked)\n", - "* Output: readings (without turbine_id), turbine_id\n", - "* Effect: turbine_id has been popped from readings" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "step = 2\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 T001\n", - "1 T001\n", - "2 T001\n", - "3 T001\n", - "4 T001\n", - "Name: turbine_id, dtype: object" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['turbine_id'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
02013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.166667...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
12013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.666667...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
22013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.500000...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
32013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.333333...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
42013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.833333...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", - "

5 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " timestamp value_S01 value_S02 value_S03 value_S04 \\\n", - "0 2013-01-10 00:00:00 313.333333 323.833333 336.000000 364.666667 \n", - "1 2013-01-10 01:00:00 197.500000 221.333333 216.000000 260.666667 \n", - "2 2013-01-10 02:00:00 248.166667 271.666667 277.500000 298.000000 \n", - "3 2013-01-10 03:00:00 253.166667 256.166667 242.666667 265.333333 \n", - "4 2013-01-10 04:00:00 305.000000 312.333333 346.166667 329.833333 \n", - "\n", - " value_S05 value_S06 value_S07 value_S08 value_S09 ... \\\n", - "0 286.500000 314.000000 243.166667 3.197980e+06 695143.166667 ... \n", - "1 206.833333 235.833333 186.666667 3.198221e+06 695403.666667 ... \n", - "2 233.666667 271.166667 216.333333 3.198448e+06 695656.500000 ... \n", - "3 211.666667 226.666667 181.000000 3.198691e+06 695911.333333 ... \n", - "4 280.666667 308.833333 271.833333 3.198978e+06 696195.833333 ... \n", - "\n", - " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", - "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", - "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", - "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", - "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", - "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", - "\n", - " value_S23 value_S24 value_S25 value_S26 \n", - "0 47.666667 52.666667 40.833333 357.333333 \n", - "1 34.500000 39.333333 31.166667 249.666667 \n", - "2 39.333333 45.500000 36.166667 297.666667 \n", - "3 35.333333 37.833333 30.333333 268.000000 \n", - "4 46.166667 49.666667 41.166667 341.833333 \n", - "\n", - "[5 rows x 27 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.pop\n", - "\n", - "* Input: readings (without turbine_id)\n", - "* Output: readings (without timestamp), timestamp\n", - "* Effect: timestamp has been popped from readings" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "step = 3\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2013-01-10 00:00:00\n", - "1 2013-01-10 01:00:00\n", - "2 2013-01-10 02:00:00\n", - "3 2013-01-10 03:00:00\n", - "4 2013-01-10 04:00:00\n", - "Name: timestamp, dtype: datetime64[ns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['timestamp'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09value_S10...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.1666673.348384e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.6666673.348651e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.5000003.348910e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.3333333.349157e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.8333333.349452e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", - "0 313.333333 323.833333 336.000000 364.666667 286.500000 314.000000 \n", - "1 197.500000 221.333333 216.000000 260.666667 206.833333 235.833333 \n", - "2 248.166667 271.666667 277.500000 298.000000 233.666667 271.166667 \n", - "3 253.166667 256.166667 242.666667 265.333333 211.666667 226.666667 \n", - "4 305.000000 312.333333 346.166667 329.833333 280.666667 308.833333 \n", - "\n", - " value_S07 value_S08 value_S09 value_S10 ... value_S17 \\\n", - "0 243.166667 3.197980e+06 695143.166667 3.348384e+06 ... 10.383333 \n", - "1 186.666667 3.198221e+06 695403.666667 3.348651e+06 ... 8.666667 \n", - "2 216.333333 3.198448e+06 695656.500000 3.348910e+06 ... 8.833333 \n", - "3 181.000000 3.198691e+06 695911.333333 3.349157e+06 ... 8.433333 \n", - "4 271.833333 3.198978e+06 696195.833333 3.349452e+06 ... 9.083333 \n", - "\n", - " value_S18 value_S19 value_S20 value_S21 value_S22 value_S23 \\\n", - "0 3.131958e+06 52.666667 54.333333 56.166667 61.000000 47.666667 \n", - "1 3.133668e+06 33.166667 37.000000 36.166667 43.666667 34.500000 \n", - "2 3.135413e+06 41.500000 45.666667 46.500000 49.666667 39.333333 \n", - "3 3.137001e+06 42.333333 42.833333 40.500000 44.166667 35.333333 \n", - "4 3.138843e+06 50.500000 51.166667 55.500000 53.666667 46.166667 \n", - "\n", - " value_S24 value_S25 value_S26 \n", - "0 52.666667 40.833333 357.333333 \n", - "1 39.333333 31.166667 249.666667 \n", - "2 45.500000 36.166667 297.666667 \n", - "3 37.833333 30.333333 268.000000 \n", - "4 49.666667 41.166667 341.833333 \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## sklearn.impute.SimpleImputer\n", - "\n", - "* Input: readings (unstacked, no turbine_id, no timestamp)\n", - "* Output: readings (imputed, numpy array)\n", - "* Effect: readings have been imputed and converted to numpy array" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "step = 4\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[3.13333333e+02, 3.23833333e+02, 3.36000000e+02, 3.64666667e+02,\n", - " 2.86500000e+02, 3.14000000e+02, 2.43166667e+02, 3.19798000e+06,\n", - " 6.95143167e+05, 3.34838383e+06, 3.43692150e+06, 3.32248667e+06,\n", - " 3.35809000e+06, 3.22390150e+06, 7.95000000e+00, 5.85000000e+00,\n", - " 1.03833333e+01, 3.13195833e+06, 5.26666667e+01, 5.43333333e+01,\n", - " 5.61666667e+01, 6.10000000e+01, 4.76666667e+01, 5.26666667e+01,\n", - " 4.08333333e+01, 3.57333333e+02],\n", - " [1.97500000e+02, 2.21333333e+02, 2.16000000e+02, 2.60666667e+02,\n", - " 2.06833333e+02, 2.35833333e+02, 1.86666667e+02, 3.19822067e+06,\n", - " 6.95403667e+05, 3.34865117e+06, 3.43722283e+06, 3.32272200e+06,\n", - " 3.35834000e+06, 3.22409567e+06, 6.83333333e+00, 5.15000000e+00,\n", - " 8.66666667e+00, 3.13366817e+06, 
3.31666667e+01, 3.70000000e+01,\n", - " 3.61666667e+01, 4.36666667e+01, 3.45000000e+01, 3.93333333e+01,\n", - " 3.11666667e+01, 2.49666667e+02],\n", - " [2.48166667e+02, 2.71666667e+02, 2.77500000e+02, 2.98000000e+02,\n", - " 2.33666667e+02, 2.71166667e+02, 2.16333333e+02, 3.19844767e+06,\n", - " 6.95656500e+05, 3.34890967e+06, 3.43751900e+06, 3.32295950e+06,\n", - " 3.35862067e+06, 3.22432333e+06, 7.11666667e+00, 5.56666667e+00,\n", - " 8.83333333e+00, 3.13541283e+06, 4.15000000e+01, 4.56666667e+01,\n", - " 4.65000000e+01, 4.96666667e+01, 3.93333333e+01, 4.55000000e+01,\n", - " 3.61666667e+01, 2.97666667e+02],\n", - " [2.53166667e+02, 2.56166667e+02, 2.42666667e+02, 2.65333333e+02,\n", - " 2.11666667e+02, 2.26666667e+02, 1.81000000e+02, 3.19869117e+06,\n", - " 6.95911333e+05, 3.34915717e+06, 3.43778050e+06, 3.32316850e+06,\n", - " 3.35884883e+06, 3.22450217e+06, 6.71666667e+00, 5.16666667e+00,\n", - " 8.43333333e+00, 3.13700133e+06, 4.23333333e+01, 4.28333333e+01,\n", - " 4.05000000e+01, 4.41666667e+01, 3.53333333e+01, 3.78333333e+01,\n", - " 3.03333333e+01, 2.68000000e+02],\n", - " [3.05000000e+02, 3.12333333e+02, 3.46166667e+02, 3.29833333e+02,\n", - " 2.80666667e+02, 3.08833333e+02, 2.71833333e+02, 3.19897850e+06,\n", - " 6.96195833e+05, 3.34945200e+06, 3.43807767e+06, 3.32340933e+06,\n", - " 3.35910983e+06, 3.22471400e+06, 7.20000000e+00, 5.28333333e+00,\n", - " 9.08333333e+00, 3.13884333e+06, 5.05000000e+01, 5.11666667e+01,\n", - " 5.55000000e+01, 5.36666667e+01, 4.61666667e+01, 4.96666667e+01,\n", - " 4.11666667e+01, 3.41833333e+02]])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'][0:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## sklearn.preprocessing.MinMaxScaler\n", - "\n", - "* Input: (imputed, array)\n", - "* Output: readings (scaled, array)\n", - "* Effect: readings have been scaled to [-1, 1] range" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "step = 5\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[-0.26126126, -0.23706897, -0.20870076, -0.14106583, -0.32328767,\n", - " -0.25969448, -0.42198789, -1. , -1. , -1. ,\n", - " -1. , -1. , -1. , -1. 
, -0.11007463,\n", - " -0.16824645, -0.10424155, -0.37397741, -0.25233645, -0.22716628,\n", - " -0.20140515, -0.13481829, -0.32239156, -0.25380117, -0.4182243 ,\n", - " -0.25697453],\n", - " [-0.53349001, -0.47805643, -0.49088771, -0.38557994, -0.51037182,\n", - " -0.44339992, -0.55438391, -0.99983031, -0.99982547, -0.99982499,\n", - " -0.99980741, -0.9998428 , -0.99983779, -0.99986887, -0.23507463,\n", - " -0.26777251, -0.25233645, -0.37363511, -0.52570093, -0.470726 ,\n", - " -0.4824356 , -0.37866354, -0.50762016, -0.44093567, -0.55373832,\n", - " -0.48085254],\n", - " [-0.41441441, -0.35971787, -0.3462669 , -0.29780564, -0.44735812,\n", - " -0.36036036, -0.48486624, -0.99967026, -0.99965608, -0.99965576,\n", - " -0.99961813, -0.99968416, -0.99965569, -0.99971512, -0.20335821,\n", - " -0.20853081, -0.2379583 , -0.37328583, -0.4088785 , -0.34894614,\n", - " -0.33723653, -0.29425557, -0.43962485, -0.35438596, -0.48364486,\n", - " -0.38104315],\n", - " [-0.40266353, -0.39615987, -0.4281795 , -0.37460815, -0.49902153,\n", - " -0.4649432 , -0.56766257, -0.99949857, -0.99948535, -0.99949373,\n", - " -0.999451 , -0.99954455, -0.99950765, -0.99959435, -0.24813433,\n", - " -0.26540284, -0.27246585, -0.37296782, -0.39719626, -0.38875878,\n", - " -0.42154567, -0.37162954, -0.49589683, -0.4619883 , -0.56542056,\n", - " -0.4427309 ],\n", - " [-0.28084606, -0.26410658, -0.18479326, -0.22296238, -0.3369863 ,\n", - " -0.27183705, -0.35481351, -0.99929598, -0.99929474, -0.99930071,\n", - " -0.99926107, -0.99938368, -0.99933831, -0.9994513 , -0.19402985,\n", - " -0.24881517, -0.21639109, -0.37259906, -0.28271028, -0.27166276,\n", - " -0.21077283, -0.23798359, -0.34349355, -0.29590643, -0.4135514 ,\n", - " -0.28920464]])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'][0:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame\n", - "\n", - "* Input: readings (scaled, array)\n", - "* Output: readings (dataframe)\n", - "* Effect: readings have been converted into a dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "step = 6\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...16171819202122232425
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.104242-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.252336-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.237958-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.272466-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.216391-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", - "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", - "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", - "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", - "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", - "\n", - " 7 8 9 ... 16 17 18 19 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.104242 -0.373977 -0.252336 -0.227166 \n", - "1 -0.999830 -0.999825 -0.999825 ... -0.252336 -0.373635 -0.525701 -0.470726 \n", - "2 -0.999670 -0.999656 -0.999656 ... -0.237958 -0.373286 -0.408879 -0.348946 \n", - "3 -0.999499 -0.999485 -0.999494 ... -0.272466 -0.372968 -0.397196 -0.388759 \n", - "4 -0.999296 -0.999295 -0.999301 ... -0.216391 -0.372599 -0.282710 -0.271663 \n", - "\n", - " 20 21 22 23 24 25 \n", - "0 -0.201405 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 \n", - "1 -0.482436 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 \n", - "2 -0.337237 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 \n", - "3 -0.421546 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 \n", - "4 -0.210773 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.set\n", - "\n", - "* Input: readings (dataframe)\n", - "* Output: readings (dataframe with turbine_id)\n", - "* Effect: turbine_id has been set as a readings column" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "step = 7\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...171819202122232425turbine_id
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T001
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T001
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T001
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T001
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T001
\n", - "

5 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", - "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", - "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", - "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", - "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", - "\n", - " 7 8 9 ... 17 18 19 20 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.373977 -0.252336 -0.227166 -0.201405 \n", - "1 -0.999830 -0.999825 -0.999825 ... -0.373635 -0.525701 -0.470726 -0.482436 \n", - "2 -0.999670 -0.999656 -0.999656 ... -0.373286 -0.408879 -0.348946 -0.337237 \n", - "3 -0.999499 -0.999485 -0.999494 ... -0.372968 -0.397196 -0.388759 -0.421546 \n", - "4 -0.999296 -0.999295 -0.999301 ... -0.372599 -0.282710 -0.271663 -0.210773 \n", - "\n", - " 21 22 23 24 25 turbine_id \n", - "0 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 T001 \n", - "1 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 T001 \n", - "2 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 T001 \n", - "3 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 T001 \n", - "4 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 T001 \n", - "\n", - "[5 rows x 27 columns]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.set\n", - "\n", - "* Input: readings (dataframe with turbine_id)\n", - "* Output: readings (dataframe with turbine_id and timestamp)\n", - "* Effect: timestamp has been set as a readings column" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "step = 8\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...1819202122232425turbine_idtimestamp
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T0012013-01-10 00:00:00
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T0012013-01-10 01:00:00
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T0012013-01-10 02:00:00
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T0012013-01-10 03:00:00
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T0012013-01-10 04:00:00
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", - "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", - "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", - "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", - "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", - "\n", - " 7 8 9 ... 18 19 20 21 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.252336 -0.227166 -0.201405 -0.134818 \n", - "1 -0.999830 -0.999825 -0.999825 ... -0.525701 -0.470726 -0.482436 -0.378664 \n", - "2 -0.999670 -0.999656 -0.999656 ... -0.408879 -0.348946 -0.337237 -0.294256 \n", - "3 -0.999499 -0.999485 -0.999494 ... -0.397196 -0.388759 -0.421546 -0.371630 \n", - "4 -0.999296 -0.999295 -0.999301 ... -0.282710 -0.271663 -0.210773 -0.237984 \n", - "\n", - " 22 23 24 25 turbine_id timestamp \n", - "0 -0.322392 -0.253801 -0.418224 -0.256975 T001 2013-01-10 00:00:00 \n", - "1 -0.507620 -0.440936 -0.553738 -0.480853 T001 2013-01-10 01:00:00 \n", - "2 -0.439625 -0.354386 -0.483645 -0.381043 T001 2013-01-10 02:00:00 \n", - "3 -0.495897 -0.461988 -0.565421 -0.442731 T001 2013-01-10 03:00:00 \n", - "4 -0.343494 -0.295906 -0.413551 -0.289205 T001 2013-01-10 04:00:00 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", - "\n", - "* Input: X, readings (dataframe with turbine_id and timestamp)\n", - "* Output: X\n", - "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", - " (window_size x num_signals) for each one of the target times." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline._pipeline.get_hyperparameters()[\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "step = 9\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(8521, 28)" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(353,)" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['y'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(353, 24, 26)" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[-0.58793576, -0.60305643, -0.63981971, -0.61481191, -0.69823875,\n", - " -0.65021543, -0.68912322, -0.99436914, -0.99439755, -0.99454249,\n", - " -0.99446788, -0.99476185, -0.99490997, -0.99529511, -0.34701493,\n", - " -0.33886256, -0.33860532, -0.36301186, -0.57943925, -0.59250585,\n", - " -0.6323185 , -0.60609613, -0.69284877, -0.64444444, -0.68691589,\n", - " -0.63853752],\n", - " [-0.56600078, -0.5846395 , -0.63002156, -0.61559561, -0.70880626,\n", - " -0.66392479, -0.69732474, -0.9942427 , -0.99427986, -0.9944408 ,\n", - " -0.99436498, -0.99468147, -0.99482011, -0.99521249, -0.33955224,\n", - " -0.31516588, -0.38892883, -0.36280656, -0.55841121, -0.57611241,\n", - " -0.62295082, -0.61078546, -0.70222743, -0.65847953, -0.69392523,\n", - " -0.63645815],\n", - " [-0.64081473, -0.64184953, -0.67038997, -0.63597179, -0.71350294,\n", - " -0.65844105, -0.66764304, -0.99412236, -0.99416864, -0.99434228,\n", - " -0.99426059, -0.99459663, -0.99472365, -0.99511795, -0.34328358,\n", - " -0.30094787, -0.36304817, -0.36259859, -0.63317757, -0.6323185 ,\n", - " -0.66042155, -0.62954279, -0.70926143, -0.65380117, -0.66588785,\n", - " -0.66002426],\n", - " [-0.73678026, -0.72139498, -0.72800314, -0.69239812, -0.71350294,\n", - " -0.68233451, -0.69732474, -0.99403811, -0.99408512, -0.9942623 ,\n", - " -0.99417111, -0.99451525, -0.99463206, -0.9950315 , -0.40671642,\n", - " -0.36018957, -0.44644141, -0.36242395, -0.72897196, -0.71194379,\n", - " -0.71896956, -0.68347011, -0.70926143, -0.6748538 , -0.69392523,\n", - " -0.71027552],\n", - " [-0.75401488, -0.74333856, -0.75112679, 
-0.71590909, -0.76555773,\n", - " -0.73599687, -0.75278266, -0.99395808, -0.99400684, -0.99419094,\n", - " -0.99409367, -0.99444556, -0.99455517, -0.99495418, -0.43656716,\n", - " -0.3957346 , -0.465133 , -0.36226933, -0.7453271 , -0.735363 ,\n", - " -0.74004684, -0.70926143, -0.76084408, -0.73099415, -0.75 ,\n", - " -0.7463178 ],\n", - " [-0.79866823, -0.76684953, -0.7558299 , -0.72688088, -0.76125245,\n", - " -0.75714845, -0.78363601, -0.99389098, -0.99393583, -0.99411958,\n", - " -0.99401538, -0.99437709, -0.99448423, -0.99489036, -0.43843284,\n", - " -0.37914692, -0.49388929, -0.36212623, -0.78971963, -0.75644028,\n", - " -0.7470726 , -0.72098476, -0.75615475, -0.7497076 , -0.78037383,\n", - " -0.76572518],\n", - " [-0.84919702, -0.83855799, -0.82245738, -0.78134796, -0.75225049,\n", - " -0.70661966, -0.65787932, -0.99384186, -0.99388279, -0.9940635 ,\n", - " -0.99395157, -0.9943113 , -0.99441264, -0.99481202, -0.51679104,\n", - " -0.50473934, -0.53414809, -0.36199904, -0.8411215 , -0.83138173,\n", - " -0.81264637, -0.77256741, -0.74677608, -0.70292398, -0.65654206,\n", - " -0.77438919],\n", - " [-0.69134352, -0.705721 , -0.73584166, -0.70297806, -0.75225049,\n", - " -0.72659616, -0.71724273, -0.99377229, -0.99381646, -0.99400032,\n", - " -0.99387925, -0.99423682, -0.99433003, -0.99471624, -0.43843284,\n", - " -0.40521327, -0.48094896, -0.36184615, -0.68457944, -0.69555035,\n", - " -0.72599532, -0.6975381 , -0.74677608, -0.71929825, -0.71261682,\n", - " -0.71893953],\n", - " [-0.84488837, -0.82915361, -0.83578287, -0.81896552, -0.86105675,\n", - " -0.8613396 , -0.86330795, -0.99369779, -0.99374656, -0.99393715,\n", - " -0.99381182, -0.99418494, -0.99427639, -0.99466379, -0.49253731,\n", - " -0.48104265, -0.51545651, -0.36172116, -0.8364486 , -0.81967213,\n", - " -0.82435597, -0.81008206, -0.85463072, -0.85497076, -0.86214953,\n", - " -0.84889967],\n", - " [-0.77908343, -0.78761755, -0.78757594, -0.78918495, -0.82348337,\n", - " -0.82491187, -0.85276313, -0.99365725, -0.99370625, -0.99389819,\n", - " -0.99377113, -0.99415254, -0.99424222, -0.99463329, -0.52798507,\n", - " -0.6042654 , -0.51545651, -0.36164779, -0.77336449, -0.77985948,\n", - " -0.78220141, -0.78429074, -0.86635404, -0.82222222, -0.85046729,\n", - " -0.81562987],\n", - " [-0.70544458, -0.64733542, -0.64844209, -0.61833856, -0.6481409 ,\n", - " -0.66392479, -0.71646163, -0.99356747, -0.99360832, -0.99380327,\n", - " -0.99367558, -0.99407272, -0.99415647, -0.99456035, -0.36567164,\n", - " -0.4549763 , -0.34291876, -0.36146698, -0.70560748, -0.63934426,\n", - " -0.63934426, -0.62016413, -0.64830012, -0.65847953, -0.72663551,\n", - " -0.66868827],\n", - " [-0.70387779, -0.67202194, -0.69508132, -0.72413793, -0.73228963,\n", - " -0.72816295, -0.72310096, -0.99348204, -0.99351955, -0.99372023,\n", - " -0.99359367, -0.99399256, -0.99407882, -0.99449203, -0.38432836,\n", - " -0.58530806, -0.33141625, -0.36130226, -0.69392523, -0.66042155,\n", - " -0.68384075, -0.71629543, -0.72801876, -0.72163743, -0.72196262,\n", - " -0.7113152 ],\n", - " [-0.8515472 , -0.81073668, -0.776602 , -0.76724138, -0.78277886,\n", - " -0.75832354, -0.74262839, -0.99341682, -0.99344607, -0.99364669,\n", - " -0.99352762, -0.99392743, -0.99401037, -0.99441763, -0.44029851,\n", - " -0.5521327 , -0.38461538, -0.36116102, -0.84345794, -0.80327869,\n", - " -0.76814988, -0.76084408, -0.77725674, -0.75204678, -0.73831776,\n", - " -0.7865188 ],\n", - " [-0.80258519, -0.83659875, -0.83499902, -0.79741379, -0.80821918,\n", - " -0.81629456, -0.79379028, 
-0.99336347, -0.99339091, -0.99358745,\n", - " -0.99346147, -0.9938642 , -0.99394733, -0.99434605, -0.44962687,\n", - " -0.6563981 , -0.34579439, -0.36103606, -0.79439252, -0.82669789,\n", - " -0.82669789, -0.78898007, -0.80304807, -0.81052632, -0.79205607,\n", - " -0.81632299],\n", - " [-0.83313749, -0.87539185, -0.90241035, -0.88440439, -0.86771037,\n", - " -0.87935762, -0.87580551, -0.99331764, -0.99335898, -0.99355602,\n", - " -0.99342259, -0.99382267, -0.99390959, -0.99430418, -0.54291045,\n", - " -0.72274882, -0.42918763, -0.36096002, -0.82943925, -0.87119438,\n", - " -0.89461358, -0.87573271, -0.86166471, -0.87134503, -0.87383178,\n", - " -0.88078323],\n", - " [-0.56678418, -0.60031348, -0.64295512, -0.78409091, -0.76164384,\n", - " -0.78535057, -0.82464362, -0.99321481, -0.99327557, -0.99349034,\n", - " -0.99337881, -0.9937915 , -0.99387347, -0.99427367, -0.32835821,\n", - " -0.47630332, -0.25808771, -0.36084678, -0.56074766, -0.59250585,\n", - " -0.6323185 , -0.77960141, -0.84759672, -0.78947368, -0.8364486 ,\n", - " -0.72621729],\n", - " [-0.77007442, -0.81230408, -0.83186361, -0.85540752, -0.85870841,\n", - " -0.86486486, -0.847686 , -0.99311634, -0.99319338, -0.99341516,\n", - " -0.99332651, -0.99374196, -0.99381551, -0.99422246, -0.46641791,\n", - " -0.65165877, -0.39324227, -0.36071245, -0.76168224, -0.80093677,\n", - " -0.82201405, -0.84759672, -0.85463072, -0.85730994, -0.84579439,\n", - " -0.83780974],\n", - " [-0.87622405, -0.92163009, -0.91377621, -0.89224138, -0.84540117,\n", - " -0.83431257, -0.82112869, -0.99306816, -0.99315821, -0.99338734,\n", - " -0.99329935, -0.99370611, -0.99377885, -0.9941789 , -0.55783582,\n", - " -0.65402844, -0.50970525, -0.36064058, -0.86682243, -0.91334895,\n", - " -0.90632319, -0.88745604, -0.84056272, -0.82923977, -0.81775701,\n", - " -0.87731762],\n", - " [-0.82843713, -0.83111285, -0.84166177, -0.8322884 , -0.84579256,\n", - " -0.8515472 , -0.86057411, -0.99302656, -0.99312426, -0.99335155,\n", - " -0.99325919, -0.99365991, -0.99373278, -0.99413129, -0.50559701,\n", - " -0.53791469, -0.52120776, -0.36055736, -0.82242991, -0.82201405,\n", - " -0.83138173, -0.82415006, -0.84056272, -0.84327485, -0.85747664,\n", - " -0.84508751],\n", - " [-0.74539757, -0.73824451, -0.76484421, -0.72100313, -0.73228963,\n", - " -0.70975323, -0.739504 , -0.99296569, -0.99306553, -0.99329699,\n", - " -0.9932005 , -0.99360224, -0.99367493, -0.99407862, -0.45149254,\n", - " -0.46208531, -0.48382459, -0.36044105, -0.73598131, -0.73067916,\n", - " -0.75644028, -0.71629543, -0.72801876, -0.70526316, -0.73831776,\n", - " -0.73696067],\n", - " [-0.40814728, -0.4596395 , -0.51087596, -0.46316614, -0.54598826,\n", - " -0.50607129, -0.57039641, -0.99283748, -0.99294147, -0.9931881 ,\n", - " -0.99308418, -0.99349681, -0.99356041, -0.99398047, -0.30597015,\n", - " -0.29383886, -0.34867002, -0.36020709, -0.46728972, -0.470726 ,\n", - " -0.5175644 , -0.48651817, -0.55685815, -0.51812865, -0.59579439,\n", - " -0.5179345 ],\n", - " [-0.47591069, -0.45219436, -0.48579267, -0.48981191, -0.57847358,\n", - " -0.54876616, -0.61882445, -0.99268659, -0.99280044, -0.99306033,\n", - " -0.99295359, -0.99338192, -0.99344287, -0.9938794 , -0.30223881,\n", - " -0.33649289, -0.32278936, -0.35994787, -0.49065421, -0.46370023,\n", - " -0.4941452 , -0.49589683, -0.58264947, -0.55321637, -0.62850467,\n", - " -0.53110379],\n", - " [-0.26792009, -0.27115987, -0.30080345, -0.24412226, -0.34246575,\n", - " -0.30434783, -0.40285101, -0.99250927, -0.99261854, -0.99288914,\n", - " -0.99278188, 
-0.99322495, -0.99327569, -0.9937324 , -0.22947761,\n", - " -0.28909953, -0.26096334, -0.35960139, -0.33878505, -0.29976581,\n", - " -0.32786885, -0.2919109 , -0.38100821, -0.32865497, -0.42523364,\n", - " -0.3394559 ],\n", - " [-0.31374853, -0.26449843, -0.2941407 , -0.23315047, -0.36516634,\n", - " -0.35957697, -0.44112478, -0.9923035 , -0.99241264, -0.99269787,\n", - " -0.99258055, -0.99304482, -0.99309553, -0.99356987, -0.2108209 ,\n", - " -0.21563981, -0.23652049, -0.35921021, -0.30607477, -0.26229508,\n", - " -0.29039813, -0.23563892, -0.35990621, -0.35204678, -0.43925234,\n", - " -0.32004852]])" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'][0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## keras.Sequential.DoubleLSTMTimeSeriesClassifier\n", - "\n", - "* Input: X, y\n", - "* Output: \n", - "* Effect: DoubleLSTM has been fitted." - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "step = 10\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb b/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb deleted file mode 100644 index 1a10480..0000000 --- a/tutorials/pipelines/unstack_lstm_timeseries_classifier.ipynb +++ /dev/null @@ -1,2355 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# unstack_lstm_timeseries_classifier" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from draco.demo import load_demo\n", - "\n", - "target_times, readings = load_demo()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_name = 'classes.unstack_lstm_timeseries_classifier'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "from draco.pipeline import DracoPipeline\n", - "\n", - "pipeline = DracoPipeline(pipeline_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['pandas.DataFrame.resample',\n", - " 'pandas.DataFrame.unstack',\n", - " 'pandas.DataFrame.pop',\n", - " 'pandas.DataFrame.pop',\n", - " 'sklearn.impute.SimpleImputer',\n", - " 'sklearn.preprocessing.MinMaxScaler',\n", - " 'pandas.DataFrame',\n", - " 'pandas.DataFrame.set',\n", - " 'pandas.DataFrame.set',\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", - " 'keras.Sequential.LSTMTimeSeriesClassifier']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.template['primitives']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Step by Step execution" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input Data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - 
"outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 323.0\n", - "1 T001 S02 2013-01-10 320.0\n", - "2 T001 S03 2013-01-10 284.0\n", - "3 T001 S04 2013-01-10 348.0\n", - "4 T001 S05 2013-01-10 273.0" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", - "
" - ], - "text/plain": [ - " turbine_id cutoff_time target\n", - "0 T001 2013-01-12 0\n", - "1 T001 2013-01-13 0\n", - "2 T001 2013-01-14 0\n", - "3 T001 2013-01-15 1\n", - "4 T001 2013-01-16 0" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation (part of Draco Pipeline)\n", - "\n", - "* Input: target_times, readings, turbines\n", - "* Output: X, y, readings, turbines\n", - "* Effect: target_times has been split into X and y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.resample\n", - "\n", - "* Input: readings\n", - "* Output: readings (resampled)\n", - "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "context = pipeline.fit(target_times, readings, output_=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00313.333333
2013-01-10 01:00:00197.500000
2013-01-10 02:00:00248.166667
2013-01-10 03:00:00253.166667
2013-01-10 04:00:00305.000000
\n", - "
" - ], - "text/plain": [ - " value\n", - "turbine_id signal_id timestamp \n", - "T001 S01 2013-01-10 00:00:00 313.333333\n", - " 2013-01-10 01:00:00 197.500000\n", - " 2013-01-10 02:00:00 248.166667\n", - " 2013-01-10 03:00:00 253.166667\n", - " 2013-01-10 04:00:00 305.000000" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.unstack\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: readings (unstacked)\n", - "* Effect: readings have been unstacked" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "step = 1\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1T0012013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2T0012013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3T0012013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4T0012013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", - "0 T001 2013-01-10 00:00:00 313.333333 323.833333 336.000000 \n", - "1 T001 2013-01-10 01:00:00 197.500000 221.333333 216.000000 \n", - "2 T001 2013-01-10 02:00:00 248.166667 271.666667 277.500000 \n", - "3 T001 2013-01-10 03:00:00 253.166667 256.166667 242.666667 \n", - "4 T001 2013-01-10 04:00:00 305.000000 312.333333 346.166667 \n", - "\n", - " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", - "0 364.666667 286.500000 314.000000 243.166667 3.197980e+06 ... \n", - "1 260.666667 206.833333 235.833333 186.666667 3.198221e+06 ... \n", - "2 298.000000 233.666667 271.166667 216.333333 3.198448e+06 ... \n", - "3 265.333333 211.666667 226.666667 181.000000 3.198691e+06 ... \n", - "4 329.833333 280.666667 308.833333 271.833333 3.198978e+06 ... \n", - "\n", - " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", - "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", - "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", - "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", - "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", - "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", - "\n", - " value_S23 value_S24 value_S25 value_S26 \n", - "0 47.666667 52.666667 40.833333 357.333333 \n", - "1 34.500000 39.333333 31.166667 249.666667 \n", - "2 39.333333 45.500000 36.166667 297.666667 \n", - "3 35.333333 37.833333 30.333333 268.000000 \n", - "4 46.166667 49.666667 41.166667 341.833333 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.pop\n", - "\n", - "* Input: readings (unstacked)\n", - "* Output: readings (without turbine_id), turbine_id\n", - "* Effect: turbine_id has been popped from readings" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "step = 2\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 T001\n", - "1 T001\n", - "2 T001\n", - "3 T001\n", - "4 T001\n", - "Name: turbine_id, dtype: object" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['turbine_id'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
timestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
02013-01-10 00:00:00313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.166667...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
12013-01-10 01:00:00197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.666667...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
22013-01-10 02:00:00248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.500000...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
32013-01-10 03:00:00253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.333333...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
42013-01-10 04:00:00305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.833333...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", - "

5 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " timestamp value_S01 value_S02 value_S03 value_S04 \\\n", - "0 2013-01-10 00:00:00 313.333333 323.833333 336.000000 364.666667 \n", - "1 2013-01-10 01:00:00 197.500000 221.333333 216.000000 260.666667 \n", - "2 2013-01-10 02:00:00 248.166667 271.666667 277.500000 298.000000 \n", - "3 2013-01-10 03:00:00 253.166667 256.166667 242.666667 265.333333 \n", - "4 2013-01-10 04:00:00 305.000000 312.333333 346.166667 329.833333 \n", - "\n", - " value_S05 value_S06 value_S07 value_S08 value_S09 ... \\\n", - "0 286.500000 314.000000 243.166667 3.197980e+06 695143.166667 ... \n", - "1 206.833333 235.833333 186.666667 3.198221e+06 695403.666667 ... \n", - "2 233.666667 271.166667 216.333333 3.198448e+06 695656.500000 ... \n", - "3 211.666667 226.666667 181.000000 3.198691e+06 695911.333333 ... \n", - "4 280.666667 308.833333 271.833333 3.198978e+06 696195.833333 ... \n", - "\n", - " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", - "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", - "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", - "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", - "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", - "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", - "\n", - " value_S23 value_S24 value_S25 value_S26 \n", - "0 47.666667 52.666667 40.833333 357.333333 \n", - "1 34.500000 39.333333 31.166667 249.666667 \n", - "2 39.333333 45.500000 36.166667 297.666667 \n", - "3 35.333333 37.833333 30.333333 268.000000 \n", - "4 46.166667 49.666667 41.166667 341.833333 \n", - "\n", - "[5 rows x 27 columns]" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.pop\n", - "\n", - "* Input: readings (without turbine_id)\n", - "* Output: readings (without timestamp), timestamp\n", - "* Effect: timestamp has been popped from readings" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "step = 3\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2013-01-10 00:00:00\n", - "1 2013-01-10 01:00:00\n", - "2 2013-01-10 02:00:00\n", - "3 2013-01-10 03:00:00\n", - "4 2013-01-10 04:00:00\n", - "Name: timestamp, dtype: datetime64[ns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['timestamp'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08value_S09value_S10...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0313.333333323.833333336.000000364.666667286.500000314.000000243.1666673.197980e+06695143.1666673.348384e+06...10.3833333.131958e+0652.66666754.33333356.16666761.00000047.66666752.66666740.833333357.333333
1197.500000221.333333216.000000260.666667206.833333235.833333186.6666673.198221e+06695403.6666673.348651e+06...8.6666673.133668e+0633.16666737.00000036.16666743.66666734.50000039.33333331.166667249.666667
2248.166667271.666667277.500000298.000000233.666667271.166667216.3333333.198448e+06695656.5000003.348910e+06...8.8333333.135413e+0641.50000045.66666746.50000049.66666739.33333345.50000036.166667297.666667
3253.166667256.166667242.666667265.333333211.666667226.666667181.0000003.198691e+06695911.3333333.349157e+06...8.4333333.137001e+0642.33333342.83333340.50000044.16666735.33333337.83333330.333333268.000000
4305.000000312.333333346.166667329.833333280.666667308.833333271.8333333.198978e+06696195.8333333.349452e+06...9.0833333.138843e+0650.50000051.16666755.50000053.66666746.16666749.66666741.166667341.833333
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", - "0 313.333333 323.833333 336.000000 364.666667 286.500000 314.000000 \n", - "1 197.500000 221.333333 216.000000 260.666667 206.833333 235.833333 \n", - "2 248.166667 271.666667 277.500000 298.000000 233.666667 271.166667 \n", - "3 253.166667 256.166667 242.666667 265.333333 211.666667 226.666667 \n", - "4 305.000000 312.333333 346.166667 329.833333 280.666667 308.833333 \n", - "\n", - " value_S07 value_S08 value_S09 value_S10 ... value_S17 \\\n", - "0 243.166667 3.197980e+06 695143.166667 3.348384e+06 ... 10.383333 \n", - "1 186.666667 3.198221e+06 695403.666667 3.348651e+06 ... 8.666667 \n", - "2 216.333333 3.198448e+06 695656.500000 3.348910e+06 ... 8.833333 \n", - "3 181.000000 3.198691e+06 695911.333333 3.349157e+06 ... 8.433333 \n", - "4 271.833333 3.198978e+06 696195.833333 3.349452e+06 ... 9.083333 \n", - "\n", - " value_S18 value_S19 value_S20 value_S21 value_S22 value_S23 \\\n", - "0 3.131958e+06 52.666667 54.333333 56.166667 61.000000 47.666667 \n", - "1 3.133668e+06 33.166667 37.000000 36.166667 43.666667 34.500000 \n", - "2 3.135413e+06 41.500000 45.666667 46.500000 49.666667 39.333333 \n", - "3 3.137001e+06 42.333333 42.833333 40.500000 44.166667 35.333333 \n", - "4 3.138843e+06 50.500000 51.166667 55.500000 53.666667 46.166667 \n", - "\n", - " value_S24 value_S25 value_S26 \n", - "0 52.666667 40.833333 357.333333 \n", - "1 39.333333 31.166667 249.666667 \n", - "2 45.500000 36.166667 297.666667 \n", - "3 37.833333 30.333333 268.000000 \n", - "4 49.666667 41.166667 341.833333 \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## sklearn.impute.SimpleImputer\n", - "\n", - "* Input: readings (unstacked, no turbine_id, no timestamp)\n", - "* Output: readings (imputed, numpy array)\n", - "* Effect: readings have been imputed and converted to numpy array" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "step = 4\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[3.13333333e+02, 3.23833333e+02, 3.36000000e+02, 3.64666667e+02,\n", - " 2.86500000e+02, 3.14000000e+02, 2.43166667e+02, 3.19798000e+06,\n", - " 6.95143167e+05, 3.34838383e+06, 3.43692150e+06, 3.32248667e+06,\n", - " 3.35809000e+06, 3.22390150e+06, 7.95000000e+00, 5.85000000e+00,\n", - " 1.03833333e+01, 3.13195833e+06, 5.26666667e+01, 5.43333333e+01,\n", - " 5.61666667e+01, 6.10000000e+01, 4.76666667e+01, 5.26666667e+01,\n", - " 4.08333333e+01, 3.57333333e+02],\n", - " [1.97500000e+02, 2.21333333e+02, 2.16000000e+02, 2.60666667e+02,\n", - " 2.06833333e+02, 2.35833333e+02, 1.86666667e+02, 3.19822067e+06,\n", - " 6.95403667e+05, 3.34865117e+06, 3.43722283e+06, 3.32272200e+06,\n", - " 3.35834000e+06, 3.22409567e+06, 6.83333333e+00, 5.15000000e+00,\n", - " 8.66666667e+00, 3.13366817e+06, 
3.31666667e+01, 3.70000000e+01,\n", - " 3.61666667e+01, 4.36666667e+01, 3.45000000e+01, 3.93333333e+01,\n", - " 3.11666667e+01, 2.49666667e+02],\n", - " [2.48166667e+02, 2.71666667e+02, 2.77500000e+02, 2.98000000e+02,\n", - " 2.33666667e+02, 2.71166667e+02, 2.16333333e+02, 3.19844767e+06,\n", - " 6.95656500e+05, 3.34890967e+06, 3.43751900e+06, 3.32295950e+06,\n", - " 3.35862067e+06, 3.22432333e+06, 7.11666667e+00, 5.56666667e+00,\n", - " 8.83333333e+00, 3.13541283e+06, 4.15000000e+01, 4.56666667e+01,\n", - " 4.65000000e+01, 4.96666667e+01, 3.93333333e+01, 4.55000000e+01,\n", - " 3.61666667e+01, 2.97666667e+02],\n", - " [2.53166667e+02, 2.56166667e+02, 2.42666667e+02, 2.65333333e+02,\n", - " 2.11666667e+02, 2.26666667e+02, 1.81000000e+02, 3.19869117e+06,\n", - " 6.95911333e+05, 3.34915717e+06, 3.43778050e+06, 3.32316850e+06,\n", - " 3.35884883e+06, 3.22450217e+06, 6.71666667e+00, 5.16666667e+00,\n", - " 8.43333333e+00, 3.13700133e+06, 4.23333333e+01, 4.28333333e+01,\n", - " 4.05000000e+01, 4.41666667e+01, 3.53333333e+01, 3.78333333e+01,\n", - " 3.03333333e+01, 2.68000000e+02],\n", - " [3.05000000e+02, 3.12333333e+02, 3.46166667e+02, 3.29833333e+02,\n", - " 2.80666667e+02, 3.08833333e+02, 2.71833333e+02, 3.19897850e+06,\n", - " 6.96195833e+05, 3.34945200e+06, 3.43807767e+06, 3.32340933e+06,\n", - " 3.35910983e+06, 3.22471400e+06, 7.20000000e+00, 5.28333333e+00,\n", - " 9.08333333e+00, 3.13884333e+06, 5.05000000e+01, 5.11666667e+01,\n", - " 5.55000000e+01, 5.36666667e+01, 4.61666667e+01, 4.96666667e+01,\n", - " 4.11666667e+01, 3.41833333e+02]])" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'][0:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## sklearn.preprocessing.MinMaxScaler\n", - "\n", - "* Input: (imputed, array)\n", - "* Output: readings (scaled, array)\n", - "* Effect: readings have been scaled to [-1, 1] range" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "step = 5\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[-0.26126126, -0.23706897, -0.20870076, -0.14106583, -0.32328767,\n", - " -0.25969448, -0.42198789, -1. , -1. , -1. ,\n", - " -1. , -1. , -1. , -1. 
, -0.11007463,\n", - " -0.16824645, -0.10424155, -0.37397741, -0.25233645, -0.22716628,\n", - " -0.20140515, -0.13481829, -0.32239156, -0.25380117, -0.4182243 ,\n", - " -0.25697453],\n", - " [-0.53349001, -0.47805643, -0.49088771, -0.38557994, -0.51037182,\n", - " -0.44339992, -0.55438391, -0.99983031, -0.99982547, -0.99982499,\n", - " -0.99980741, -0.9998428 , -0.99983779, -0.99986887, -0.23507463,\n", - " -0.26777251, -0.25233645, -0.37363511, -0.52570093, -0.470726 ,\n", - " -0.4824356 , -0.37866354, -0.50762016, -0.44093567, -0.55373832,\n", - " -0.48085254],\n", - " [-0.41441441, -0.35971787, -0.3462669 , -0.29780564, -0.44735812,\n", - " -0.36036036, -0.48486624, -0.99967026, -0.99965608, -0.99965576,\n", - " -0.99961813, -0.99968416, -0.99965569, -0.99971512, -0.20335821,\n", - " -0.20853081, -0.2379583 , -0.37328583, -0.4088785 , -0.34894614,\n", - " -0.33723653, -0.29425557, -0.43962485, -0.35438596, -0.48364486,\n", - " -0.38104315],\n", - " [-0.40266353, -0.39615987, -0.4281795 , -0.37460815, -0.49902153,\n", - " -0.4649432 , -0.56766257, -0.99949857, -0.99948535, -0.99949373,\n", - " -0.999451 , -0.99954455, -0.99950765, -0.99959435, -0.24813433,\n", - " -0.26540284, -0.27246585, -0.37296782, -0.39719626, -0.38875878,\n", - " -0.42154567, -0.37162954, -0.49589683, -0.4619883 , -0.56542056,\n", - " -0.4427309 ],\n", - " [-0.28084606, -0.26410658, -0.18479326, -0.22296238, -0.3369863 ,\n", - " -0.27183705, -0.35481351, -0.99929598, -0.99929474, -0.99930071,\n", - " -0.99926107, -0.99938368, -0.99933831, -0.9994513 , -0.19402985,\n", - " -0.24881517, -0.21639109, -0.37259906, -0.28271028, -0.27166276,\n", - " -0.21077283, -0.23798359, -0.34349355, -0.29590643, -0.4135514 ,\n", - " -0.28920464]])" - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'][0:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame\n", - "\n", - "* Input: readings (scaled, array)\n", - "* Output: readings (dataframe)\n", - "* Effect: readings have been converted into a dataframe" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "step = 6\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...16171819202122232425
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.104242-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.252336-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.237958-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.272466-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.216391-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205
\n", - "

5 rows × 26 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", - "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", - "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", - "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", - "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", - "\n", - " 7 8 9 ... 16 17 18 19 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.104242 -0.373977 -0.252336 -0.227166 \n", - "1 -0.999830 -0.999825 -0.999825 ... -0.252336 -0.373635 -0.525701 -0.470726 \n", - "2 -0.999670 -0.999656 -0.999656 ... -0.237958 -0.373286 -0.408879 -0.348946 \n", - "3 -0.999499 -0.999485 -0.999494 ... -0.272466 -0.372968 -0.397196 -0.388759 \n", - "4 -0.999296 -0.999295 -0.999301 ... -0.216391 -0.372599 -0.282710 -0.271663 \n", - "\n", - " 20 21 22 23 24 25 \n", - "0 -0.201405 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 \n", - "1 -0.482436 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 \n", - "2 -0.337237 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 \n", - "3 -0.421546 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 \n", - "4 -0.210773 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 \n", - "\n", - "[5 rows x 26 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.set\n", - "\n", - "* Input: readings (dataframe)\n", - "* Output: readings (dataframe with turbine_id)\n", - "* Effect: turbine_id has been set as a readings column" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "step = 7\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...171819202122232425turbine_id
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.373977-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T001
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.373635-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T001
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.373286-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T001
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.372968-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T001
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.372599-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T001
\n", - "

5 rows × 27 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", - "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", - "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", - "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", - "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", - "\n", - " 7 8 9 ... 17 18 19 20 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.373977 -0.252336 -0.227166 -0.201405 \n", - "1 -0.999830 -0.999825 -0.999825 ... -0.373635 -0.525701 -0.470726 -0.482436 \n", - "2 -0.999670 -0.999656 -0.999656 ... -0.373286 -0.408879 -0.348946 -0.337237 \n", - "3 -0.999499 -0.999485 -0.999494 ... -0.372968 -0.397196 -0.388759 -0.421546 \n", - "4 -0.999296 -0.999295 -0.999301 ... -0.372599 -0.282710 -0.271663 -0.210773 \n", - "\n", - " 21 22 23 24 25 turbine_id \n", - "0 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 T001 \n", - "1 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 T001 \n", - "2 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 T001 \n", - "3 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 T001 \n", - "4 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 T001 \n", - "\n", - "[5 rows x 27 columns]" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## pandas.DataFrame.set\n", - "\n", - "* Input: readings (dataframe with turbine_id)\n", - "* Output: readings (dataframe with turbine_id and timestamp)\n", - "* Effect: timestamp has been set as a readings column" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "metadata": {}, - "outputs": [], - "source": [ - "step = 8\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
0123456789...1819202122232425turbine_idtimestamp
0-0.261261-0.237069-0.208701-0.141066-0.323288-0.259694-0.421988-1.000000-1.000000-1.000000...-0.252336-0.227166-0.201405-0.134818-0.322392-0.253801-0.418224-0.256975T0012013-01-10 00:00:00
1-0.533490-0.478056-0.490888-0.385580-0.510372-0.443400-0.554384-0.999830-0.999825-0.999825...-0.525701-0.470726-0.482436-0.378664-0.507620-0.440936-0.553738-0.480853T0012013-01-10 01:00:00
2-0.414414-0.359718-0.346267-0.297806-0.447358-0.360360-0.484866-0.999670-0.999656-0.999656...-0.408879-0.348946-0.337237-0.294256-0.439625-0.354386-0.483645-0.381043T0012013-01-10 02:00:00
3-0.402664-0.396160-0.428180-0.374608-0.499022-0.464943-0.567663-0.999499-0.999485-0.999494...-0.397196-0.388759-0.421546-0.371630-0.495897-0.461988-0.565421-0.442731T0012013-01-10 03:00:00
4-0.280846-0.264107-0.184793-0.222962-0.336986-0.271837-0.354814-0.999296-0.999295-0.999301...-0.282710-0.271663-0.210773-0.237984-0.343494-0.295906-0.413551-0.289205T0012013-01-10 04:00:00
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " 0 1 2 3 4 5 6 \\\n", - "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", - "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", - "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", - "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", - "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", - "\n", - " 7 8 9 ... 18 19 20 21 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.252336 -0.227166 -0.201405 -0.134818 \n", - "1 -0.999830 -0.999825 -0.999825 ... -0.525701 -0.470726 -0.482436 -0.378664 \n", - "2 -0.999670 -0.999656 -0.999656 ... -0.408879 -0.348946 -0.337237 -0.294256 \n", - "3 -0.999499 -0.999485 -0.999494 ... -0.397196 -0.388759 -0.421546 -0.371630 \n", - "4 -0.999296 -0.999295 -0.999301 ... -0.282710 -0.271663 -0.210773 -0.237984 \n", - "\n", - " 22 23 24 25 turbine_id timestamp \n", - "0 -0.322392 -0.253801 -0.418224 -0.256975 T001 2013-01-10 00:00:00 \n", - "1 -0.507620 -0.440936 -0.553738 -0.480853 T001 2013-01-10 01:00:00 \n", - "2 -0.439625 -0.354386 -0.483645 -0.381043 T001 2013-01-10 02:00:00 \n", - "3 -0.495897 -0.461988 -0.565421 -0.442731 T001 2013-01-10 03:00:00 \n", - "4 -0.343494 -0.295906 -0.413551 -0.289205 T001 2013-01-10 04:00:00 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 35, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences\n", - "\n", - "* Input: X, readings (dataframe with turbine_id and timestamp)\n", - "* Output: X\n", - "* Effect: X has been converted to a 3d numpy array that contains 1 matrix of shape\n", - " (window_size x num_signals) for each one of the target times." 
- ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline._pipeline.get_hyperparameters()[\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "step = 9\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(8521, 28)" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(353,)" - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['y'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(353, 24, 26)" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "metadata": { - "scrolled": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "array([[-0.58793576, -0.60305643, -0.63981971, -0.61481191, -0.69823875,\n", - " -0.65021543, -0.68912322, -0.99436914, -0.99439755, -0.99454249,\n", - " -0.99446788, -0.99476185, -0.99490997, -0.99529511, -0.34701493,\n", - " -0.33886256, -0.33860532, -0.36301186, -0.57943925, -0.59250585,\n", - " -0.6323185 , -0.60609613, -0.69284877, -0.64444444, -0.68691589,\n", - " -0.63853752],\n", - " [-0.56600078, -0.5846395 , -0.63002156, -0.61559561, -0.70880626,\n", - " -0.66392479, -0.69732474, -0.9942427 , -0.99427986, -0.9944408 ,\n", - " -0.99436498, -0.99468147, -0.99482011, -0.99521249, -0.33955224,\n", - " -0.31516588, -0.38892883, -0.36280656, -0.55841121, -0.57611241,\n", - " -0.62295082, -0.61078546, -0.70222743, -0.65847953, -0.69392523,\n", - " -0.63645815],\n", - " [-0.64081473, -0.64184953, -0.67038997, -0.63597179, -0.71350294,\n", - " -0.65844105, -0.66764304, -0.99412236, -0.99416864, -0.99434228,\n", - " -0.99426059, -0.99459663, -0.99472365, -0.99511795, -0.34328358,\n", - " -0.30094787, -0.36304817, -0.36259859, -0.63317757, -0.6323185 ,\n", - " -0.66042155, -0.62954279, -0.70926143, -0.65380117, -0.66588785,\n", - " -0.66002426]])" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'][0][:3]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## keras.Sequential.LSTMTimeSeriesClassifier\n", - "\n", - "* Input: X, y\n", - "* Output: \n", - "* Effect: LSTM has been fitted." 
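
For intuition, the sketch below is a minimal stand-in for this step. The real architecture and hyperparameters are defined by the keras.Sequential.LSTMTimeSeriesClassifier primitive; the unit count, dropout rate and training settings here are illustrative assumptions, and only the (window_size, n_signals) input shape comes from the steps above.

    from keras.layers import LSTM, Dense, Dropout
    from keras.models import Sequential

    def build_lstm_classifier(window_size=24, n_signals=26, units=80, dropout=0.3):
        # Binary classifier over fixed-length windows of scaled signal values.
        model = Sequential([
            LSTM(units, input_shape=(window_size, n_signals)),
            Dropout(dropout),
            Dense(1, activation='sigmoid'),
        ])
        model.compile(loss='binary_crossentropy', optimizer='adam',
                      metrics=['accuracy'])
        return model

    # model = build_lstm_classifier()
    # model.fit(context['X'], context['y'])
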
- ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": {}, - "outputs": [], - "source": [ - "step = 10\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From ffd512959df111d08a05226ea3c98abf07230414 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 11 Apr 2022 15:28:57 -0400 Subject: [PATCH 156/171] Update dependencies (#67) * update deps * add numpy and pandas * pin minimum for Jinja2 --- .gitignore | 1 + draco/pipeline.py | 12 ++++++++---- draco/pipelines/dummy/dummy.json | 11 +++++++++++ setup.py | 6 ++---- tests/test_pipeline.py | 13 +++++++++++++ 5 files changed, 35 insertions(+), 8 deletions(-) create mode 100644 draco/pipelines/dummy/dummy.json diff --git a/.gitignore b/.gitignore index b4e035b..25331c7 100644 --- a/.gitignore +++ b/.gitignore @@ -113,4 +113,5 @@ scripts/ dask-worker-space/ tutorials/*.pkl +*.pkl *.DS_Store diff --git a/draco/pipeline.py b/draco/pipeline.py index 98fb3d7..6a9adf6 100644 --- a/draco/pipeline.py +++ b/draco/pipeline.py @@ -9,7 +9,6 @@ from copy import deepcopy from hashlib import md5 -import cloudpickle import keras import numpy as np from btb import BTBSession @@ -612,14 +611,14 @@ def predict(self, target_times=None, readings=None, turbines=None, return predictions def save(self, path): - """Serialize and save this pipeline using cloudpickle. + """Serialize and save this pipeline using pickle. Args: path (str): Path to the file where the pipeline will be saved. """ with open(path, 'wb') as pickle_file: - cloudpickle.dump(self, pickle_file) + pickle.dump(self, pickle_file) @classmethod def load(cls, path): @@ -634,4 +633,9 @@ def load(cls, path): Loaded DracoPipeline instance. 
""" with open(path, 'rb') as pickle_file: - return cloudpickle.load(pickle_file) + pipeline = pickle.load(pickle_file) + + if not isinstance(pipeline, cls): + raise ValueError('Serialized object is not a DracoPipeline') + + return pipeline diff --git a/draco/pipelines/dummy/dummy.json b/draco/pipelines/dummy/dummy.json new file mode 100644 index 0000000..a28121e --- /dev/null +++ b/draco/pipelines/dummy/dummy.json @@ -0,0 +1,11 @@ +{ + "primitives": [ + "mlprimitives.custom.preprocessing.ClassEncoder", + "mlprimitives.custom.feature_extraction.DatetimeFeaturizer", + "mlprimitives.custom.feature_extraction.CategoricalEncoder", + "mlprimitives.custom.feature_extraction.StringVectorizer", + "sklearn.impute.SimpleImputer", + "sklearn.linear_model.LogisticRegression", + "mlprimitives.custom.preprocessing.ClassDecoder" + ] +} \ No newline at end of file diff --git a/setup.py b/setup.py index 0157b7c..5c2de23 100644 --- a/setup.py +++ b/setup.py @@ -22,15 +22,12 @@ 'pymongo>=3.7.2,<4', 'scikit-learn>=0.21', 'tqdm<4.50.0,>=4.36.1', - 'cloudpickle>=1.6,<2', 'scipy>=1.0.1,<2', - 'numpy<1.19.0,>=1.16.0', + 'numpy>=1.16.0,<1.21.0', 'pandas>=1,<2', 'partd>=1.1.0,<2', 'fsspec>=0.8.5,<0.9', 'dask>=2.6.0,<3', - 'distributed>=2.6.0,<3', - 'h5py<2.11.0,>=2.10.0', # fix tensorflow requirement 'tabulate>=0.8.3,<0.9', 'xlsxwriter>=1.3.6<1.4', ] @@ -59,6 +56,7 @@ 'sphinx_rtd_theme>=0.2.4,<0.5', 'docutils>=0.14,<0.18', 'autodocsumm>=0.1.10', + 'Jinja2>=2,<3', # style check 'flake8>=3.7.7,<4', diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index f8526c9..3b7359f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -67,3 +67,16 @@ def test_predict(self, load_pipeline_mock, mlpipeline_mock): instance.fitted = True target_times, readings = self._get_data() instance.predict(target_times, readings) + + def test_save_load(self): + file = 'path.pkl' + + # Run + instance = DracoPipeline('dummy', 'accuracy') + instance.save(file) + new_instance = DracoPipeline.load(file) + + # Asserts + assert isinstance(new_instance, instance.__class__) + assert instance.template == new_instance.template + assert instance.fitted == new_instance.fitted From a15783cc84750ec19bbb485670bdd2e6069a66bd Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 11 Apr 2022 17:44:03 -0400 Subject: [PATCH 157/171] Fix Doc Generation (#68) * add docs test * pin doc deps --- .github/workflows/tests.yml | 17 +++++++++++++++++ setup.py | 1 + 2 files changed, 18 insertions(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d4c79b9..6d73a25 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -7,6 +7,23 @@ on: branches: [ master ] jobs: + docs: + runs-on: ${{ matrix.os }} + strategy: + matrix: + python-version: [3.8] + os: [ubuntu-latest] + steps: + - uses: actions/checkout@v1 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v1 + with: + python-version: ${{ matrix.python-version }} + - name: Install package + run: python -m pip install .[dev] + - name: make docs + run: make docs + lint: runs-on: ${{ matrix.os }} strategy: diff --git a/setup.py b/setup.py index 5c2de23..69d188e 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ 'sphinx_rtd_theme>=0.2.4,<0.5', 'docutils>=0.14,<0.18', 'autodocsumm>=0.1.10', + 'markupsafe<2.1.0', 'Jinja2>=2,<3', # style check From 0a678f175065e4eee01b2f5d856ddaa5a7bcdccd Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish 
<40212131+sarahmish@users.noreply.github.com> Date: Tue, 12 Apr 2022 03:25:19 -0400
Subject: [PATCH 158/171] Prepare Release v0.2.0 (#69) MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * update release
commands * Bump version: 0.1.1.dev0 → 0.1.1.dev1 * cap tensorflow * Bump version:
0.1.1.dev1 → 0.1.1.dev2 * add release notes --- HISTORY.md | 11 +++++++++++
Makefile | 8 +++++++- draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 3 ++-
 5 files changed, 22 insertions(+), 4 deletions(-) diff --git a/HISTORY.md b/HISTORY.md
index 84a28d5..539ca0e 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,9 +1,20 @@ # History +
+## 0.2.0 - 2022-04-12 + +This release features a reorganization and renaming of ``Draco``
pipelines. In addition, +we update some of the dependencies for general housekeeping. +
+* Update Draco dependencies - [Issue #66](https://github.com/signals-dev/Draco/issues/66) by @sarahmish
+* Reorganize pipelines - [Issue #63](https://github.com/signals-dev/Draco/issues/63) by @sarahmish +
+ ## 0.1.0 - 2022-01-01 * First release on ``draco-ml`` PyPI + ## Previous GreenGuard development
### 0.3.0 - 2021-01-22 diff --git a/Makefile b/Makefile index a6ad0e5..590d8bc 100644
--- a/Makefile +++ b/Makefile @@ -256,7 +256,7 @@ check-release: check-candidate check-clean check-master check-history ## Check i
 @echo "A new release can be made" .PHONY: release
-release: check-release bumpversion-release docker-push publish bumpversion-patch
+release: check-release bumpversion-release publish bumpversion-patch .PHONY: release-test
release-test: check-release bumpversion-release-test publish-test bumpversion-revert
@@ -267,6 +267,12 @@ release-candidate: check-master publish bumpversion-candidate
.PHONY: release-candidate-test release-candidate-test: check-clean check-master publish-test
+.PHONY: release-minor +release-minor: check-release bumpversion-minor release +
+.PHONY: release-major +release-major: check-release bumpversion-major release +
 # DOCKER TARGETS diff --git a/draco/__init__.py b/draco/__init__.py index a1a5e8a..94384c3 100644
--- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab"""
__email__ = 'dailabmit@gmail.com' -__version__ = '0.1.1.dev0' +__version__ = '0.1.1.dev2' import os
diff --git a/setup.cfg b/setup.cfg index 597575d..3113a93 100644 --- a/setup.cfg +++ b/setup.cfg
@@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.1.dev0 +current_version = 0.1.1.dev2
commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
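The `parse` expression above is what drives each of the ``Bump version`` commits in this series: bumpversion decomposes the current version into named parts, increments one of them, and serializes the result back. A minimal sketch of that parsing step, shown only for illustration and assuming the group names in the reconstructed config above (it is not part of the patch itself):

```python3
import re

# The bumpversion `parse` expression from the setup.cfg hunk above, with
# its named groups: major/minor/patch plus an optional pre-release tag.
PARSE = (
    r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
    r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
)

# Decompose the pre-release version that this patch bumps to.
parts = re.match(PARSE, '0.1.1.dev2').groupdict()
print(parts)
# {'major': '0', 'minor': '1', 'patch': '1', 'release': 'dev', 'candidate': '2'}
```

A stable version such as `0.2.0` matches the same expression with `release` and `candidate` left as `None`, which is how the tool tells dev builds apart from final releases.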
diff --git a/setup.py b/setup.py index 69d188e..f73f2ab 100644 --- a/setup.py +++ b/setup.py
@@ -25,6 +25,7 @@ 'scipy>=1.0.1,<2', 'numpy>=1.16.0,<1.21.0', 'pandas>=1,<2',
+ 'tensorflow>=2,<2.3', 'partd>=1.1.0,<2', 'fsspec>=0.8.5,<0.9', 'dask>=2.6.0,<3',
@@ -114,6 +115,6 @@ test_suite='tests', tests_require=tests_require,
url='https://github.com/sintel-dev/Draco', - version='0.1.1.dev0', + version='0.1.1.dev2',
zip_safe=False, ) From 49d69e670d2f8ec36746b2cfd4f3893ffecf2065 Mon Sep 17 00:00:00 2001
From: sarahmish Date: Tue, 12 Apr 2022 03:28:18 -0400 Subject: [PATCH 159/171]
=?UTF-8?q?Bump=20version:=200.1.1.dev2=20?= =?UTF-8?q?=E2=86=92=200.2.0.dev0?=
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ---
 draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/draco/__init__.py b/draco/__init__.py index 94384c3..e11da10 100644
--- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab"""
__email__ = 'dailabmit@gmail.com' -__version__ = '0.1.1.dev2' +__version__ = '0.2.0.dev0' import os
diff --git a/setup.cfg b/setup.cfg index 3113a93..a225134 100644 --- a/setup.cfg +++ b/setup.cfg
@@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.1.1.dev2 +current_version = 0.2.0.dev0
commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py index f73f2ab..39ba79e 100644 --- a/setup.py +++ b/setup.py
@@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require,
url='https://github.com/sintel-dev/Draco', - version='0.1.1.dev2', + version='0.2.0.dev0',
zip_safe=False, ) From 9aa2b2203fd8c24d965c15c98d6409ead33f5ec0 Mon Sep 17 00:00:00 2001
From: sarahmish Date: Tue, 12 Apr 2022 03:28:19 -0400 Subject: [PATCH 160/171]
=?UTF-8?q?Bump=20version:=200.2.0.dev0=20?= =?UTF-8?q?=E2=86=92=200.2.0?=
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ---
 draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/draco/__init__.py b/draco/__init__.py index e11da10..8f1402d 100644
--- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab"""
__email__ = 'dailabmit@gmail.com' -__version__ = '0.2.0.dev0' +__version__ = '0.2.0' import os
diff --git a/setup.cfg b/setup.cfg index a225134..d366bb3 100644 --- a/setup.cfg +++ b/setup.cfg
@@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.0.dev0 +current_version = 0.2.0
commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
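Each of the version-bump patches in this stretch touches the same three files (`draco/__init__.py`, `setup.cfg` and `setup.py`), because bumpversion simply substitutes the literal version string in every file listed in its configuration and commits the result. A rough sketch of that substitution step, using a hypothetical helper rather than the real tool (which is invoked through the Makefile targets above):

```python3
from pathlib import Path

# Hypothetical stand-in for the substitution bumpversion performs on each
# bump: swap the literal current version for the new one in every
# configured file. The real tool additionally commits and tags the change.
def bump_files(current, new,
               filenames=('draco/__init__.py', 'setup.cfg', 'setup.py')):
    for name in filenames:
        path = Path(name)
        text = path.read_text()
        if current not in text:
            raise ValueError(f'{name} does not contain version {current!r}')
        path.write_text(text.replace(current, new))

# The bumps applied by the surrounding patches would correspond to:
# bump_files('0.1.1.dev2', '0.2.0.dev0')   # PATCH 159
# bump_files('0.2.0.dev0', '0.2.0')        # PATCH 160
```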
diff --git a/setup.py b/setup.py index 39ba79e..a104b00 100644 --- a/setup.py +++ b/setup.py
@@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require,
url='https://github.com/sintel-dev/Draco', - version='0.2.0.dev0', + version='0.2.0',
zip_safe=False, ) From fa13eb0d52e82cf86d8ec8bea5409519902da1a4 Mon Sep 17 00:00:00 2001
From: sarahmish Date: Tue, 12 Apr 2022 03:28:34 -0400 Subject: [PATCH 161/171]
=?UTF-8?q?Bump=20version:=200.2.0=20=E2=86=92=200?= =?UTF-8?q?.2.1.dev0?=
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ---
 draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/draco/__init__.py b/draco/__init__.py index 8f1402d..e134da2 100644
--- a/draco/__init__.py +++ b/draco/__init__.py @@ -4,7 +4,7 @@ __author__ = """MIT Data To AI Lab"""
__email__ = 'dailabmit@gmail.com' -__version__ = '0.2.0' +__version__ = '0.2.1.dev0' import os
diff --git a/setup.cfg b/setup.cfg index d366bb3..e78faaa 100644 --- a/setup.cfg +++ b/setup.cfg
@@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.2.0 +current_version = 0.2.1.dev0
commit = True tag = True parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py index a104b00..d5deb41 100644 --- a/setup.py +++ b/setup.py
@@ -115,6 +115,6 @@ test_suite='tests', tests_require=tests_require,
url='https://github.com/sintel-dev/Draco', - version='0.2.0', + version='0.2.1.dev0',
zip_safe=False, ) From 03ab7c9f4b5e2bddff3ee4c4f26b1faf30ef7941 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Fri, 24 Feb 2023 17:45:05 -0600
Subject: [PATCH 162/171] Update packages and project description (#70) * remove wind industries
* pin `protobuf` * pin metadata * pin ubuntu test version * pin importlib ---
 .github/workflows/tests.yml | 52 ++++++++++++++++++------------------- README.md | 4 +--
 docs/conf.py | 2 +- setup.py | 7 +++-- 4 files changed, 34 insertions(+), 31 deletions(-)
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6d73a25..17d140f 100644
--- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs:
 steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with:
 python-version: ${{ matrix.python-version }} - name: Install package @@ -29,30 +29,30 @@
 strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest] + os: [ubuntu-20.04]
 steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with:
 python-version: ${{ matrix.python-version }} - name: Install dependencies run: |
 python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox
- run: tox -e lint + pip install .[dev] + - name: make lint + run: make lint
 readme: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8]
- os: [ubuntu-latest] + os: [ubuntu-20.04] steps: - uses: actions/checkout@v1
- name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1
+ uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }}
- name: Install libgomp1 @@ -61,63 +61,63 @@ - name: Install dependencies run: |
 python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox
- run: tox -e readme + pip install rundoc . 
+ - name: make readme + run: make test-readme unit: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest, macos-10.15] + os: [ubuntu-20.04, macos-10.15] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox - run: tox -e unit + pip install .[test] + - name: make unit + run: make test-unit minimum: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest] + os: [ubuntu-20.04] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox - run: tox -e minimum + pip install .[test] + - name: make minimum + run: make test-minimum tutorials: runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-latest] + os: [ubuntu-20.04] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v1 + uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip - pip install tox tox-gh-actions - - name: Test with tox - run: tox -e tutorials + pip install jupyter . + - name: make tutorials + run: make test-tutorials diff --git a/README.md b/README.md index 2d398e4..08effaa 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@

-AutoML for Renewable Energy Industries. +AutoML for Time Series.

@@ -29,7 +29,7 @@ AutoML for Renewable Energy Industries. ## Overview The Draco project is a collection of end-to-end solutions for machine learning problems -commonly found in monitoring wind energy production systems. Most tasks utilize sensor data +commonly found in time series monitoring systems. Most tasks utilize sensor data emanating from monitoring systems. We utilize the foundational innovations developed for automation of machine Learning at Data to AI Lab at MIT. diff --git a/docs/conf.py b/docs/conf.py index ecd0023..e01a346 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -63,7 +63,7 @@ title = project + ' Documentation', copyright = '2018, MIT Data To AI Lab' author = 'MIT Data To AI Lab' -description = 'AutoML for Renewable Energy Industries' +description = 'AutoML for Time Series' user = 'sintel-dev' # The version info for the project you're documenting, acts as replacement diff --git a/setup.py b/setup.py index d5deb41..0118bd0 100644 --- a/setup.py +++ b/setup.py @@ -31,6 +31,9 @@ 'dask>=2.6.0,<3', 'tabulate>=0.8.3,<0.9', 'xlsxwriter>=1.3.6<1.4', + # fix conflicts + 'protobuf<4', + 'importlib-metadata<2,>=0.12', ] setup_requires = [ @@ -67,6 +70,7 @@ # fix style issues 'autoflake>=1.1,<2', 'autopep8>=1.4.3,<2', + 'importlib-metadata<5', # distribute on PyPI 'twine>=1.10.0,<4', @@ -75,7 +79,6 @@ # Advanced testing 'coverage>=4.5.1,<6', 'tox>=2.9.1,<4', - 'importlib-metadata<2,>=0.12', ] setup( @@ -91,7 +94,7 @@ 'Programming Language :: Python :: 3.7', 'Programming Language :: Python :: 3.8', ], - description='AutoML for Renewable Energy Industries.', + description='AutoML for Time Series.', entry_points={ 'mlblocks': [ 'pipelines=draco:MLBLOCKS_PIPELINES', From 854391fbff6d8334ddd221bf248dca77e9656859 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Thu, 13 Apr 2023 16:25:17 -0400 Subject: [PATCH 163/171] Remove dfs pipelines (#74) --- README.md | 13 +- draco/pipelines/dfs_xgb/dfs_xgb.json | 29 - ...fs_xgb_prob_with_double_normalization.json | 42 - .../dfs_xgb/dfs_xgb_prob_with_unstack.json | 50 - ...s_xgb_prob_with_unstack_normalization.json | 49 - .../dfs_xgb_with_double_normalization.json | 37 - .../dfs_xgb/dfs_xgb_with_normalization.json | 29 - .../dfs_xgb/dfs_xgb_with_unstack.json | 45 - .../dfs_xgb_with_unstack_normalization.json | 44 - .../double_lstm_prob_with_unstack.json | 37 +- .../double_lstm/double_lstm_with_unstack.json | 37 +- .../lstm/lstm_prob_with_unstack.json | 37 +- draco/pipelines/lstm/lstm_with_unstack.json | 37 +- .../lstm_regressor_with_unstack.json | 39 +- .../double_entity_normalization.json | 51 - .../preprocessing/entity_dataframe.json | 26 - .../preprocessing/entity_normalization.json | 20 - draco/pipelines/preprocessing/unstack.json | 43 - setup.py | 7 +- tests/test_benchmark.py | 2 +- tutorials/01_Draco_Machine_Learning.ipynb | 236 ++- tutorials/03_Benchmarking.ipynb | 253 ++- tutorials/04_Draco_Regression_Pipeline.ipynb | 81 +- .../dfs_xgb_with_double_normalization.ipynb | 1363 -------------- .../dfs_xgb_with_unstack_normalization.ipynb | 1611 ----------------- 25 files changed, 419 insertions(+), 3799 deletions(-) delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb.json delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json delete mode 100644 
draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json delete mode 100644 draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json delete mode 100644 draco/pipelines/preprocessing/double_entity_normalization.json delete mode 100644 draco/pipelines/preprocessing/entity_dataframe.json delete mode 100644 draco/pipelines/preprocessing/entity_normalization.json delete mode 100644 draco/pipelines/preprocessing/unstack.json delete mode 100644 tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb delete mode 100644 tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb diff --git a/README.md b/README.md index 08effaa..365c442 100644 --- a/README.md +++ b/README.md @@ -220,18 +220,17 @@ The returned `pipeline` variable will be `list` containing the names of all the available in the Draco system: ``` -['dfs_xgb', - 'dfs_xgb_with_unstack', - 'dfs_xgb_with_normalization', - 'dfs_xgb_with_unstack_normalization', - 'dfs_xgb_prob_with_unstack_normalization'] +['lstm', + 'lstm_with_unstack', + 'double_lstm', + 'double_lstm_with_unstack'] ``` For the rest of this tutorial, we will select and use the pipeline -`dfs_xgb_with_unstack_normalization` as our template. +`lstm_with_unstack` as our template. ```python3 -pipeline_name = 'dfs_xgb_with_unstack_normalization' +pipeline_name = 'lstm_with_unstack' ``` ## 3. Fitting the Pipeline diff --git a/draco/pipelines/dfs_xgb/dfs_xgb.json b/draco/pipelines/dfs_xgb/dfs_xgb.json deleted file mode 100644 index 4cb3cbf..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.entity_dataframe" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "dataframe": "readings" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json deleted file mode 100644 index 4231115..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json +++ /dev/null @@ -1,42 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier:probabilities", - "numpy.take" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.double_entity_normalization", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - } - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - }, - "numpy.take#1": { - "indices": 1, - "axis": 1 - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json 
b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json deleted file mode 100644 index 03ef141..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier:probabilities", - "numpy.take" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } - }, - "mlblocks.MLPipeline#2": { - "pipeline": "preprocessing.entity_dataframe" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - }, - "numpy.take#1": { - "indices": 1, - "axis": 1 - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "X": "readings" - }, - "mlblocks.MLPipeline#2": { - "dataframe": "readings", - "turbines": "turbines" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json deleted file mode 100644 index ca0c4fa..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json +++ /dev/null @@ -1,49 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier:probabilities", - "numpy.take" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } - }, - "mlblocks.MLPipeline#2": { - "pipeline": "preprocessing.entity_normalization" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - }, - "numpy.take#1": { - "indices": 1, - "axis": 1 - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "X": "readings" - }, - "mlblocks.MLPipeline#2": { - "dataframe": "readings" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json deleted file mode 100644 index 82ae325..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json +++ /dev/null @@ -1,37 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.double_entity_normalization", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - } - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json 
b/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json deleted file mode 100644 index d9d7911..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.entity_normalization" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "dataframe": "readings" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json deleted file mode 100644 index dd01f23..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json +++ /dev/null @@ -1,45 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } - }, - "mlblocks.MLPipeline#2": { - "pipeline": "preprocessing.entity_dataframe" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "X": "readings" - }, - "mlblocks.MLPipeline#2": { - "dataframe": "readings", - "turbines": "turbines" - } - } -} diff --git a/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json b/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json deleted file mode 100644 index 87e6999..0000000 --- a/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json +++ /dev/null @@ -1,44 +0,0 @@ -{ - "primitives": [ - "mlblocks.MLPipeline", - "mlblocks.MLPipeline", - "featuretools.dfs", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "xgboost.XGBClassifier" - ], - "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "preprocessing.unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } - }, - "mlblocks.MLPipeline#2": { - "pipeline": "preprocessing.entity_normalization" - }, - "featuretools.dfs#1": { - "target_entity": "turbines", - "index": "turbine_id", - "time_index": "cutoff_time", - "encode": false, - "max_depth": -1, - "copy": true, - "verbose": false, - "n_jobs": 1, - "training_window": "1d" - } - }, - "input_names": { - "mlblocks.MLPipeline#1": { - "X": "readings" - }, - "mlblocks.MLPipeline#2": { - "dataframe": "readings" - } - } -} diff --git a/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json b/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json index 289a794..ea48a87 100644 --- a/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json +++ b/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json @@ -1,6 +1,7 @@ { "primitives": [ - "mlblocks.MLPipeline", + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", "pandas.DataFrame.pop", 
"pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -13,16 +14,19 @@ "numpy.take" ], "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -63,7 +67,10 @@ } }, "input_names": { - "mlblocks.MLPipeline#1": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -94,6 +101,12 @@ } }, "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/double_lstm/double_lstm_with_unstack.json b/draco/pipelines/double_lstm/double_lstm_with_unstack.json index 1d08259..dede502 100644 --- a/draco/pipelines/double_lstm/double_lstm_with_unstack.json +++ b/draco/pipelines/double_lstm/double_lstm_with_unstack.json @@ -1,6 +1,7 @@ { "primitives": [ - "mlblocks.MLPipeline", + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -12,16 +13,19 @@ "keras.Sequential.DoubleLSTMTimeSeriesClassifier" ], "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -56,7 +60,10 @@ } }, "input_names": { - "mlblocks.MLPipeline#1": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -87,6 +94,12 @@ } }, "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/lstm/lstm_prob_with_unstack.json b/draco/pipelines/lstm/lstm_prob_with_unstack.json index 1ad69bc..9272257 100644 --- a/draco/pipelines/lstm/lstm_prob_with_unstack.json +++ b/draco/pipelines/lstm/lstm_prob_with_unstack.json @@ -1,6 +1,7 @@ { "primitives": [ - "mlblocks.MLPipeline", + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -13,16 +14,19 @@ "numpy.take" ], "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -63,7 +67,10 @@ } }, "input_names": 
{ - "mlblocks.MLPipeline#1": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -94,6 +101,12 @@ } }, "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/lstm/lstm_with_unstack.json b/draco/pipelines/lstm/lstm_with_unstack.json index 18c486a..ab9dd99 100644 --- a/draco/pipelines/lstm/lstm_with_unstack.json +++ b/draco/pipelines/lstm/lstm_with_unstack.json @@ -1,6 +1,7 @@ { "primitives": [ - "mlblocks.MLPipeline", + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -12,16 +13,19 @@ "keras.Sequential.LSTMTimeSeriesClassifier" ], "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } + "pandas.DataFrame.resample#1": { + "rule": "3600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -56,7 +60,10 @@ } }, "input_names": { - "mlblocks.MLPipeline#1": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -87,6 +94,12 @@ } }, "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json b/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json index d546827..9e183b9 100644 --- a/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json +++ b/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json @@ -1,6 +1,7 @@ { "primitives": [ - "mlblocks.MLPipeline", + "pandas.DataFrame.resample", + "pandas.DataFrame.unstack", "pandas.DataFrame.pop", "pandas.DataFrame.pop", "sklearn.impute.SimpleImputer", @@ -12,16 +13,19 @@ "keras.Sequential.LSTMTimeSeriesRegressor" ], "init_params": { - "mlblocks.MLPipeline#1": { - "pipeline": "unstack", - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "df" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - } + "pandas.DataFrame.resample#1": { + "rule": "600s", + "on": "timestamp", + "groupby": [ + "turbine_id", + "signal_id" + ], + "aggregation": "mean", + "reset_index": false + }, + "pandas.DataFrame.unstack#1": { + "level": "signal_id", + "reset_index": true }, "pandas.DataFrame.pop#1": { "item": "turbine_id" @@ -52,11 +56,14 @@ }, "keras.Sequential.LSTMTimeSeriesRegressor#1": { "epochs": 35, - "verbose": false + "verbose": true } }, "input_names": { - "mlblocks.MLPipeline#1": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { "X": "readings" }, "pandas.DataFrame.pop#1": { @@ -87,6 +94,12 @@ } }, "output_names": { + "pandas.DataFrame.resample#1": { + "X": "readings" + }, + "pandas.DataFrame.unstack#1": { + "X": "readings" + }, "pandas.DataFrame.pop#1": { "item": "turbine_id" }, diff --git a/draco/pipelines/preprocessing/double_entity_normalization.json 
b/draco/pipelines/preprocessing/double_entity_normalization.json deleted file mode 100644 index 1438bbe..0000000 --- a/draco/pipelines/preprocessing/double_entity_normalization.json +++ /dev/null @@ -1,51 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity", - "featuretools.EntitySet.normalize_entity" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": true - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - }, - "featuretools.EntitySet.normalize_entity#2": { - "base_entity_id": "readings", - "new_entity_id": "signals", - "index": "signal_id", - "make_time_index": false - } - }, - "input_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "featuretools.EntitySet.entity_from_dataframe#1": { - "dataframe": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - } - } -} diff --git a/draco/pipelines/preprocessing/entity_dataframe.json b/draco/pipelines/preprocessing/entity_dataframe.json deleted file mode 100644 index 0bd238a..0000000 --- a/draco/pipelines/preprocessing/entity_dataframe.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "primitives": [ - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.add_relationship" - ], - "init_params": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.entity_from_dataframe#2": { - "entity_id": "turbines", - "index": "turbine_id", - "make_index": false - }, - "featuretools.EntitySet.add_relationship#1": { - "parent": "turbines", - "parent_column": "turbine_id", - "child": "readings", - "child_column": "turbine_id" - } - } -} diff --git a/draco/pipelines/preprocessing/entity_normalization.json b/draco/pipelines/preprocessing/entity_normalization.json deleted file mode 100644 index 9f3f3ab..0000000 --- a/draco/pipelines/preprocessing/entity_normalization.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "primitives": [ - "featuretools.EntitySet.entity_from_dataframe", - "featuretools.EntitySet.normalize_entity" - ], - "init_params": { - "featuretools.EntitySet.entity_from_dataframe#1": { - "entity_id": "readings", - "index": "reading_id", - "make_index": true, - "time_index": "timestamp" - }, - "featuretools.EntitySet.normalize_entity#1": { - "base_entity_id": "readings", - "new_entity_id": "turbines", - "index": "turbine_id", - "make_time_index": false - } - } -} diff --git a/draco/pipelines/preprocessing/unstack.json b/draco/pipelines/preprocessing/unstack.json deleted file mode 100644 index 1acd833..0000000 --- a/draco/pipelines/preprocessing/unstack.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "primitives": [ - "pandas.DataFrame.resample", - "pandas.DataFrame.unstack" - ], - "init_params": { - "pandas.DataFrame.resample#1": { - "rule": "600s", - "on": "timestamp", - "groupby": [ - "turbine_id", - "signal_id" - ], - "aggregation": "mean", - "reset_index": false - }, - "pandas.DataFrame.unstack#1": { - 
"level": "signal_id", - "reset_index": true - } - }, - "input_names": { - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - }, - "output_names": { - "pandas.DataFrame.resample#1": { - "X": "readings" - }, - "pandas.DataFrame.unstack#1": { - "X": "readings" - } - }, - "outputs": { - "default": [ - { - "name": "readings", - "variable": "pandas.DataFrame.unstack#1.readings" - } - ] - } -} diff --git a/setup.py b/setup.py index 0118bd0..1dfcc4b 100644 --- a/setup.py +++ b/setup.py @@ -20,10 +20,10 @@ 'mlprimitives>=0.3.2,<0.4', 'mlblocks>=0.4.0,<0.5', 'pymongo>=3.7.2,<4', - 'scikit-learn>=0.21', + 'scikit-learn>=0.21,<1.2', 'tqdm<4.50.0,>=4.36.1', 'scipy>=1.0.1,<2', - 'numpy>=1.16.0,<1.21.0', + 'numpy>=1.16.0,<1.19', 'pandas>=1,<2', 'tensorflow>=2,<2.3', 'partd>=1.1.0,<2', @@ -33,7 +33,8 @@ 'xlsxwriter>=1.3.6<1.4', # fix conflicts 'protobuf<4', - 'importlib-metadata<2,>=0.12', + 'importlib-metadata<5', + #'importlib-metadata<2,>=0.12', ] setup_requires = [ diff --git a/tests/test_benchmark.py b/tests/test_benchmark.py index d88425b..60736b0 100644 --- a/tests/test_benchmark.py +++ b/tests/test_benchmark.py @@ -8,7 +8,7 @@ def test_predict(): # setup templates = [ - 'dfs_xgb_prob_with_unstack_normalization' + 'lstm_with_unstack' ] window_size_rule = [ diff --git a/tutorials/01_Draco_Machine_Learning.ipynb b/tutorials/01_Draco_Machine_Learning.ipynb index 6b1089e..8138127 100644 --- a/tutorials/01_Draco_Machine_Learning.ipynb +++ b/tutorials/01_Draco_Machine_Learning.ipynb @@ -414,14 +414,9 @@ { "data": { "text/plain": [ - "['dfs_xgb_prob_with_unstack',\n", - " 'dfs_xgb_with_normalization',\n", - " 'dfs_xgb',\n", - " 'dfs_xgb_with_unstack',\n", - " 'dfs_xgb_prob_with_unstack_normalization',\n", - " 'dfs_xgb_with_unstack_normalization',\n", - " 'dfs_xgb_prob_with_double_normalization',\n", - " 'dfs_xgb_with_double_normalization',\n", + "['dummy',\n", + " 'lstm_regressor_with_unstack',\n", + " 'lstm_regressor',\n", " 'double_lstm_prob_with_unstack',\n", " 'double_lstm_prob',\n", " 'double_lstm',\n", @@ -458,14 +453,16 @@ { "data": { "text/plain": [ - "['dfs_xgb_prob_with_unstack',\n", - " 'dfs_xgb_with_normalization',\n", - " 'dfs_xgb',\n", - " 'dfs_xgb_with_unstack',\n", - " 'dfs_xgb_prob_with_unstack_normalization',\n", - " 'dfs_xgb_with_unstack_normalization',\n", - " 'dfs_xgb_prob_with_double_normalization',\n", - " 'dfs_xgb_with_double_normalization']" + "['lstm_regressor_with_unstack',\n", + " 'lstm_regressor',\n", + " 'double_lstm_prob_with_unstack',\n", + " 'double_lstm_prob',\n", + " 'double_lstm',\n", + " 'double_lstm_with_unstack',\n", + " 'lstm_prob_with_unstack',\n", + " 'lstm_with_unstack',\n", + " 'lstm_prob',\n", + " 'lstm']" ] }, "execution_count": 11, @@ -474,7 +471,7 @@ } ], "source": [ - "get_pipelines('dfs')" + "get_pipelines('lstm')" ] }, { @@ -493,14 +490,16 @@ { "data": { "text/plain": [ - "{'dfs_xgb_prob_with_unstack': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack.json',\n", - " 'dfs_xgb_with_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_normalization.json',\n", - " 'dfs_xgb': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb.json',\n", - " 'dfs_xgb_with_unstack': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack.json',\n", - " 'dfs_xgb_prob_with_unstack_normalization': 
'/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_unstack_normalization.json',\n", - " 'dfs_xgb_with_unstack_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_unstack_normalization.json',\n", - " 'dfs_xgb_prob_with_double_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_prob_with_double_normalization.json',\n", - " 'dfs_xgb_with_double_normalization': '/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/draco/pipelines/dfs_xgb/dfs_xgb_with_double_normalization.json'}" + "{'lstm_regressor_with_unstack': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json',\n", + " 'lstm_regressor': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/lstm_regressor/lstm_regressor.json',\n", + " 'double_lstm_prob_with_unstack': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json',\n", + " 'double_lstm_prob': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/double_lstm/double_lstm_prob.json',\n", + " 'double_lstm': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/double_lstm/double_lstm.json',\n", + " 'double_lstm_with_unstack': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/double_lstm/double_lstm_with_unstack.json',\n", + " 'lstm_prob_with_unstack': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/lstm/lstm_prob_with_unstack.json',\n", + " 'lstm_with_unstack': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/lstm/lstm_with_unstack.json',\n", + " 'lstm_prob': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/lstm/lstm_prob.json',\n", + " 'lstm': '/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/draco/pipelines/lstm/lstm.json'}" ] }, "execution_count": 12, @@ -509,7 +508,7 @@ } ], "source": [ - "get_pipelines('dfs', path=True)" + "get_pipelines('lstm', path=True)" ] }, { @@ -539,8 +538,8 @@ "outputs": [], "source": [ "templates = [\n", - " 'dfs_xgb_with_unstack_normalization', \n", - " 'dfs_xgb_with_double_normalization'\n", + " 'lstm_with_unstack', \n", + " 'double_lstm_with_unstack'\n", "]" ] }, @@ -613,56 +612,43 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Obtaining default configuration for dfs_xgb_with_unstack_normalization\n", - "INFO:draco.pipeline:New configuration found:\n", - " Template: dfs_xgb_with_unstack_normalization \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 100\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.1\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.0\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 1\n", - "INFO:btb.session:New optimal found: dfs_xgb_with_unstack_normalization - 0.6117760013143775\n", - "INFO:btb.session:Obtaining default configuration for dfs_xgb_with_double_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", + "INFO:btb.session:Obtaining default configuration for lstm_with_unstack\n", + "2023-04-07 
14:17:30.569247: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", + "2023-04-07 14:17:30.594310: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fa3c50cbbb0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", + "2023-04-07 14:17:30.594323: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: dfs_xgb_with_unstack_normalization \n", + " Template: lstm_with_unstack \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 90\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 342\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 6\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.9043352048331922\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.5258350872963311\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 9\n", - "INFO:btb.session:New optimal found: dfs_xgb_with_unstack_normalization - 0.6205571445297473\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", + "INFO:btb.session:New optimal found: lstm_with_unstack - 0.6292647327130085\n", + "INFO:btb.session:Obtaining default configuration for double_lstm_with_unstack\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: dfs_xgb_with_double_normalization \n", + " Template: double_lstm_with_unstack \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 80\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 66\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", - "INFO:btb.session:New optimal found: dfs_xgb_with_double_normalization - 0.629513025867624\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n" + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 80\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_2_rate'): 0.3\n", + "INFO:btb.session:New optimal found: double_lstm_with_unstack - 0.6434978589136803\n", + "INFO:btb.session:Generating new proposal configuration for double_lstm_with_unstack\n" ] }, { "data": { "text/plain": [ - "{'id': '452a22a136f67c575aee3341c9dc2395',\n", - " 'name': 'dfs_xgb_with_double_normalization',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 80,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 66,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 
0.14135407511034503,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.629513025867624}" + "{'id': '2904e940a6e56612e275f93afc00c6e6',\n", + " 'name': 'double_lstm_with_unstack',\n", + " 'config': {('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 80,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1',\n", + " 'dropout_2_rate'): 0.3},\n", + " 'score': 0.6434978589136803}" ] }, "execution_count": 16, @@ -671,7 +657,7 @@ } ], "source": [ - "session.run(5)" + "session.run(3)" ] }, { @@ -690,16 +676,15 @@ { "data": { "text/plain": [ - "{'id': '452a22a136f67c575aee3341c9dc2395',\n", - " 'name': 'dfs_xgb_with_double_normalization',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 80,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 66,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.629513025867624}" + "{'id': '2904e940a6e56612e275f93afc00c6e6',\n", + " 'name': 'double_lstm_with_unstack',\n", + " 'config': {('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 80,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1',\n", + " 'dropout_2_rate'): 0.3},\n", + " 'score': 0.6434978589136803}" ] }, "execution_count": 17, @@ -726,13 +711,11 @@ { "data": { "text/plain": [ - "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 80,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 66,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 10,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.6434375682152088,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.14135407511034503,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" + "{('sklearn.impute.SimpleImputer#1', 'strategy'): 'mean',\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 80,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_2_rate'): 0.3}" ] }, "execution_count": 18, @@ -759,7 +742,7 @@ { "data": { "text/plain": [ - "'dfs_xgb_with_double_normalization'" + "'double_lstm_with_unstack'" ] }, "execution_count": 19, @@ -787,7 +770,7 @@ { "data": { "text/plain": [ - "0.629513025867624" + "0.6434978589136803" ] }, "execution_count": 20, @@ -817,41 +800,40 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", - "INFO:btb.session:Generating new proposal 
configuration for dfs_xgb_with_double_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n", + "INFO:btb.session:Generating new proposal configuration for lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for double_lstm_with_unstack\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: dfs_xgb_with_unstack_normalization \n", + " Template: double_lstm_with_unstack \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 48\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 130\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 8\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7437898568465957\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9963350624783064\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10\n", - "INFO:btb.session:New optimal found: dfs_xgb_with_unstack_normalization - 0.651642052400304\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_double_normalization\n", - "INFO:btb.session:Generating new proposal configuration for dfs_xgb_with_unstack_normalization\n" + " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 224\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.31140813814002105\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 268\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_2_rate'): 0.607924752631197\n", + "INFO:btb.session:New optimal found: double_lstm_with_unstack - 0.654373123123123\n", + "INFO:btb.session:Generating new proposal configuration for lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for double_lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for double_lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for double_lstm_with_unstack\n", + "INFO:btb.session:Generating new proposal configuration for lstm_with_unstack\n" ] }, { "data": { "text/plain": [ - "{'id': '22ec731234212508b7b4413ccce34294',\n", - " 'name': 'dfs_xgb_with_unstack_normalization',\n", - " 'config': {('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 48,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 130,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 8,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7437898568465957,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9963350624783064,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10},\n", - " 'score': 0.651642052400304}" + "{'id': 'a34709538eddbb67637d57d48237d69d',\n", + " 'name': 'double_lstm_with_unstack',\n", + " 'config': {('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 
224,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1',\n", + " 'dropout_1_rate'): 0.31140813814002105,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 268,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1',\n", + " 'dropout_2_rate'): 0.607924752631197},\n", + " 'score': 0.654373123123123}" ] }, "execution_count": 21, @@ -871,7 +853,7 @@ { "data": { "text/plain": [ - "0.651642052400304" + "0.654373123123123" ] }, "execution_count": 22, @@ -891,13 +873,13 @@ { "data": { "text/plain": [ - "{('mlprimitives.custom.feature_extraction.CategoricalEncoder#1',\n", - " 'max_labels'): 48,\n", - " ('xgboost.XGBClassifier#1', 'n_estimators'): 130,\n", - " ('xgboost.XGBClassifier#1', 'max_depth'): 8,\n", - " ('xgboost.XGBClassifier#1', 'learning_rate'): 0.7437898568465957,\n", - " ('xgboost.XGBClassifier#1', 'gamma'): 0.9963350624783064,\n", - " ('xgboost.XGBClassifier#1', 'min_child_weight'): 10}" + "{('sklearn.impute.SimpleImputer#1', 'strategy'): 'constant',\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 224,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1',\n", + " 'dropout_1_rate'): 0.31140813814002105,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 268,\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1',\n", + " 'dropout_2_rate'): 0.607924752631197}" ] }, "execution_count": 23, @@ -964,7 +946,7 @@ { "data": { "text/plain": [ - "0.608695652173913" + "0.6521739130434783" ] }, "execution_count": 26, @@ -1067,7 +1049,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/tutorials/03_Benchmarking.ipynb b/tutorials/03_Benchmarking.ipynb index 12c5e47..f22224a 100644 --- a/tutorials/03_Benchmarking.ipynb +++ b/tutorials/03_Benchmarking.ipynb @@ -77,7 +77,7 @@ "source": [ "templates = [\n", " 'lstm_prob_with_unstack',\n", - " 'dfs_xgb_prob_with_double_normalization'\n", + " 'double_lstm_prob_with_unstack'\n", "]\n", "window_size_rule = [('1d', '1h'), ('2d', '2h')]\n", "init_params = {\n", @@ -85,6 +85,11 @@ " 'keras.Sequential.LSTMTimeSeriesClassifier#1': {\n", " 'epochs': 1,\n", " }\n", + " },\n", + " 'double_lstm_prob_with_unstack': {\n", + " 'keras.Sequential.DoubleLSTMTimeSeriesClassifier#1': {\n", + " 'epochs': 1,\n", + " }\n", " }\n", "}\n" ] @@ -100,90 +105,58 @@ "name": "stderr", "output_type": "stream", "text": [ - "INFO:draco.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (1d, 1h)\n", + "INFO:draco.benchmark:Evaluating template lstm_prob_with_unstack on problem None (1d, 1h)\n", + "2023-04-07 14:33:33.017625: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", + "2023-04-07 14:33:33.043631: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fc3e937a8e0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", + "2023-04-07 14:33:33.043643: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.unstack_lstm_timeseries_classifier \n", + " Template: lstm_prob_with_unstack \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", + "INFO:draco.benchmark:Evaluating template lstm_prob_with_unstack on problem None (2d, 2h)\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.unstack_lstm_timeseries_classifier \n", - " Hyperparameters: \n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 287\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.565737233372491\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 145\n", - "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.unstack_lstm_timeseries_classifier \n", - " Hyperparameters: \n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 269\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.5973752345055594\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 412\n", - "INFO:draco.benchmark:Evaluating template probability.unstack_lstm_timeseries_classifier on problem None (2d, 2h)\n", - "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.unstack_lstm_timeseries_classifier \n", + " Template: lstm_prob_with_unstack \n", " Hyperparameters: \n", " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 80\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.unstack_lstm_timeseries_classifier \n", + " Template: lstm_prob_with_unstack \n", " Hyperparameters: \n", - " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 114\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.016427744327526084\n", - " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 224\n", - "INFO:draco.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (1d, 1h)\n", - "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.normalize_dfs_xgb_classifier \n", - " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", - " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 100\n", - " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.1\n", - " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.0\n", - " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 1\n", - "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.normalize_dfs_xgb_classifier \n", - " Hyperparameters: \n", - " 
('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 11\n", - " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 231\n", - " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 9\n", - " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.554989010368875\n", - " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.909957492053926\n", - " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 7\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): median\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'lstm_1_units'): 137\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.612475373625103\n", + " ('keras.Sequential.LSTMTimeSeriesClassifier#1', 'dense_1_units'): 191\n", + "INFO:draco.benchmark:Evaluating template double_lstm_prob_with_unstack on problem None (1d, 1h)\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.normalize_dfs_xgb_classifier \n", + " Template: double_lstm_prob_with_unstack \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 61\n", - " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 122\n", - " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 5\n", - " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.6840927016151666\n", - " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.5480298094360865\n", - " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 6\n", - "INFO:draco.benchmark:Evaluating template probability.normalize_dfs_xgb_classifier on problem None (2d, 2h)\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 80\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_2_rate'): 0.3\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.normalize_dfs_xgb_classifier \n", + " Template: double_lstm_prob_with_unstack \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 0\n", - " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 100\n", - " ('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 3\n", - " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.1\n", - " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.0\n", - " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 1\n", + " ('sklearn.impute.SimpleImputer#1', 'strategy'): constant\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 245\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.4308586778212253\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 221\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_2_rate'): 0.5926391753395145\n", + "INFO:draco.benchmark:Evaluating template double_lstm_prob_with_unstack on problem None (2d, 2h)\n", "INFO:draco.pipeline:New configuration found:\n", - " Template: probability.normalize_dfs_xgb_classifier \n", + " Template: double_lstm_prob_with_unstack \n", " Hyperparameters: \n", - " ('mlprimitives.custom.feature_extraction.CategoricalEncoder#1', 'max_labels'): 99\n", - " ('xgboost.XGBClassifier:probabilities#1', 'n_estimators'): 616\n", - " 
('xgboost.XGBClassifier:probabilities#1', 'max_depth'): 8\n", - " ('xgboost.XGBClassifier:probabilities#1', 'learning_rate'): 0.0700166745838724\n", - " ('xgboost.XGBClassifier:probabilities#1', 'gamma'): 0.40990340522001234\n", - " ('xgboost.XGBClassifier:probabilities#1', 'min_child_weight'): 10\n" + " ('sklearn.impute.SimpleImputer#1', 'strategy'): mean\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_1_units'): 80\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_1_rate'): 0.3\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'lstm_2_units'): 80\n", + " ('keras.Sequential.DoubleLSTMTimeSeriesClassifier#1', 'dropout_2_rate'): 0.3\n" ] } ], @@ -252,20 +225,20 @@ " None\n", " 1d\n", " 1h\n", - " probability.unstack_lstm_timeseries_classifier\n", - " 0.350122\n", - " 0.538316\n", - " 0.618558\n", - " 0.463675\n", + " lstm_prob_with_unstack\n", + " 0.494505\n", + " 0.589905\n", + " 0.589905\n", + " 0.322650\n", " roc_auc_score\n", " {'threshold': 0.5}\n", - " 0 days 00:00:04.250012\n", - " 0 days 00:00:14.374875\n", - " 0 days 00:00:15.360015\n", - " 0 days 00:01:10.806375\n", + " 0 days 00:00:03.873157\n", + " 0 days 00:00:14.369536\n", + " 0 days 00:00:08.178422\n", + " 0 days 00:00:47.144655\n", " OK\n", - " 0.640449\n", - " 0.058824\n", + " 0.280899\n", + " 0.255814\n", " 1.0\n", " 0.0\n", " \n", @@ -274,20 +247,20 @@ " None\n", " 2d\n", " 2h\n", - " probability.unstack_lstm_timeseries_classifier\n", - " 0.686203\n", - " 0.491949\n", - " 0.556803\n", - " 0.510989\n", + " lstm_prob_with_unstack\n", + " 0.446581\n", + " 0.543056\n", + " 0.561570\n", + " 0.707875\n", " roc_auc_score\n", " {'threshold': 0.5}\n", - " 0 days 00:00:04.410682\n", - " 0 days 00:00:14.411205\n", - " 0 days 00:00:10.633619\n", - " 0 days 00:00:55.011304\n", + " 0 days 00:00:03.460467\n", + " 0 days 00:00:12.121905\n", + " 0 days 00:00:08.275919\n", + " 0 days 00:00:44.449291\n", " OK\n", - " 0.595506\n", - " 0.307692\n", + " 0.730337\n", + " 0.586207\n", " 1.0\n", " 0.0\n", " \n", @@ -296,20 +269,20 @@ " None\n", " 1d\n", " 1h\n", - " probability.normalize_dfs_xgb_classifier\n", - " 0.697802\n", - " 0.669508\n", - " 0.701792\n", - " 0.766789\n", + " double_lstm_prob_with_unstack\n", + " 0.813187\n", + " 0.307993\n", + " 0.592696\n", + " 0.417582\n", " roc_auc_score\n", " {'threshold': 0.5}\n", - " 0 days 00:01:11.416859\n", - " 0 days 00:02:55.012078\n", - " 0 days 00:00:00.806430\n", - " 0 days 00:05:20.653100\n", + " 0 days 00:00:05.460985\n", + " 0 days 00:00:18.103660\n", + " 0 days 00:00:14.011877\n", + " 0 days 00:01:11.192546\n", " OK\n", - " 0.797753\n", - " 0.666667\n", + " 0.303371\n", + " 0.367347\n", " 1.0\n", " 0.0\n", " \n", @@ -318,20 +291,20 @@ " None\n", " 2d\n", " 2h\n", - " probability.normalize_dfs_xgb_classifier\n", - " 0.720391\n", - " 0.718617\n", - " 0.740664\n", - " 0.782662\n", + " double_lstm_prob_with_unstack\n", + " 0.245726\n", + " 0.663919\n", + " 0.663919\n", + " 0.293346\n", " roc_auc_score\n", " {'threshold': 0.5}\n", - " 0 days 00:01:03.612676\n", - " 0 days 00:02:26.925796\n", - " 0 days 00:00:00.755424\n", - " 0 days 00:04:37.570182\n", + " 0 days 00:00:05.568835\n", + " 0 days 00:00:17.948361\n", + " 0 days 00:00:14.003816\n", + " 0 days 00:01:11.051792\n", " OK\n", - " 0.820225\n", - " 0.692308\n", + " 0.303371\n", + " 0.184211\n", " 1.0\n", " 0.0\n", " \n", @@ -340,41 +313,41 @@ "" ], "text/plain": [ - " problem_name window_size resample_rule \\\n", - "0 None 1d 1h \n", - "1 None 2d 2h \n", - "2 None 1d 1h \n", - "3 None 2d 
2h \n", + " problem_name window_size resample_rule template \\\n", + "0 None 1d 1h lstm_prob_with_unstack \n", + "1 None 2d 2h lstm_prob_with_unstack \n", + "2 None 1d 1h double_lstm_prob_with_unstack \n", + "3 None 2d 2h double_lstm_prob_with_unstack \n", "\n", - " template default_test default_cv \\\n", - "0 probability.unstack_lstm_timeseries_classifier 0.350122 0.538316 \n", - "1 probability.unstack_lstm_timeseries_classifier 0.686203 0.491949 \n", - "2 probability.normalize_dfs_xgb_classifier 0.697802 0.669508 \n", - "3 probability.normalize_dfs_xgb_classifier 0.720391 0.718617 \n", + " default_test default_cv tuned_cv tuned_test tuning_metric \\\n", + "0 0.494505 0.589905 0.589905 0.322650 roc_auc_score \n", + "1 0.446581 0.543056 0.561570 0.707875 roc_auc_score \n", + "2 0.813187 0.307993 0.592696 0.417582 roc_auc_score \n", + "3 0.245726 0.663919 0.663919 0.293346 roc_auc_score \n", "\n", - " tuned_cv tuned_test tuning_metric tuning_metric_kwargs \\\n", - "0 0.618558 0.463675 roc_auc_score {'threshold': 0.5} \n", - "1 0.556803 0.510989 roc_auc_score {'threshold': 0.5} \n", - "2 0.701792 0.766789 roc_auc_score {'threshold': 0.5} \n", - "3 0.740664 0.782662 roc_auc_score {'threshold': 0.5} \n", + " tuning_metric_kwargs fit_predict_time default_cv_time \\\n", + "0 {'threshold': 0.5} 0 days 00:00:03.873157 0 days 00:00:14.369536 \n", + "1 {'threshold': 0.5} 0 days 00:00:03.460467 0 days 00:00:12.121905 \n", + "2 {'threshold': 0.5} 0 days 00:00:05.460985 0 days 00:00:18.103660 \n", + "3 {'threshold': 0.5} 0 days 00:00:05.568835 0 days 00:00:17.948361 \n", "\n", - " fit_predict_time default_cv_time average_cv_time \\\n", - "0 0 days 00:00:04.250012 0 days 00:00:14.374875 0 days 00:00:15.360015 \n", - "1 0 days 00:00:04.410682 0 days 00:00:14.411205 0 days 00:00:10.633619 \n", - "2 0 days 00:01:11.416859 0 days 00:02:55.012078 0 days 00:00:00.806430 \n", - "3 0 days 00:01:03.612676 0 days 00:02:26.925796 0 days 00:00:00.755424 \n", + " average_cv_time total_time status \\\n", + "0 0 days 00:00:08.178422 0 days 00:00:47.144655 OK \n", + "1 0 days 00:00:08.275919 0 days 00:00:44.449291 OK \n", + "2 0 days 00:00:14.011877 0 days 00:01:11.192546 OK \n", + "3 0 days 00:00:14.003816 0 days 00:01:11.051792 OK \n", "\n", - " total_time status accuracy_threshold/0.5 f1_threshold/0.5 \\\n", - "0 0 days 00:01:10.806375 OK 0.640449 0.058824 \n", - "1 0 days 00:00:55.011304 OK 0.595506 0.307692 \n", - "2 0 days 00:05:20.653100 OK 0.797753 0.666667 \n", - "3 0 days 00:04:37.570182 OK 0.820225 0.692308 \n", + " accuracy_threshold/0.5 f1_threshold/0.5 fpr_threshold/0.5 \\\n", + "0 0.280899 0.255814 1.0 \n", + "1 0.730337 0.586207 1.0 \n", + "2 0.303371 0.367347 1.0 \n", + "3 0.303371 0.184211 1.0 \n", "\n", - " fpr_threshold/0.5 tpr_threshold/0.5 \n", - "0 1.0 0.0 \n", - "1 1.0 0.0 \n", - "2 1.0 0.0 \n", - "3 1.0 0.0 " + " tpr_threshold/0.5 \n", + "0 0.0 \n", + "1 0.0 \n", + "2 0.0 \n", + "3 0.0 " ] }, "execution_count": 4, @@ -403,7 +376,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/tutorials/04_Draco_Regression_Pipeline.ipynb b/tutorials/04_Draco_Regression_Pipeline.ipynb index 709c839..4d241ca 100644 --- a/tutorials/04_Draco_Regression_Pipeline.ipynb +++ b/tutorials/04_Draco_Regression_Pipeline.ipynb @@ -515,14 +515,7 @@ { "data": { "text/plain": [ - "['dfs_xgb_prob_with_unstack',\n", - " 'dfs_xgb_with_normalization',\n", - " 'dfs_xgb',\n", - " 'dfs_xgb_with_unstack',\n", - " 
'dfs_xgb_prob_with_unstack_normalization',\n", - " 'dfs_xgb_with_unstack_normalization',\n", - " 'dfs_xgb_prob_with_double_normalization',\n", - " 'dfs_xgb_with_double_normalization',\n", + "['dummy',\n", " 'lstm_regressor_with_unstack',\n", " 'lstm_regressor',\n", " 'double_lstm_prob_with_unstack',\n", @@ -615,7 +608,13 @@ "source": [ "from draco.pipeline import DracoPipeline\n", "\n", - "pipeline = DracoPipeline(pipeline_name)" + "init_params = {\n", + " \"keras.Sequential.LSTMTimeSeriesRegressor#1\": {\n", + " \"epochs\": 10\n", + " }\n", + "}\n", + "\n", + "pipeline = DracoPipeline(pipeline_name, init_params=init_params)" ] }, { @@ -634,10 +633,35 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-02-01 15:05:13.365367: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-02-01 15:05:13.379993: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fe6a0ec50a0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", - "2022-02-01 15:05:13.380010: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + "2023-04-07 16:46:35.571262: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", + "2023-04-07 16:46:35.594871: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7ff23c392800 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", + "2023-04-07 16:46:35.594885: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "227/227 [==============================] - 6s 28ms/step - loss: 9064.8613 - mse: 9064.8613 - val_loss: 11566.7559 - val_mse: 11566.7559\n", + "Epoch 2/10\n", + "227/227 [==============================] - 6s 27ms/step - loss: 6775.8911 - mse: 6775.8911 - val_loss: 9392.9561 - val_mse: 9392.9561\n", + "Epoch 3/10\n", + "227/227 [==============================] - 6s 27ms/step - loss: 5391.6719 - mse: 5391.6719 - val_loss: 7923.1221 - val_mse: 7923.1221\n", + "Epoch 4/10\n", + "227/227 [==============================] - 6s 28ms/step - loss: 4524.3457 - mse: 4524.3457 - val_loss: 6955.8647 - val_mse: 6955.8647\n", + "Epoch 5/10\n", + "227/227 [==============================] - 7s 33ms/step - loss: 4040.5396 - mse: 4040.5396 - val_loss: 6356.0605 - val_mse: 6356.0605\n", + "Epoch 6/10\n", + "227/227 [==============================] - 6s 28ms/step - loss: 3802.5298 - mse: 3802.5298 - val_loss: 5998.2061 - val_mse: 5998.2061\n", + "Epoch 7/10\n", + "227/227 [==============================] - 7s 30ms/step - loss: 3683.9429 - mse: 3683.9429 - val_loss: 5790.9092 - val_mse: 5790.9092\n", + "Epoch 8/10\n", + "227/227 [==============================] - 7s 33ms/step - loss: 3636.9177 - mse: 3636.9177 - val_loss: 5674.6558 - val_mse: 5674.6558\n", + "Epoch 9/10\n", + "227/227 [==============================] - 7s 30ms/step - loss: 3609.4973 - mse: 3609.4973 - val_loss: 5619.3926 - val_mse: 5619.3926\n", + "Epoch 10/10\n", + "227/227 [==============================] - 7s 29ms/step - loss: 3617.7119 - mse: 3617.7119 - val_loss: 
5587.2671 - val_mse: 5587.2671\n" ] } ], @@ -658,7 +682,15 @@ "cell_type": "code", "execution_count": 17, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2/2 [==============================] - 0s 3ms/step\n" + ] + } + ], "source": [ "predictions = pipeline.predict(test_target_times, readings)" ] @@ -678,7 +710,7 @@ { "data": { "text/plain": [ - "0.6362969806460871" + "-0.1533211964451806" ] }, "execution_count": 18, @@ -748,14 +780,21 @@ "execution_count": 21, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2/2 [==============================] - 0s 5ms/step\n" + ] + }, { "data": { "text/plain": [ - "array([[129.89064 ],\n", - " [139.89001 ],\n", - " [ 39.425865],\n", - " [110.67838 ],\n", - " [ 98.52903 ]], dtype=float32)" + "array([[91.7917 ],\n", + " [91.791695],\n", + " [91.79166 ],\n", + " [91.79167 ],\n", + " [91.79167 ]], dtype=float32)" ] }, "execution_count": 21, @@ -785,7 +824,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb b/tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb deleted file mode 100644 index 6fd5f1e..0000000 --- a/tutorials/pipelines/dfs_xgb_with_double_normalization.ipynb +++ /dev/null @@ -1,1363 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# dfs_xgb_with_double_normalization" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from draco.demo import load_demo\n", - "\n", - "target_times, readings = load_demo()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_name = 'dfs_xgb_with_double_normalization'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/dask/dataframe/utils.py:14: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", - " import pandas.util.testing as tm\n" - ] - } - ], - "source": [ - "from draco.pipeline import DracoPipeline\n", - "\n", - "pipeline = DracoPipeline(pipeline_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['mlblocks.MLPipeline',\n", - " 'featuretools.dfs',\n", - " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", - " 'xgboost.XGBClassifier']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.template['primitives']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Step by Step execution" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input Data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 323.0\n", - "1 T001 S02 2013-01-10 320.0\n", - "2 T001 S03 2013-01-10 284.0\n", - "3 T001 S04 2013-01-10 348.0\n", - "4 T001 S05 2013-01-10 273.0" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", - "
" - ], - "text/plain": [ - " turbine_id cutoff_time target\n", - "0 T001 2013-01-12 0\n", - "1 T001 2013-01-13 0\n", - "2 T001 2013-01-14 0\n", - "3 T001 2013-01-15 1\n", - "4 T001 2013-01-16 0" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation (part of Draco Pipeline)\n", - "\n", - "* Input: target_times, readings, turbines\n", - "* Output: X, y, readings, turbines\n", - "* Effect: target_times has been split into X and y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlblocks.MLPipeline\n", - "\n", - "### pandas.DataFrame.resample\n", - "\n", - "* Input: readings\n", - "* Output: readings (resampled)\n", - "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index\n", - "\n", - "### featuretools.EntitySet.entity_from_dataframe\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: entityset\n", - "* Effect: Entityset has been generated from readings\n", - "\n", - "### featuretools.EntitySet.normalize_entity\n", - "\n", - "* Input: entityset\n", - "* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)\n", - "* Effect: establish relation between readings and turbines\n", - "\n", - "### featuretools.EntitySet.normalize_entity\n", - "\n", - "* Input: entityset\n", - "* Output: entityset with relationship (readings.signal_id with signals.signal_id)\n", - "* Effect: establish relationship between readings and signals" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "step = 0\n", - "context = pipeline.fit(target_times, readings, output_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: entityset\n", - " Entities:\n", - " readings [Rows: 1329146, Columns: 5]\n", - " turbines [Rows: 1, Columns: 1]\n", - " signals [Rows: 26, Columns: 1]\n", - " Relationships:\n", - " readings.turbine_id -> turbines.turbine_id\n", - " readings.signal_id -> signals.signal_id" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['entityset']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## featuretools.dfs\n", - "\n", - "* Input: entityset (unstacked, no turbine_id, no timestamp)\n", - "* Output: X (has additional features)\n", - "* Effect: build features for relational dataset using DFS" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "step = 1\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": 
"code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SUM(readings.value)STD(readings.value)MAX(readings.value)SKEW(readings.value)MIN(readings.value)MEAN(readings.value)COUNT(readings)NUM_UNIQUE(readings.signal_id)MODE(readings.signal_id)NUM_UNIQUE(readings.DAY(timestamp))...MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp)))MEAN(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))MODE(readings.signals.MODE(readings.DAY(timestamp)))MODE(readings.signals.MODE(readings.YEAR(timestamp)))MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))MODE(readings.signals.MODE(readings.MONTH(timestamp)))
turbine_id
T0013.433649e+091.456860e+063448719.01.0192140.0917107.079193374426S012...2.01.0111111201341
T0013.441489e+091.459865e+063453777.01.0187610.0919201.162179374426S012...2.01.0111112201351
T0013.455470e+091.465277e+063463880.01.0181922.7922935.352244374426S012...2.01.0111113201361
T0013.475361e+091.473337e+063474703.01.017666-1.0928248.092869374426S012...2.01.0111114201301
T0012.888083e+091.477958e+063485019.01.0320020.0924186.531200374426S012...2.01.0111115201311
\n", - "

5 rows × 99 columns

\n", - "
" - ], - "text/plain": [ - " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", - "turbine_id \n", - "T001 3.433649e+09 1.456860e+06 3448719.0 \n", - "T001 3.441489e+09 1.459865e+06 3453777.0 \n", - "T001 3.455470e+09 1.465277e+06 3463880.0 \n", - "T001 3.475361e+09 1.473337e+06 3474703.0 \n", - "T001 2.888083e+09 1.477958e+06 3485019.0 \n", - "\n", - " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", - "turbine_id \n", - "T001 1.019214 0.0 917107.079193 \n", - "T001 1.018761 0.0 919201.162179 \n", - "T001 1.018192 2.7 922935.352244 \n", - "T001 1.017666 -1.0 928248.092869 \n", - "T001 1.032002 0.0 924186.531200 \n", - "\n", - " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", - "turbine_id \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "\n", - " MODE(readings.signal_id) NUM_UNIQUE(readings.DAY(timestamp)) ... \\\n", - "turbine_id ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "T001 S01 2 ... \n", - "\n", - " MEAN(readings.signals.NUM_UNIQUE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 2.0 \n", - "T001 2.0 \n", - "T001 2.0 \n", - "T001 2.0 \n", - "T001 2.0 \n", - "\n", - " MEAN(readings.signals.NUM_UNIQUE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 1.0 \n", - "T001 1.0 \n", - "T001 1.0 \n", - "T001 1.0 \n", - "T001 1.0 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 11 \n", - "T001 12 \n", - "T001 13 \n", - "T001 14 \n", - "T001 15 \n", - "\n", - " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "\n", - " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 4 \n", - "T001 5 \n", - "T001 6 \n", - "T001 0 \n", - "T001 1 \n", - "\n", - " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - "[5 rows x 99 columns]" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "99" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# features generated (the turbine_id is set as index).\n", - "len(context['X'].columns)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 323.0\n", - "1 T001 S02 2013-01-10 320.0\n", - "2 T001 S03 2013-01-10 284.0\n", - "3 T001 S04 2013-01-10 348.0\n", - "4 T001 S05 2013-01-10 273.0" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlprimitives.custom.feature_extraction.CategoricalEncoder\n", - "\n", - "* Input: X\n", - "* Output: X (label encoded)\n", - "* Effect: encodes categorical features using OneHotLabelEncoder" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "step = 2\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SUM(readings.value)STD(readings.value)MAX(readings.value)SKEW(readings.value)MIN(readings.value)MEAN(readings.value)COUNT(readings)NUM_UNIQUE(readings.signal_id)NUM_UNIQUE(readings.DAY(timestamp))NUM_UNIQUE(readings.MONTH(timestamp))...NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp)))NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp)))MODE(readings.signals.MODE(readings.DAY(timestamp)))MODE(readings.signals.MODE(readings.YEAR(timestamp)))MODE(readings.signals.MODE(readings.WEEKDAY(timestamp)))MODE(readings.signals.MODE(readings.MONTH(timestamp)))MODE(readings.signal_id)=S01MODE(readings.signals.MODE(readings.turbine_id))=T001
turbine_id
T0013.433649e+091.456860e+063448719.01.0192140.0917107.07919337442621...11111120134111
T0013.441489e+091.459865e+063453777.01.0187610.0919201.16217937442621...11111220135111
T0013.455470e+091.465277e+063463880.01.0181922.7922935.35224437442621...11111320136111
T0013.475361e+091.473337e+063474703.01.017666-1.0928248.09286937442621...11111420130111
T0012.888083e+091.477958e+063485019.01.0320020.0924186.53120037442621...11111520131111
\n", - "

5 rows × 99 columns

\n", - "
" - ], - "text/plain": [ - " SUM(readings.value) STD(readings.value) MAX(readings.value) \\\n", - "turbine_id \n", - "T001 3.433649e+09 1.456860e+06 3448719.0 \n", - "T001 3.441489e+09 1.459865e+06 3453777.0 \n", - "T001 3.455470e+09 1.465277e+06 3463880.0 \n", - "T001 3.475361e+09 1.473337e+06 3474703.0 \n", - "T001 2.888083e+09 1.477958e+06 3485019.0 \n", - "\n", - " SKEW(readings.value) MIN(readings.value) MEAN(readings.value) \\\n", - "turbine_id \n", - "T001 1.019214 0.0 917107.079193 \n", - "T001 1.018761 0.0 919201.162179 \n", - "T001 1.018192 2.7 922935.352244 \n", - "T001 1.017666 -1.0 928248.092869 \n", - "T001 1.032002 0.0 924186.531200 \n", - "\n", - " COUNT(readings) NUM_UNIQUE(readings.signal_id) \\\n", - "turbine_id \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "T001 3744 26 \n", - "\n", - " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "\n", - " NUM_UNIQUE(readings.MONTH(timestamp)) ... \\\n", - "turbine_id ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "T001 1 ... \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " MODE(readings.signals.MODE(readings.DAY(timestamp))) \\\n", - "turbine_id \n", - "T001 11 \n", - "T001 12 \n", - "T001 13 \n", - "T001 14 \n", - "T001 15 \n", - "\n", - " MODE(readings.signals.MODE(readings.YEAR(timestamp))) \\\n", - "turbine_id \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "T001 2013 \n", - "\n", - " MODE(readings.signals.MODE(readings.WEEKDAY(timestamp))) \\\n", - "turbine_id \n", - "T001 4 \n", - "T001 5 \n", - "T001 6 \n", - "T001 0 \n", - "T001 1 \n", - "\n", - " MODE(readings.signals.MODE(readings.MONTH(timestamp))) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " MODE(readings.signal_id)=S01 \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " MODE(readings.signals.MODE(readings.turbine_id))=T001 \n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - "[5 rows x 99 columns]" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 323.0\n", - "1 T001 S02 2013-01-10 320.0\n", - "2 T001 S03 2013-01-10 284.0\n", - "3 T001 S04 2013-01-10 348.0\n", - "4 T001 S05 2013-01-10 273.0" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## xgboost.XGBClassifier\n", - "\n", - "* Input: X (label encoded and featurized)\n", - "* Output: None\n", - "* Effect: trained model" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "step = 3\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb b/tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb deleted file mode 100644 index 5731706..0000000 --- a/tutorials/pipelines/dfs_xgb_with_unstack_normalization.ipynb +++ /dev/null @@ -1,1611 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# dfs_xgb_with_unstack_normalization" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from draco.demo import load_demo\n", - "\n", - "target_times, readings = load_demo()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "pipeline_name = 'dfs_xgb_with_unstack_normalization'" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/sarah/opt/anaconda3/envs/draco/lib/python3.7/site-packages/dask/dataframe/utils.py:14: FutureWarning: pandas.util.testing is deprecated. 
Use the functions in the public API at pandas.testing instead.\n", - " import pandas.util.testing as tm\n" - ] - } - ], - "source": [ - "from draco.pipeline import DracoPipeline\n", - "\n", - "pipeline = DracoPipeline(pipeline_name)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['mlblocks.MLPipeline',\n", - " 'mlblocks.MLPipeline',\n", - " 'featuretools.dfs',\n", - " 'mlprimitives.custom.feature_extraction.CategoricalEncoder',\n", - " 'xgboost.XGBClassifier']" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pipeline.template['primitives']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Step by Step execution" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Input Data" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idsignal_idtimestampvalue
0T001S012013-01-10323.0
1T001S022013-01-10320.0
2T001S032013-01-10284.0
3T001S042013-01-10348.0
4T001S052013-01-10273.0
\n", - "
" - ], - "text/plain": [ - " turbine_id signal_id timestamp value\n", - "0 T001 S01 2013-01-10 323.0\n", - "1 T001 S02 2013-01-10 320.0\n", - "2 T001 S03 2013-01-10 284.0\n", - "3 T001 S04 2013-01-10 348.0\n", - "4 T001 S05 2013-01-10 273.0" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "readings.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idcutoff_timetarget
0T0012013-01-120
1T0012013-01-130
2T0012013-01-140
3T0012013-01-151
4T0012013-01-160
\n", - "
" - ], - "text/plain": [ - " turbine_id cutoff_time target\n", - "0 T001 2013-01-12 0\n", - "1 T001 2013-01-13 0\n", - "2 T001 2013-01-14 0\n", - "3 T001 2013-01-15 1\n", - "4 T001 2013-01-16 0" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "target_times.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation (part of Draco Pipeline)\n", - "\n", - "* Input: target_times, readings, turbines\n", - "* Output: X, y, readings, turbines\n", - "* Effect: target_times has been split into X and y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlblocks.MLPipeline 1\n", - "\n", - "### pandas.DataFrame.resample\n", - "\n", - "* Input: readings\n", - "* Output: readings (resampled)\n", - "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index\n", - " \n", - "### pandas.DataFrame.unstack\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: readings (unstacked)\n", - "* Effect: readings have been unstacked" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "step = 0\n", - "context = pipeline.fit(target_times, readings, output_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y'])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0...11.73131020.055.055.047.058.045.058.047.0356.0
1T0012013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0...10.23131420.058.063.062.067.055.061.042.0400.0
2T0012013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0...9.53131822.068.061.067.066.046.055.045.0402.0
3T0012013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0...10.53132179.043.051.053.062.053.060.045.0357.0
4T0012013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", - "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", - "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", - "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", - "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", - "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", - "\n", - " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", - "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", - "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", - "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", - "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", - "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", - "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", - "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlblocks.MLPipeline 2\n", - "\n", - "### featuretools.EntitySet.entity_from_dataframe\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: entityset\n", - "* Effect: Entityset has been generated from readings\n", - "\n", - "### featuretools.EntitySet.normalize_entity\n", - "\n", - "* Input: entityset\n", - "* Output: entityset with relationship (readings.turbine_id with turbines.turbine_id)\n", - "* Effect: establish relation between readings and turbines" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "step = 1\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'X', 'y', 'entityset'])" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Entityset: entityset\n", - " Entities:\n", - " readings [Rows: 51121, Columns: 29]\n", - " turbines [Rows: 1, Columns: 1]\n", - " Relationships:\n", - " readings.turbine_id -> turbines.turbine_id" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['entityset']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## featuretools.dfs\n", - "\n", - "* Input: entityset (unstacked, no turbine_id, no timestamp)\n", - "* Output: X (has additional features)\n", - "* Effect: build features for relational dataset using DFS" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "step = 2\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" - ] - }, - 
"execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys() " - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SUM(readings.value_S14)SUM(readings.value_S11)SUM(readings.value_S25)SUM(readings.value_S23)SUM(readings.value_S17)SUM(readings.value_S19)SUM(readings.value_S04)SUM(readings.value_S05)SUM(readings.value_S21)SUM(readings.value_S16)...MEAN(readings.value_S20)COUNT(readings)NUM_UNIQUE(readings.YEAR(timestamp))NUM_UNIQUE(readings.MONTH(timestamp))NUM_UNIQUE(readings.DAY(timestamp))NUM_UNIQUE(readings.WEEKDAY(timestamp))MODE(readings.YEAR(timestamp))MODE(readings.MONTH(timestamp))MODE(readings.DAY(timestamp))MODE(readings.WEEKDAY(timestamp))
turbine_id
T001465421817.0496362516.02743.02780.0994.63174.019412.017083.03061.0550.4...22.326389144112220131114
T001465897578.0496952628.04237.04640.01166.75112.038289.034344.04919.0713.7...35.166667144112220131125
T001466806830.0498019072.09008.09179.01581.79134.086707.078749.09863.0916.3...53.381944144112220131136
T001468250434.0499530451.010073.010310.01690.910674.087907.083264.010638.0970.6...61.423611144112220131140
T001371675934.0400196323.07381.08228.01666.08831.068811.064088.08629.0948.8...87.575221144112220131151
\n", - "

5 rows × 165 columns

\n", - "
" - ], - "text/plain": [ - " SUM(readings.value_S14) SUM(readings.value_S11) \\\n", - "turbine_id \n", - "T001 465421817.0 496362516.0 \n", - "T001 465897578.0 496952628.0 \n", - "T001 466806830.0 498019072.0 \n", - "T001 468250434.0 499530451.0 \n", - "T001 371675934.0 400196323.0 \n", - "\n", - " SUM(readings.value_S25) SUM(readings.value_S23) \\\n", - "turbine_id \n", - "T001 2743.0 2780.0 \n", - "T001 4237.0 4640.0 \n", - "T001 9008.0 9179.0 \n", - "T001 10073.0 10310.0 \n", - "T001 7381.0 8228.0 \n", - "\n", - " SUM(readings.value_S17) SUM(readings.value_S19) \\\n", - "turbine_id \n", - "T001 994.6 3174.0 \n", - "T001 1166.7 5112.0 \n", - "T001 1581.7 9134.0 \n", - "T001 1690.9 10674.0 \n", - "T001 1666.0 8831.0 \n", - "\n", - " SUM(readings.value_S04) SUM(readings.value_S05) \\\n", - "turbine_id \n", - "T001 19412.0 17083.0 \n", - "T001 38289.0 34344.0 \n", - "T001 86707.0 78749.0 \n", - "T001 87907.0 83264.0 \n", - "T001 68811.0 64088.0 \n", - "\n", - " SUM(readings.value_S21) SUM(readings.value_S16) ... \\\n", - "turbine_id ... \n", - "T001 3061.0 550.4 ... \n", - "T001 4919.0 713.7 ... \n", - "T001 9863.0 916.3 ... \n", - "T001 10638.0 970.6 ... \n", - "T001 8629.0 948.8 ... \n", - "\n", - " MEAN(readings.value_S20) COUNT(readings) \\\n", - "turbine_id \n", - "T001 22.326389 144 \n", - "T001 35.166667 144 \n", - "T001 53.381944 144 \n", - "T001 61.423611 144 \n", - "T001 87.575221 144 \n", - "\n", - " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "\n", - " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "\n", - " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "\n", - " MODE(readings.DAY(timestamp)) MODE(readings.WEEKDAY(timestamp)) \n", - "turbine_id \n", - "T001 11 4 \n", - "T001 12 5 \n", - "T001 13 6 \n", - "T001 14 0 \n", - "T001 15 1 \n", - "\n", - "[5 rows x 165 columns]" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "165" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# features generated (the turbine_id is set as index).\n", - "len(context['X'].columns)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## mlprimitives.custom.feature_extraction.CategoricalEncoder\n", - "\n", - "* Input: X\n", - "* Output: X (label encoded)\n", - "* Effect: encodes categorical features using OneHotLabelEncoder" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "step = 3\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 
'entityset', 'X', 'y'])" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
SUM(readings.value_S14)SUM(readings.value_S11)SUM(readings.value_S25)SUM(readings.value_S23)SUM(readings.value_S17)SUM(readings.value_S19)SUM(readings.value_S04)SUM(readings.value_S05)SUM(readings.value_S21)SUM(readings.value_S16)...MEAN(readings.value_S20)COUNT(readings)NUM_UNIQUE(readings.YEAR(timestamp))NUM_UNIQUE(readings.MONTH(timestamp))NUM_UNIQUE(readings.DAY(timestamp))NUM_UNIQUE(readings.WEEKDAY(timestamp))MODE(readings.YEAR(timestamp))MODE(readings.MONTH(timestamp))MODE(readings.DAY(timestamp))MODE(readings.WEEKDAY(timestamp))
turbine_id
T001465421817.0496362516.02743.02780.0994.63174.019412.017083.03061.0550.4...22.326389144112220131114
T001465897578.0496952628.04237.04640.01166.75112.038289.034344.04919.0713.7...35.166667144112220131125
T001466806830.0498019072.09008.09179.01581.79134.086707.078749.09863.0916.3...53.381944144112220131136
T001468250434.0499530451.010073.010310.01690.910674.087907.083264.010638.0970.6...61.423611144112220131140
T001371675934.0400196323.07381.08228.01666.08831.068811.064088.08629.0948.8...87.575221144112220131151
\n", - "

5 rows × 165 columns

\n", - "
" - ], - "text/plain": [ - " SUM(readings.value_S14) SUM(readings.value_S11) \\\n", - "turbine_id \n", - "T001 465421817.0 496362516.0 \n", - "T001 465897578.0 496952628.0 \n", - "T001 466806830.0 498019072.0 \n", - "T001 468250434.0 499530451.0 \n", - "T001 371675934.0 400196323.0 \n", - "\n", - " SUM(readings.value_S25) SUM(readings.value_S23) \\\n", - "turbine_id \n", - "T001 2743.0 2780.0 \n", - "T001 4237.0 4640.0 \n", - "T001 9008.0 9179.0 \n", - "T001 10073.0 10310.0 \n", - "T001 7381.0 8228.0 \n", - "\n", - " SUM(readings.value_S17) SUM(readings.value_S19) \\\n", - "turbine_id \n", - "T001 994.6 3174.0 \n", - "T001 1166.7 5112.0 \n", - "T001 1581.7 9134.0 \n", - "T001 1690.9 10674.0 \n", - "T001 1666.0 8831.0 \n", - "\n", - " SUM(readings.value_S04) SUM(readings.value_S05) \\\n", - "turbine_id \n", - "T001 19412.0 17083.0 \n", - "T001 38289.0 34344.0 \n", - "T001 86707.0 78749.0 \n", - "T001 87907.0 83264.0 \n", - "T001 68811.0 64088.0 \n", - "\n", - " SUM(readings.value_S21) SUM(readings.value_S16) ... \\\n", - "turbine_id ... \n", - "T001 3061.0 550.4 ... \n", - "T001 4919.0 713.7 ... \n", - "T001 9863.0 916.3 ... \n", - "T001 10638.0 970.6 ... \n", - "T001 8629.0 948.8 ... \n", - "\n", - " MEAN(readings.value_S20) COUNT(readings) \\\n", - "turbine_id \n", - "T001 22.326389 144 \n", - "T001 35.166667 144 \n", - "T001 53.381944 144 \n", - "T001 61.423611 144 \n", - "T001 87.575221 144 \n", - "\n", - " NUM_UNIQUE(readings.YEAR(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "T001 1 \n", - "\n", - " NUM_UNIQUE(readings.DAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "\n", - " NUM_UNIQUE(readings.WEEKDAY(timestamp)) \\\n", - "turbine_id \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "T001 2 \n", - "\n", - " MODE(readings.YEAR(timestamp)) MODE(readings.MONTH(timestamp)) \\\n", - "turbine_id \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "T001 2013 1 \n", - "\n", - " MODE(readings.DAY(timestamp)) MODE(readings.WEEKDAY(timestamp)) \n", - "turbine_id \n", - "T001 11 4 \n", - "T001 12 5 \n", - "T001 13 6 \n", - "T001 14 0 \n", - "T001 15 1 \n", - "\n", - "[5 rows x 165 columns]" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['X'].head()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
turbine_idtimestampvalue_S01value_S02value_S03value_S04value_S05value_S06value_S07value_S08...value_S17value_S18value_S19value_S20value_S21value_S22value_S23value_S24value_S25value_S26
0T0012013-01-10 00:00:00323.0320.0284.0348.0273.0342.0280.03197842.0...11.73131020.055.055.047.058.045.058.047.0356.0
1T0012013-01-10 00:10:00346.0384.0367.0411.0331.0360.0249.03197900.0...10.23131420.058.063.062.067.055.061.042.0400.0
2T0012013-01-10 00:20:00407.0363.0407.0393.0275.0335.0270.03197968.0...9.53131822.068.061.067.066.046.055.045.0402.0
3T0012013-01-10 00:30:00257.0307.0315.0361.0317.0354.0271.03198011.0...10.53132179.043.051.053.062.053.060.045.0357.0
4T0012013-01-10 00:40:00267.0309.0314.0355.0262.0246.0212.03198056.0...9.63132501.045.051.054.059.043.041.036.0322.0
\n", - "

5 rows × 28 columns

\n", - "
" - ], - "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", - "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", - "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", - "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", - "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", - "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", - "\n", - " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", - "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", - "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", - "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", - "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", - "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", - "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", - "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", - "\n", - "[5 rows x 28 columns]" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context['readings'].head()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## xgboost.XGBClassifier\n", - "\n", - "* Input: X (label encoded and featurized)\n", - "* Output: None\n", - "* Effect: trained model" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "step = 4\n", - "context = pipeline.fit(**context, output_=step, start_=step)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "dict_keys(['readings', 'turbines', 'entityset', 'X', 'y'])" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "context.keys()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.11" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From 8a580338edf93825dd9c6688025f7dc5a0482c4c Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Sun, 23 Apr 2023 19:24:19 -0400 Subject: [PATCH 164/171] Migrate mlstars (#75) * remove dfs pipelines * change mlstars * change mlprimitives to mlstars * fix tutorials * update mlstars version --- draco/pipeline.py | 2 +- draco/pipelines/double_lstm/double_lstm.json | 6 +- .../double_lstm/double_lstm_prob.json | 6 +- .../double_lstm_prob_with_unstack.json | 6 +- .../double_lstm/double_lstm_with_unstack.json | 6 +- draco/pipelines/dummy/dummy.json | 10 +- draco/pipelines/lstm/lstm.json | 6 +- draco/pipelines/lstm/lstm_prob.json | 6 +- .../lstm/lstm_prob_with_unstack.json | 6 +- draco/pipelines/lstm/lstm_with_unstack.json | 6 +- .../lstm_regressor/lstm_regressor.json | 6 +- .../lstm_regressor_with_unstack.json | 6 +- setup.py | 5 +- tests/test_pipeline.py | 8 + .../pipelines/double_lstm_with_unstack.ipynb | 2097 +++++++------- .../lstm_regressor_with_unstack.ipynb | 2416 
++++++++--------- tutorials/pipelines/lstm_with_unstack.ipynb | 1845 +++++++------ 17 files changed, 3331 insertions(+), 3112 deletions(-) diff --git a/draco/pipeline.py b/draco/pipeline.py index 6a9adf6..fee44aa 100644 --- a/draco/pipeline.py +++ b/draco/pipeline.py @@ -15,7 +15,7 @@ from btb.tuning import Tunable from mlblocks import MLPipeline from mlblocks.discovery import load_pipeline -from mlprimitives.adapters.keras import Sequential +from mlstars.adapters.keras import Sequential from sklearn.exceptions import NotFittedError from sklearn.model_selection import KFold, StratifiedKFold diff --git a/draco/pipelines/double_lstm/double_lstm.json b/draco/pipelines/double_lstm/double_lstm.json index e3be8a5..8e5e4fd 100644 --- a/draco/pipelines/double_lstm/double_lstm.json +++ b/draco/pipelines/double_lstm/double_lstm.json @@ -7,7 +7,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.DoubleLSTMTimeSeriesClassifier" ], "init_params": { @@ -33,7 +33,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -67,7 +67,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/double_lstm/double_lstm_prob.json b/draco/pipelines/double_lstm/double_lstm_prob.json index a118af0..0a20648 100644 --- a/draco/pipelines/double_lstm/double_lstm_prob.json +++ b/draco/pipelines/double_lstm/double_lstm_prob.json @@ -7,7 +7,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.DoubleLSTMTimeSeriesClassifier", "numpy.take" ], @@ -34,7 +34,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -74,7 +74,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json b/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json index ea48a87..b46c4e3 100644 --- a/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json +++ b/draco/pipelines/double_lstm/double_lstm_prob_with_unstack.json @@ -9,7 +9,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.DoubleLSTMTimeSeriesClassifier", "numpy.take" ], @@ -50,7 +50,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + 
"mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -96,7 +96,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/double_lstm/double_lstm_with_unstack.json b/draco/pipelines/double_lstm/double_lstm_with_unstack.json index dede502..f1992a6 100644 --- a/draco/pipelines/double_lstm/double_lstm_with_unstack.json +++ b/draco/pipelines/double_lstm/double_lstm_with_unstack.json @@ -9,7 +9,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.DoubleLSTMTimeSeriesClassifier" ], "init_params": { @@ -49,7 +49,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -89,7 +89,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/dummy/dummy.json b/draco/pipelines/dummy/dummy.json index a28121e..6175f5b 100644 --- a/draco/pipelines/dummy/dummy.json +++ b/draco/pipelines/dummy/dummy.json @@ -1,11 +1,7 @@ { "primitives": [ - "mlprimitives.custom.preprocessing.ClassEncoder", - "mlprimitives.custom.feature_extraction.DatetimeFeaturizer", - "mlprimitives.custom.feature_extraction.CategoricalEncoder", - "mlprimitives.custom.feature_extraction.StringVectorizer", "sklearn.impute.SimpleImputer", - "sklearn.linear_model.LogisticRegression", - "mlprimitives.custom.preprocessing.ClassDecoder" + "sklearn.preprocessing.MinMaxScaler", + "sklearn.linear_model.LogisticRegression" ] -} \ No newline at end of file +} diff --git a/draco/pipelines/lstm/lstm.json b/draco/pipelines/lstm/lstm.json index c29b1c7..b430fb0 100644 --- a/draco/pipelines/lstm/lstm.json +++ b/draco/pipelines/lstm/lstm.json @@ -7,7 +7,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.LSTMTimeSeriesClassifier" ], "init_params": { @@ -33,7 +33,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -67,7 +67,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/lstm/lstm_prob.json b/draco/pipelines/lstm/lstm_prob.json index 17da404..df135ba 100644 --- a/draco/pipelines/lstm/lstm_prob.json +++ b/draco/pipelines/lstm/lstm_prob.json @@ -7,7 +7,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - 
"mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.LSTMTimeSeriesClassifier", "numpy.take" ], @@ -34,7 +34,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -74,7 +74,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/lstm/lstm_prob_with_unstack.json b/draco/pipelines/lstm/lstm_prob_with_unstack.json index 9272257..2bf6172 100644 --- a/draco/pipelines/lstm/lstm_prob_with_unstack.json +++ b/draco/pipelines/lstm/lstm_prob_with_unstack.json @@ -9,7 +9,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.LSTMTimeSeriesClassifier", "numpy.take" ], @@ -50,7 +50,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -96,7 +96,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/lstm/lstm_with_unstack.json b/draco/pipelines/lstm/lstm_with_unstack.json index ab9dd99..86ffc4b 100644 --- a/draco/pipelines/lstm/lstm_with_unstack.json +++ b/draco/pipelines/lstm/lstm_with_unstack.json @@ -9,7 +9,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.LSTMTimeSeriesClassifier" ], "init_params": { @@ -49,7 +49,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -89,7 +89,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/lstm_regressor/lstm_regressor.json b/draco/pipelines/lstm_regressor/lstm_regressor.json index 77ddb1e..82ed77f 100644 --- a/draco/pipelines/lstm_regressor/lstm_regressor.json +++ b/draco/pipelines/lstm_regressor/lstm_regressor.json @@ -7,7 +7,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.LSTMTimeSeriesRegressor" ], "init_params": { @@ -33,7 +33,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - 
"mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -67,7 +67,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json b/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json index 9e183b9..225c47e 100644 --- a/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json +++ b/draco/pipelines/lstm_regressor/lstm_regressor_with_unstack.json @@ -9,7 +9,7 @@ "pandas.DataFrame", "pandas.DataFrame.set", "pandas.DataFrame.set", - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences", + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences", "keras.Sequential.LSTMTimeSeriesRegressor" ], "init_params": { @@ -49,7 +49,7 @@ "pandas.DataFrame.set#2": { "key": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "window_size": 24, "cutoff_time": "cutoff_time", "time_index": "timestamp" @@ -89,7 +89,7 @@ "X": "readings", "value": "timestamp" }, - "mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1": { + "mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1": { "timeseries": "readings" } }, diff --git a/setup.py b/setup.py index 1dfcc4b..9087746 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ install_requires = [ 'baytune>=0.4.0,<0.5', - 'mlprimitives>=0.3.2,<0.4', + 'ml-stars>=0.1.0', 'mlblocks>=0.4.0,<0.5', 'pymongo>=3.7.2,<4', 'scikit-learn>=0.21,<1.2', @@ -30,11 +30,10 @@ 'fsspec>=0.8.5,<0.9', 'dask>=2.6.0,<3', 'tabulate>=0.8.3,<0.9', - 'xlsxwriter>=1.3.6<1.4', + 'xlsxwriter>=1.3.6,<1.4', # fix conflicts 'protobuf<4', 'importlib-metadata<5', - #'importlib-metadata<2,>=0.12', ] setup_requires = [ diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index 3b7359f..b27de6f 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -7,6 +7,7 @@ import pandas as pd import pytest +from mlblocks import MLPipeline from draco.pipeline import DracoPipeline, get_pipelines @@ -28,6 +29,13 @@ def test_get_pipelines_type_error(): get_pipelines(pipeline_type='does-not-exist') +def test_loading_pipelines(): + draco_pipelines = get_pipelines() + for pipeline in draco_pipelines: + mlpipeline = MLPipeline(pipeline) + assert isinstance(mlpipeline, MLPipeline) + + class TestDracoPipeline(TestCase): def _get_data(self): diff --git a/tutorials/pipelines/double_lstm_with_unstack.ipynb b/tutorials/pipelines/double_lstm_with_unstack.ipynb index 4bc7d0f..043dae8 100644 --- a/tutorials/pipelines/double_lstm_with_unstack.ipynb +++ b/tutorials/pipelines/double_lstm_with_unstack.ipynb @@ -46,7 +46,8 @@ { "data": { "text/plain": [ - "['mlblocks.MLPipeline',\n", + "['pandas.DataFrame.resample',\n", + " 'pandas.DataFrame.unstack',\n", " 'pandas.DataFrame.pop',\n", " 'pandas.DataFrame.pop',\n", " 'sklearn.impute.SimpleImputer',\n", @@ -54,7 +55,7 @@ " 'pandas.DataFrame',\n", " 'pandas.DataFrame.set',\n", " 'pandas.DataFrame.set',\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'mlstars.custom.timeseries_preprocessing.cutoff_window_sequences',\n", " 
'keras.Sequential.DoubleLSTMTimeSeriesClassifier']" ] }, @@ -270,20 +271,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## mlblocks.MLPipeline\n", - "\n", - "### pandas.DataFrame.resample\n", + "## pandas.DataFrame.resample\n", "\n", "* Input: readings\n", "* Output: readings (resampled)\n", "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index\n", - " \n", - "### pandas.DataFrame.unstack\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: readings (unstacked)\n", - "* Effect: readings have been unstacked" + " signal_id and timestamp have been set as a multi-index\n" ] }, { @@ -319,6 +312,130 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00313.333333
2013-01-10 01:00:00197.500000
2013-01-10 02:00:00248.166667
2013-01-10 03:00:00253.166667
2013-01-10 04:00:00305.000000
\n", + "
" + ], + "text/plain": [ + " value\n", + "turbine_id signal_id timestamp \n", + "T001 S01 2013-01-10 00:00:00 313.333333\n", + " 2013-01-10 01:00:00 197.500000\n", + " 2013-01-10 02:00:00 248.166667\n", + " 2013-01-10 03:00:00 253.166667\n", + " 2013-01-10 04:00:00 305.000000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, "outputs": [ { "data": { @@ -369,121 +486,121 @@ " 0\n", " T001\n", " 2013-01-10 00:00:00\n", - " 323.0\n", - " 320.0\n", - " 284.0\n", - " 348.0\n", - " 273.0\n", - " 342.0\n", - " 280.0\n", - " 3197842.0\n", + " 313.333333\n", + " 323.833333\n", + " 336.000000\n", + " 364.666667\n", + " 286.500000\n", + " 314.000000\n", + " 243.166667\n", + " 3.197980e+06\n", " ...\n", - " 11.7\n", - " 3131020.0\n", - " 55.0\n", - " 55.0\n", - " 47.0\n", - " 58.0\n", - " 45.0\n", - " 58.0\n", - " 47.0\n", - " 356.0\n", + " 10.383333\n", + " 3.131958e+06\n", + " 52.666667\n", + " 54.333333\n", + " 56.166667\n", + " 61.000000\n", + " 47.666667\n", + " 52.666667\n", + " 40.833333\n", + " 357.333333\n", " \n", " \n", " 1\n", " T001\n", - " 2013-01-10 00:10:00\n", - " 346.0\n", - " 384.0\n", - " 367.0\n", - " 411.0\n", - " 331.0\n", - " 360.0\n", - " 249.0\n", - " 3197900.0\n", + " 2013-01-10 01:00:00\n", + " 197.500000\n", + " 221.333333\n", + " 216.000000\n", + " 260.666667\n", + " 206.833333\n", + " 235.833333\n", + " 186.666667\n", + " 3.198221e+06\n", " ...\n", - " 10.2\n", - " 3131420.0\n", - " 58.0\n", - " 63.0\n", - " 62.0\n", - " 67.0\n", - " 55.0\n", - " 61.0\n", - " 42.0\n", - " 400.0\n", + " 8.666667\n", + " 3.133668e+06\n", + " 33.166667\n", + " 37.000000\n", + " 36.166667\n", + " 43.666667\n", + " 34.500000\n", + " 39.333333\n", + " 31.166667\n", + " 249.666667\n", " \n", " \n", " 2\n", " T001\n", - " 2013-01-10 00:20:00\n", - " 407.0\n", - " 363.0\n", - " 407.0\n", - " 393.0\n", - " 275.0\n", - " 335.0\n", - " 270.0\n", - " 3197968.0\n", + " 2013-01-10 02:00:00\n", + " 248.166667\n", + " 271.666667\n", + " 277.500000\n", + " 298.000000\n", + " 233.666667\n", + " 271.166667\n", + " 216.333333\n", + " 3.198448e+06\n", " ...\n", - " 9.5\n", - " 3131822.0\n", - " 68.0\n", - " 61.0\n", - " 67.0\n", - " 66.0\n", - " 46.0\n", - " 55.0\n", - " 45.0\n", - " 402.0\n", + " 8.833333\n", + " 3.135413e+06\n", + " 41.500000\n", + " 45.666667\n", + " 46.500000\n", + " 49.666667\n", + " 39.333333\n", + " 45.500000\n", + " 36.166667\n", + " 297.666667\n", " \n", " \n", " 3\n", " T001\n", - " 2013-01-10 00:30:00\n", - " 257.0\n", - " 307.0\n", - " 315.0\n", - " 361.0\n", - " 317.0\n", - " 354.0\n", - " 271.0\n", - " 3198011.0\n", + " 2013-01-10 03:00:00\n", + " 253.166667\n", + " 256.166667\n", + " 
242.666667\n", + " 265.333333\n", + " 211.666667\n", + " 226.666667\n", + " 181.000000\n", + " 3.198691e+06\n", " ...\n", - " 10.5\n", - " 3132179.0\n", - " 43.0\n", - " 51.0\n", - " 53.0\n", - " 62.0\n", - " 53.0\n", - " 60.0\n", - " 45.0\n", - " 357.0\n", + " 8.433333\n", + " 3.137001e+06\n", + " 42.333333\n", + " 42.833333\n", + " 40.500000\n", + " 44.166667\n", + " 35.333333\n", + " 37.833333\n", + " 30.333333\n", + " 268.000000\n", " \n", " \n", " 4\n", " T001\n", - " 2013-01-10 00:40:00\n", - " 267.0\n", - " 309.0\n", - " 314.0\n", - " 355.0\n", - " 262.0\n", - " 246.0\n", - " 212.0\n", - " 3198056.0\n", + " 2013-01-10 04:00:00\n", + " 305.000000\n", + " 312.333333\n", + " 346.166667\n", + " 329.833333\n", + " 280.666667\n", + " 308.833333\n", + " 271.833333\n", + " 3.198978e+06\n", " ...\n", - " 9.6\n", - " 3132501.0\n", - " 45.0\n", - " 51.0\n", - " 54.0\n", - " 59.0\n", - " 43.0\n", - " 41.0\n", - " 36.0\n", - " 322.0\n", + " 9.083333\n", + " 3.138843e+06\n", + " 50.500000\n", + " 51.166667\n", + " 55.500000\n", + " 53.666667\n", + " 46.166667\n", + " 49.666667\n", + " 41.166667\n", + " 341.833333\n", " \n", " \n", "\n", @@ -491,38 +608,38 @@ "" ], "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", - "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", - "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", - "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", - "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", - "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", + " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", + "0 T001 2013-01-10 00:00:00 313.333333 323.833333 336.000000 \n", + "1 T001 2013-01-10 01:00:00 197.500000 221.333333 216.000000 \n", + "2 T001 2013-01-10 02:00:00 248.166667 271.666667 277.500000 \n", + "3 T001 2013-01-10 03:00:00 253.166667 256.166667 242.666667 \n", + "4 T001 2013-01-10 04:00:00 305.000000 312.333333 346.166667 \n", "\n", - " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", - "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", - "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", - "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", - "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", - "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", + " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", + "0 364.666667 286.500000 314.000000 243.166667 3.197980e+06 ... \n", + "1 260.666667 206.833333 235.833333 186.666667 3.198221e+06 ... \n", + "2 298.000000 233.666667 271.166667 216.333333 3.198448e+06 ... \n", + "3 265.333333 211.666667 226.666667 181.000000 3.198691e+06 ... \n", + "4 329.833333 280.666667 308.833333 271.833333 3.198978e+06 ... 
\n", "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", "\n", "[5 rows x 28 columns]" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -544,17 +661,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "step = 1\n", + "step = 2\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -563,7 +680,7 @@ "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -574,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -588,7 +705,7 @@ "Name: turbine_id, dtype: object" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -599,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -650,122 +767,122 @@ " \n", " 0\n", " 2013-01-10 00:00:00\n", - " 323.0\n", - " 320.0\n", - " 284.0\n", - " 348.0\n", - " 273.0\n", - " 342.0\n", - " 280.0\n", - " 3197842.0\n", - " 695000.0\n", + " 313.333333\n", + " 323.833333\n", + " 336.000000\n", + " 364.666667\n", + " 286.500000\n", + " 314.000000\n", + " 243.166667\n", + " 3.197980e+06\n", + " 695143.166667\n", " ...\n", - " 11.7\n", - " 3131020.0\n", - " 55.0\n", - " 55.0\n", - " 47.0\n", - " 58.0\n", - " 45.0\n", - " 58.0\n", - " 47.0\n", - " 356.0\n", + " 10.383333\n", + " 3.131958e+06\n", + " 52.666667\n", + " 54.333333\n", + " 56.166667\n", + " 61.000000\n", + " 47.666667\n", + " 52.666667\n", + " 40.833333\n", + " 357.333333\n", " \n", " \n", " 1\n", - " 2013-01-10 00:10:00\n", - " 346.0\n", - " 384.0\n", - " 367.0\n", - " 411.0\n", - " 331.0\n", - " 360.0\n", - " 249.0\n", - " 3197900.0\n", - " 695063.0\n", + " 2013-01-10 01:00:00\n", + " 197.500000\n", + " 221.333333\n", + " 216.000000\n", + " 260.666667\n", + " 206.833333\n", + " 235.833333\n", + " 186.666667\n", + " 3.198221e+06\n", + " 695403.666667\n", " ...\n", - " 10.2\n", - " 3131420.0\n", - " 58.0\n", - " 63.0\n", - " 62.0\n", - " 67.0\n", - " 55.0\n", - " 61.0\n", - " 42.0\n", - " 400.0\n", + " 8.666667\n", + " 3.133668e+06\n", + " 33.166667\n", + " 37.000000\n", + " 
36.166667\n", + " 43.666667\n", + " 34.500000\n", + " 39.333333\n", + " 31.166667\n", + " 249.666667\n", " \n", " \n", " 2\n", - " 2013-01-10 00:20:00\n", - " 407.0\n", - " 363.0\n", - " 407.0\n", - " 393.0\n", - " 275.0\n", - " 335.0\n", - " 270.0\n", - " 3197968.0\n", - " 695124.0\n", + " 2013-01-10 02:00:00\n", + " 248.166667\n", + " 271.666667\n", + " 277.500000\n", + " 298.000000\n", + " 233.666667\n", + " 271.166667\n", + " 216.333333\n", + " 3.198448e+06\n", + " 695656.500000\n", " ...\n", - " 9.5\n", - " 3131822.0\n", - " 68.0\n", - " 61.0\n", - " 67.0\n", - " 66.0\n", - " 46.0\n", - " 55.0\n", - " 45.0\n", - " 402.0\n", + " 8.833333\n", + " 3.135413e+06\n", + " 41.500000\n", + " 45.666667\n", + " 46.500000\n", + " 49.666667\n", + " 39.333333\n", + " 45.500000\n", + " 36.166667\n", + " 297.666667\n", " \n", " \n", " 3\n", - " 2013-01-10 00:30:00\n", - " 257.0\n", - " 307.0\n", - " 315.0\n", - " 361.0\n", - " 317.0\n", - " 354.0\n", - " 271.0\n", - " 3198011.0\n", - " 695175.0\n", + " 2013-01-10 03:00:00\n", + " 253.166667\n", + " 256.166667\n", + " 242.666667\n", + " 265.333333\n", + " 211.666667\n", + " 226.666667\n", + " 181.000000\n", + " 3.198691e+06\n", + " 695911.333333\n", " ...\n", - " 10.5\n", - " 3132179.0\n", - " 43.0\n", - " 51.0\n", - " 53.0\n", - " 62.0\n", - " 53.0\n", - " 60.0\n", - " 45.0\n", - " 357.0\n", + " 8.433333\n", + " 3.137001e+06\n", + " 42.333333\n", + " 42.833333\n", + " 40.500000\n", + " 44.166667\n", + " 35.333333\n", + " 37.833333\n", + " 30.333333\n", + " 268.000000\n", " \n", " \n", " 4\n", - " 2013-01-10 00:40:00\n", - " 267.0\n", - " 309.0\n", - " 314.0\n", - " 355.0\n", - " 262.0\n", - " 246.0\n", - " 212.0\n", - " 3198056.0\n", - " 695226.0\n", + " 2013-01-10 04:00:00\n", + " 305.000000\n", + " 312.333333\n", + " 346.166667\n", + " 329.833333\n", + " 280.666667\n", + " 308.833333\n", + " 271.833333\n", + " 3.198978e+06\n", + " 696195.833333\n", " ...\n", - " 9.6\n", - " 3132501.0\n", - " 45.0\n", - " 51.0\n", - " 54.0\n", - " 59.0\n", - " 43.0\n", - " 41.0\n", - " 36.0\n", - " 322.0\n", + " 9.083333\n", + " 3.138843e+06\n", + " 50.500000\n", + " 51.166667\n", + " 55.500000\n", + " 53.666667\n", + " 46.166667\n", + " 49.666667\n", + " 41.166667\n", + " 341.833333\n", " \n", " \n", "\n", @@ -773,38 +890,38 @@ "" ], "text/plain": [ - " timestamp value_S01 value_S02 value_S03 value_S04 value_S05 \\\n", - "0 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 273.0 \n", - "1 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 331.0 \n", - "2 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 275.0 \n", - "3 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 317.0 \n", - "4 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 262.0 \n", + " timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 2013-01-10 00:00:00 313.333333 323.833333 336.000000 364.666667 \n", + "1 2013-01-10 01:00:00 197.500000 221.333333 216.000000 260.666667 \n", + "2 2013-01-10 02:00:00 248.166667 271.666667 277.500000 298.000000 \n", + "3 2013-01-10 03:00:00 253.166667 256.166667 242.666667 265.333333 \n", + "4 2013-01-10 04:00:00 305.000000 312.333333 346.166667 329.833333 \n", "\n", - " value_S06 value_S07 value_S08 value_S09 ... value_S17 value_S18 \\\n", - "0 342.0 280.0 3197842.0 695000.0 ... 11.7 3131020.0 \n", - "1 360.0 249.0 3197900.0 695063.0 ... 10.2 3131420.0 \n", - "2 335.0 270.0 3197968.0 695124.0 ... 9.5 3131822.0 \n", - "3 354.0 271.0 3198011.0 695175.0 ... 10.5 3132179.0 \n", - "4 246.0 212.0 3198056.0 695226.0 ... 9.6 3132501.0 \n", + " value_S05 value_S06 value_S07 value_S08 value_S09 ... 
\\\n", + "0 286.500000 314.000000 243.166667 3.197980e+06 695143.166667 ... \n", + "1 206.833333 235.833333 186.666667 3.198221e+06 695403.666667 ... \n", + "2 233.666667 271.166667 216.333333 3.198448e+06 695656.500000 ... \n", + "3 211.666667 226.666667 181.000000 3.198691e+06 695911.333333 ... \n", + "4 280.666667 308.833333 271.833333 3.198978e+06 696195.833333 ... \n", "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", "\n", "[5 rows x 27 columns]" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -826,17 +943,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "step = 2\n", + "step = 3\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -845,7 +962,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -856,21 +973,21 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2013-01-10 00:00:00\n", - "1 2013-01-10 00:10:00\n", - "2 2013-01-10 00:20:00\n", - "3 2013-01-10 00:30:00\n", - "4 2013-01-10 00:40:00\n", + "1 2013-01-10 01:00:00\n", + "2 2013-01-10 02:00:00\n", + "3 2013-01-10 03:00:00\n", + "4 2013-01-10 04:00:00\n", "Name: timestamp, dtype: datetime64[ns]" ] }, - "execution_count": 16, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -881,7 +998,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -931,123 +1048,123 @@ " \n", " \n", " 0\n", - " 323.0\n", - " 320.0\n", - " 284.0\n", - " 348.0\n", - " 273.0\n", - " 342.0\n", - " 280.0\n", - " 3197842.0\n", - " 695000.0\n", - " 3348234.0\n", + " 313.333333\n", + " 323.833333\n", + " 336.000000\n", + " 364.666667\n", + " 286.500000\n", + " 314.000000\n", + " 243.166667\n", + " 3.197980e+06\n", + " 695143.166667\n", + " 3.348384e+06\n", " ...\n", - " 11.7\n", - " 3131020.0\n", - " 55.0\n", - " 55.0\n", - " 47.0\n", - " 58.0\n", - " 45.0\n", - " 58.0\n", - " 47.0\n", - " 356.0\n", + " 10.383333\n", + " 3.131958e+06\n", + " 52.666667\n", + " 54.333333\n", + " 56.166667\n", + " 61.000000\n", + " 
47.666667\n", + " 52.666667\n", + " 40.833333\n", + " 357.333333\n", " \n", " \n", " 1\n", - " 346.0\n", - " 384.0\n", - " 367.0\n", - " 411.0\n", - " 331.0\n", - " 360.0\n", - " 249.0\n", - " 3197900.0\n", - " 695063.0\n", - " 3348296.0\n", + " 197.500000\n", + " 221.333333\n", + " 216.000000\n", + " 260.666667\n", + " 206.833333\n", + " 235.833333\n", + " 186.666667\n", + " 3.198221e+06\n", + " 695403.666667\n", + " 3.348651e+06\n", " ...\n", - " 10.2\n", - " 3131420.0\n", - " 58.0\n", - " 63.0\n", - " 62.0\n", - " 67.0\n", - " 55.0\n", - " 61.0\n", - " 42.0\n", - " 400.0\n", + " 8.666667\n", + " 3.133668e+06\n", + " 33.166667\n", + " 37.000000\n", + " 36.166667\n", + " 43.666667\n", + " 34.500000\n", + " 39.333333\n", + " 31.166667\n", + " 249.666667\n", " \n", " \n", " 2\n", - " 407.0\n", - " 363.0\n", - " 407.0\n", - " 393.0\n", - " 275.0\n", - " 335.0\n", - " 270.0\n", - " 3197968.0\n", - " 695124.0\n", - " 3348363.0\n", + " 248.166667\n", + " 271.666667\n", + " 277.500000\n", + " 298.000000\n", + " 233.666667\n", + " 271.166667\n", + " 216.333333\n", + " 3.198448e+06\n", + " 695656.500000\n", + " 3.348910e+06\n", " ...\n", - " 9.5\n", - " 3131822.0\n", - " 68.0\n", - " 61.0\n", - " 67.0\n", - " 66.0\n", - " 46.0\n", - " 55.0\n", - " 45.0\n", - " 402.0\n", + " 8.833333\n", + " 3.135413e+06\n", + " 41.500000\n", + " 45.666667\n", + " 46.500000\n", + " 49.666667\n", + " 39.333333\n", + " 45.500000\n", + " 36.166667\n", + " 297.666667\n", " \n", " \n", " 3\n", - " 257.0\n", - " 307.0\n", - " 315.0\n", - " 361.0\n", - " 317.0\n", - " 354.0\n", - " 271.0\n", - " 3198011.0\n", - " 695175.0\n", - " 3348416.0\n", + " 253.166667\n", + " 256.166667\n", + " 242.666667\n", + " 265.333333\n", + " 211.666667\n", + " 226.666667\n", + " 181.000000\n", + " 3.198691e+06\n", + " 695911.333333\n", + " 3.349157e+06\n", " ...\n", - " 10.5\n", - " 3132179.0\n", - " 43.0\n", - " 51.0\n", - " 53.0\n", - " 62.0\n", - " 53.0\n", - " 60.0\n", - " 45.0\n", - " 357.0\n", + " 8.433333\n", + " 3.137001e+06\n", + " 42.333333\n", + " 42.833333\n", + " 40.500000\n", + " 44.166667\n", + " 35.333333\n", + " 37.833333\n", + " 30.333333\n", + " 268.000000\n", " \n", " \n", " 4\n", - " 267.0\n", - " 309.0\n", - " 314.0\n", - " 355.0\n", - " 262.0\n", - " 246.0\n", - " 212.0\n", - " 3198056.0\n", - " 695226.0\n", - " 3348470.0\n", + " 305.000000\n", + " 312.333333\n", + " 346.166667\n", + " 329.833333\n", + " 280.666667\n", + " 308.833333\n", + " 271.833333\n", + " 3.198978e+06\n", + " 696195.833333\n", + " 3.349452e+06\n", " ...\n", - " 9.6\n", - " 3132501.0\n", - " 45.0\n", - " 51.0\n", - " 54.0\n", - " 59.0\n", - " 43.0\n", - " 41.0\n", - " 36.0\n", - " 322.0\n", + " 9.083333\n", + " 3.138843e+06\n", + " 50.500000\n", + " 51.166667\n", + " 55.500000\n", + " 53.666667\n", + " 46.166667\n", + " 49.666667\n", + " 41.166667\n", + " 341.833333\n", " \n", " \n", "\n", @@ -1055,38 +1172,38 @@ "" ], "text/plain": [ - " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", - "0 323.0 320.0 284.0 348.0 273.0 342.0 \n", - "1 346.0 384.0 367.0 411.0 331.0 360.0 \n", - "2 407.0 363.0 407.0 393.0 275.0 335.0 \n", - "3 257.0 307.0 315.0 361.0 317.0 354.0 \n", - "4 267.0 309.0 314.0 355.0 262.0 246.0 \n", + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 313.333333 323.833333 336.000000 364.666667 286.500000 314.000000 \n", + "1 197.500000 221.333333 216.000000 260.666667 206.833333 235.833333 \n", + "2 248.166667 271.666667 277.500000 298.000000 233.666667 271.166667 \n", + "3 253.166667 
256.166667 242.666667 265.333333 211.666667 226.666667 \n", + "4 305.000000 312.333333 346.166667 329.833333 280.666667 308.833333 \n", "\n", - " value_S07 value_S08 value_S09 value_S10 ... value_S17 value_S18 \\\n", - "0 280.0 3197842.0 695000.0 3348234.0 ... 11.7 3131020.0 \n", - "1 249.0 3197900.0 695063.0 3348296.0 ... 10.2 3131420.0 \n", - "2 270.0 3197968.0 695124.0 3348363.0 ... 9.5 3131822.0 \n", - "3 271.0 3198011.0 695175.0 3348416.0 ... 10.5 3132179.0 \n", - "4 212.0 3198056.0 695226.0 3348470.0 ... 9.6 3132501.0 \n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 \\\n", + "0 243.166667 3.197980e+06 695143.166667 3.348384e+06 ... 10.383333 \n", + "1 186.666667 3.198221e+06 695403.666667 3.348651e+06 ... 8.666667 \n", + "2 216.333333 3.198448e+06 695656.500000 3.348910e+06 ... 8.833333 \n", + "3 181.000000 3.198691e+06 695911.333333 3.349157e+06 ... 8.433333 \n", + "4 271.833333 3.198978e+06 696195.833333 3.349452e+06 ... 9.083333 \n", "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + " value_S18 value_S19 value_S20 value_S21 value_S22 value_S23 \\\n", + "0 3.131958e+06 52.666667 54.333333 56.166667 61.000000 47.666667 \n", + "1 3.133668e+06 33.166667 37.000000 36.166667 43.666667 34.500000 \n", + "2 3.135413e+06 41.500000 45.666667 46.500000 49.666667 39.333333 \n", + "3 3.137001e+06 42.333333 42.833333 40.500000 44.166667 35.333333 \n", + "4 3.138843e+06 50.500000 51.166667 55.500000 53.666667 46.166667 \n", "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", + " value_S24 value_S25 value_S26 \n", + "0 52.666667 40.833333 357.333333 \n", + "1 39.333333 31.166667 249.666667 \n", + "2 45.500000 36.166667 297.666667 \n", + "3 37.833333 30.333333 268.000000 \n", + "4 49.666667 41.166667 341.833333 \n", "\n", "[5 rows x 26 columns]" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1108,17 +1225,26 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/sklearn/impute/_base.py:356: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "step = 3\n", + "step = 4\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1127,7 +1253,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1138,50 +1264,50 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[3.230000e+02, 3.200000e+02, 2.840000e+02, 3.480000e+02,\n", - " 2.730000e+02, 3.420000e+02, 2.800000e+02, 3.197842e+06,\n", - " 6.950000e+05, 3.348234e+06, 3.436762e+06, 3.322362e+06,\n", - " 3.357952e+06, 3.223797e+06, 8.300000e+00, 6.000000e+00,\n", - " 1.170000e+01, 3.131020e+06, 5.500000e+01, 5.500000e+01,\n", - " 4.700000e+01, 5.800000e+01, 4.500000e+01, 5.800000e+01,\n", - " 4.700000e+01, 3.560000e+02],\n", - " [3.460000e+02, 3.840000e+02, 3.670000e+02, 4.110000e+02,\n", - " 3.310000e+02, 3.600000e+02, 2.490000e+02, 3.197900e+06,\n", - " 6.950630e+05, 3.348296e+06, 3.436829e+06, 3.322417e+06,\n", - " 3.358013e+06, 3.223839e+06, 7.600000e+00, 5.000000e+00,\n", - " 1.020000e+01, 3.131420e+06, 5.800000e+01, 6.300000e+01,\n", - " 6.200000e+01, 6.700000e+01, 5.500000e+01, 6.100000e+01,\n", - " 4.200000e+01, 4.000000e+02],\n", - " [4.070000e+02, 3.630000e+02, 4.070000e+02, 3.930000e+02,\n", - " 2.750000e+02, 3.350000e+02, 2.700000e+02, 3.197968e+06,\n", - " 6.951240e+05, 3.348363e+06, 3.436895e+06, 3.322463e+06,\n", - " 3.358068e+06, 3.223884e+06, 7.800000e+00, 5.700000e+00,\n", - " 9.500000e+00, 3.131822e+06, 6.800000e+01, 6.100000e+01,\n", - " 6.700000e+01, 6.600000e+01, 4.600000e+01, 5.500000e+01,\n", - " 4.500000e+01, 4.020000e+02],\n", - " [2.570000e+02, 3.070000e+02, 3.150000e+02, 3.610000e+02,\n", - " 3.170000e+02, 3.540000e+02, 2.710000e+02, 3.198011e+06,\n", - " 6.951750e+05, 3.348416e+06, 3.436957e+06, 3.322516e+06,\n", - " 3.358128e+06, 3.223929e+06, 8.600000e+00, 6.600000e+00,\n", - " 1.050000e+01, 3.132179e+06, 4.300000e+01, 5.100000e+01,\n", - " 5.300000e+01, 6.200000e+01, 5.300000e+01, 6.000000e+01,\n", - " 4.500000e+01, 3.570000e+02],\n", - " [2.670000e+02, 3.090000e+02, 3.140000e+02, 3.550000e+02,\n", - " 2.620000e+02, 2.460000e+02, 2.120000e+02, 3.198056e+06,\n", - " 6.952260e+05, 3.348470e+06, 3.437016e+06, 3.322559e+06,\n", - " 3.358169e+06, 3.223965e+06, 7.500000e+00, 5.900000e+00,\n", - " 9.600000e+00, 3.132501e+06, 4.500000e+01, 5.100000e+01,\n", - " 5.400000e+01, 5.900000e+01, 4.300000e+01, 4.100000e+01,\n", - " 3.600000e+01, 3.220000e+02]])" + "array([[3.13333333e+02, 3.23833333e+02, 3.36000000e+02, 3.64666667e+02,\n", + " 2.86500000e+02, 3.14000000e+02, 2.43166667e+02, 3.19798000e+06,\n", + " 6.95143167e+05, 3.34838383e+06, 3.43692150e+06, 3.32248667e+06,\n", + " 3.35809000e+06, 3.22390150e+06, 7.95000000e+00, 5.85000000e+00,\n", + " 1.03833333e+01, 3.13195833e+06, 5.26666667e+01, 5.43333333e+01,\n", + " 5.61666667e+01, 6.10000000e+01, 4.76666667e+01, 5.26666667e+01,\n", + " 4.08333333e+01, 3.57333333e+02],\n", + " [1.97500000e+02, 2.21333333e+02, 2.16000000e+02, 2.60666667e+02,\n", + " 2.06833333e+02, 2.35833333e+02, 1.86666667e+02, 3.19822067e+06,\n", + " 6.95403667e+05, 3.34865117e+06, 3.43722283e+06, 3.32272200e+06,\n", + " 3.35834000e+06, 3.22409567e+06, 
6.83333333e+00, 5.15000000e+00,\n", + " 8.66666667e+00, 3.13366817e+06, 3.31666667e+01, 3.70000000e+01,\n", + " 3.61666667e+01, 4.36666667e+01, 3.45000000e+01, 3.93333333e+01,\n", + " 3.11666667e+01, 2.49666667e+02],\n", + " [2.48166667e+02, 2.71666667e+02, 2.77500000e+02, 2.98000000e+02,\n", + " 2.33666667e+02, 2.71166667e+02, 2.16333333e+02, 3.19844767e+06,\n", + " 6.95656500e+05, 3.34890967e+06, 3.43751900e+06, 3.32295950e+06,\n", + " 3.35862067e+06, 3.22432333e+06, 7.11666667e+00, 5.56666667e+00,\n", + " 8.83333333e+00, 3.13541283e+06, 4.15000000e+01, 4.56666667e+01,\n", + " 4.65000000e+01, 4.96666667e+01, 3.93333333e+01, 4.55000000e+01,\n", + " 3.61666667e+01, 2.97666667e+02],\n", + " [2.53166667e+02, 2.56166667e+02, 2.42666667e+02, 2.65333333e+02,\n", + " 2.11666667e+02, 2.26666667e+02, 1.81000000e+02, 3.19869117e+06,\n", + " 6.95911333e+05, 3.34915717e+06, 3.43778050e+06, 3.32316850e+06,\n", + " 3.35884883e+06, 3.22450217e+06, 6.71666667e+00, 5.16666667e+00,\n", + " 8.43333333e+00, 3.13700133e+06, 4.23333333e+01, 4.28333333e+01,\n", + " 4.05000000e+01, 4.41666667e+01, 3.53333333e+01, 3.78333333e+01,\n", + " 3.03333333e+01, 2.68000000e+02],\n", + " [3.05000000e+02, 3.12333333e+02, 3.46166667e+02, 3.29833333e+02,\n", + " 2.80666667e+02, 3.08833333e+02, 2.71833333e+02, 3.19897850e+06,\n", + " 6.96195833e+05, 3.34945200e+06, 3.43807767e+06, 3.32340933e+06,\n", + " 3.35910983e+06, 3.22471400e+06, 7.20000000e+00, 5.28333333e+00,\n", + " 9.08333333e+00, 3.13884333e+06, 5.05000000e+01, 5.11666667e+01,\n", + " 5.55000000e+01, 5.36666667e+01, 4.61666667e+01, 4.96666667e+01,\n", + " 4.11666667e+01, 3.41833333e+02]])" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1203,17 +1329,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "step = 4\n", + "step = 5\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1222,7 +1348,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1233,45 +1359,45 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[-0.23563892, -0.24267292, -0.3286385 , -0.17702227, -0.35287222,\n", - " -0.19248826, -0.3317757 , -1. , -1. , -1. ,\n", - " -1. , -1. , -1. , -1. 
, -0.11702128,\n", - " -0.24050633, -0.25714286, -0.37378787, -0.22758621, -0.22758621,\n", - " -0.31972789, -0.1862069 , -0.36986301, -0.1862069 , -0.33793103,\n", - " -0.26141079],\n", - " [-0.18171161, -0.0926143 , -0.13380282, -0.02930832, -0.21688159,\n", - " -0.15023474, -0.40420561, -0.99995911, -0.99995779, -0.99995941,\n", - " -0.99995718, -0.99996326, -0.99996042, -0.99997164, -0.19148936,\n", - " -0.36708861, -0.35238095, -0.37370786, -0.1862069 , -0.11724138,\n", - " -0.11564626, -0.06206897, -0.23287671, -0.14482759, -0.40689655,\n", - " -0.17012448],\n", - " [-0.03868699, -0.14185229, -0.0399061 , -0.07151231, -0.34818288,\n", - " -0.20892019, -0.35514019, -0.99991116, -0.99991693, -0.99991555,\n", - " -0.999915 , -0.99993254, -0.99992474, -0.99994125, -0.17021277,\n", - " -0.27848101, -0.3968254 , -0.37362746, -0.04827586, -0.14482759,\n", - " -0.04761905, -0.07586207, -0.35616438, -0.22758621, -0.36551724,\n", - " -0.1659751 ],\n", - " [-0.39038687, -0.27315358, -0.25586854, -0.14654162, -0.24970692,\n", - " -0.16431925, -0.35280374, -0.99988085, -0.99988276, -0.99988086,\n", - " -0.99987538, -0.99989714, -0.99988581, -0.99991086, -0.08510638,\n", - " -0.16455696, -0.33333333, -0.37355606, -0.39310345, -0.28275862,\n", - " -0.23809524, -0.13103448, -0.26027397, -0.15862069, -0.36551724,\n", - " -0.2593361 ],\n", - " [-0.36694021, -0.26846424, -0.25821596, -0.16060961, -0.37866354,\n", - " -0.41784038, -0.49065421, -0.99984912, -0.99984859, -0.99984551,\n", - " -0.99983767, -0.99986841, -0.99985921, -0.99988655, -0.20212766,\n", - " -0.25316456, -0.39047619, -0.37349166, -0.36551724, -0.28275862,\n", - " -0.2244898 , -0.17241379, -0.39726027, -0.42068966, -0.48965517,\n", - " -0.33195021]])" + "array([[-0.26126126, -0.23706897, -0.20870076, -0.14106583, -0.32328767,\n", + " -0.25969448, -0.42198789, -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. 
, -0.11007463,\n", + " -0.16824645, -0.10424155, -0.37397741, -0.25233645, -0.22716628,\n", + " -0.20140515, -0.13481829, -0.32239156, -0.25380117, -0.4182243 ,\n", + " -0.25697453],\n", + " [-0.53349001, -0.47805643, -0.49088771, -0.38557994, -0.51037182,\n", + " -0.44339992, -0.55438391, -0.99983031, -0.99982547, -0.99982499,\n", + " -0.99980741, -0.9998428 , -0.99983779, -0.99986887, -0.23507463,\n", + " -0.26777251, -0.25233645, -0.37363511, -0.52570093, -0.470726 ,\n", + " -0.4824356 , -0.37866354, -0.50762016, -0.44093567, -0.55373832,\n", + " -0.48085254],\n", + " [-0.41441441, -0.35971787, -0.3462669 , -0.29780564, -0.44735812,\n", + " -0.36036036, -0.48486624, -0.99967026, -0.99965608, -0.99965576,\n", + " -0.99961813, -0.99968416, -0.99965569, -0.99971512, -0.20335821,\n", + " -0.20853081, -0.2379583 , -0.37328583, -0.4088785 , -0.34894614,\n", + " -0.33723653, -0.29425557, -0.43962485, -0.35438596, -0.48364486,\n", + " -0.38104315],\n", + " [-0.40266353, -0.39615987, -0.4281795 , -0.37460815, -0.49902153,\n", + " -0.4649432 , -0.56766257, -0.99949857, -0.99948535, -0.99949373,\n", + " -0.999451 , -0.99954455, -0.99950765, -0.99959435, -0.24813433,\n", + " -0.26540284, -0.27246585, -0.37296782, -0.39719626, -0.38875878,\n", + " -0.42154567, -0.37162954, -0.49589683, -0.4619883 , -0.56542056,\n", + " -0.4427309 ],\n", + " [-0.28084606, -0.26410658, -0.18479326, -0.22296238, -0.3369863 ,\n", + " -0.27183705, -0.35481351, -0.99929598, -0.99929474, -0.99930071,\n", + " -0.99926107, -0.99938368, -0.99933831, -0.9994513 , -0.19402985,\n", + " -0.24881517, -0.21639109, -0.37259906, -0.28271028, -0.27166276,\n", + " -0.21077283, -0.23798359, -0.34349355, -0.29590643, -0.4135514 ,\n", + " -0.28920464]])" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1293,17 +1419,17 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "step = 5\n", + "step = 6\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1312,7 +1438,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1323,7 +1449,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1373,123 +1499,123 @@ " \n", " \n", " 0\n", - " -0.235639\n", - " -0.242673\n", - " -0.328638\n", - " -0.177022\n", - " -0.352872\n", - " -0.192488\n", - " -0.331776\n", + " -0.261261\n", + " -0.237069\n", + " -0.208701\n", + " -0.141066\n", + " -0.323288\n", + " -0.259694\n", + " -0.421988\n", " -1.000000\n", " -1.000000\n", " -1.000000\n", " ...\n", - " -0.257143\n", - " -0.373788\n", - " -0.227586\n", - " -0.227586\n", - " -0.319728\n", - " -0.186207\n", - " -0.369863\n", - " -0.186207\n", - " -0.337931\n", - " -0.261411\n", + " -0.104242\n", + " -0.373977\n", + " -0.252336\n", + " -0.227166\n", + " -0.201405\n", + " -0.134818\n", + " -0.322392\n", + " -0.253801\n", + " -0.418224\n", + " -0.256975\n", " \n", " \n", " 1\n", - " -0.181712\n", - " -0.092614\n", - " -0.133803\n", - " -0.029308\n", - " -0.216882\n", - " -0.150235\n", - " -0.404206\n", - " -0.999959\n", - " -0.999958\n", - " -0.999959\n", + " -0.533490\n", + " -0.478056\n", + " -0.490888\n", + " -0.385580\n", + " -0.510372\n", + " -0.443400\n", + " 
-0.554384\n", + " -0.999830\n", + " -0.999825\n", + " -0.999825\n", " ...\n", - " -0.352381\n", - " -0.373708\n", - " -0.186207\n", - " -0.117241\n", - " -0.115646\n", - " -0.062069\n", - " -0.232877\n", - " -0.144828\n", - " -0.406897\n", - " -0.170124\n", + " -0.252336\n", + " -0.373635\n", + " -0.525701\n", + " -0.470726\n", + " -0.482436\n", + " -0.378664\n", + " -0.507620\n", + " -0.440936\n", + " -0.553738\n", + " -0.480853\n", " \n", " \n", " 2\n", - " -0.038687\n", - " -0.141852\n", - " -0.039906\n", - " -0.071512\n", - " -0.348183\n", - " -0.208920\n", - " -0.355140\n", - " -0.999911\n", - " -0.999917\n", - " -0.999916\n", + " -0.414414\n", + " -0.359718\n", + " -0.346267\n", + " -0.297806\n", + " -0.447358\n", + " -0.360360\n", + " -0.484866\n", + " -0.999670\n", + " -0.999656\n", + " -0.999656\n", " ...\n", - " -0.396825\n", - " -0.373627\n", - " -0.048276\n", - " -0.144828\n", - " -0.047619\n", - " -0.075862\n", - " -0.356164\n", - " -0.227586\n", - " -0.365517\n", - " -0.165975\n", + " -0.237958\n", + " -0.373286\n", + " -0.408879\n", + " -0.348946\n", + " -0.337237\n", + " -0.294256\n", + " -0.439625\n", + " -0.354386\n", + " -0.483645\n", + " -0.381043\n", " \n", " \n", " 3\n", - " -0.390387\n", - " -0.273154\n", - " -0.255869\n", - " -0.146542\n", - " -0.249707\n", - " -0.164319\n", - " -0.352804\n", - " -0.999881\n", - " -0.999883\n", - " -0.999881\n", + " -0.402664\n", + " -0.396160\n", + " -0.428180\n", + " -0.374608\n", + " -0.499022\n", + " -0.464943\n", + " -0.567663\n", + " -0.999499\n", + " -0.999485\n", + " -0.999494\n", " ...\n", - " -0.333333\n", - " -0.373556\n", - " -0.393103\n", - " -0.282759\n", - " -0.238095\n", - " -0.131034\n", - " -0.260274\n", - " -0.158621\n", - " -0.365517\n", - " -0.259336\n", + " -0.272466\n", + " -0.372968\n", + " -0.397196\n", + " -0.388759\n", + " -0.421546\n", + " -0.371630\n", + " -0.495897\n", + " -0.461988\n", + " -0.565421\n", + " -0.442731\n", " \n", " \n", " 4\n", - " -0.366940\n", - " -0.268464\n", - " -0.258216\n", - " -0.160610\n", - " -0.378664\n", - " -0.417840\n", - " -0.490654\n", - " -0.999849\n", - " -0.999849\n", - " -0.999846\n", + " -0.280846\n", + " -0.264107\n", + " -0.184793\n", + " -0.222962\n", + " -0.336986\n", + " -0.271837\n", + " -0.354814\n", + " -0.999296\n", + " -0.999295\n", + " -0.999301\n", " ...\n", - " -0.390476\n", - " -0.373492\n", - " -0.365517\n", - " -0.282759\n", - " -0.224490\n", - " -0.172414\n", - " -0.397260\n", - " -0.420690\n", - " -0.489655\n", - " -0.331950\n", + " -0.216391\n", + " -0.372599\n", + " -0.282710\n", + " -0.271663\n", + " -0.210773\n", + " -0.237984\n", + " -0.343494\n", + " -0.295906\n", + " -0.413551\n", + " -0.289205\n", " \n", " \n", "\n", @@ -1498,30 +1624,30 @@ ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", - "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", - "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", - "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", - "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", - "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 
-0.271837 -0.354814 \n", "\n", " 7 8 9 ... 16 17 18 19 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.257143 -0.373788 -0.227586 -0.227586 \n", - "1 -0.999959 -0.999958 -0.999959 ... -0.352381 -0.373708 -0.186207 -0.117241 \n", - "2 -0.999911 -0.999917 -0.999916 ... -0.396825 -0.373627 -0.048276 -0.144828 \n", - "3 -0.999881 -0.999883 -0.999881 ... -0.333333 -0.373556 -0.393103 -0.282759 \n", - "4 -0.999849 -0.999849 -0.999846 ... -0.390476 -0.373492 -0.365517 -0.282759 \n", + "0 -1.000000 -1.000000 -1.000000 ... -0.104242 -0.373977 -0.252336 -0.227166 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.252336 -0.373635 -0.525701 -0.470726 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.237958 -0.373286 -0.408879 -0.348946 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.272466 -0.372968 -0.397196 -0.388759 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.216391 -0.372599 -0.282710 -0.271663 \n", "\n", " 20 21 22 23 24 25 \n", - "0 -0.319728 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 \n", - "1 -0.115646 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 \n", - "2 -0.047619 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 \n", - "3 -0.238095 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 \n", - "4 -0.224490 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 \n", + "0 -0.201405 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 \n", + "1 -0.482436 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 \n", + "2 -0.337237 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 \n", + "3 -0.421546 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 \n", + "4 -0.210773 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 \n", "\n", "[5 rows x 26 columns]" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1543,17 +1669,17 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "step = 6\n", + "step = 7\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1562,7 +1688,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1573,7 +1699,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1623,122 +1749,122 @@ " \n", " \n", " 0\n", - " -0.235639\n", - " -0.242673\n", - " -0.328638\n", - " -0.177022\n", - " -0.352872\n", - " -0.192488\n", - " -0.331776\n", + " -0.261261\n", + " -0.237069\n", + " -0.208701\n", + " -0.141066\n", + " -0.323288\n", + " -0.259694\n", + " -0.421988\n", " -1.000000\n", " -1.000000\n", " -1.000000\n", " ...\n", - " -0.373788\n", - " -0.227586\n", - " -0.227586\n", - " -0.319728\n", - " -0.186207\n", - " -0.369863\n", - " -0.186207\n", - " -0.337931\n", - " -0.261411\n", + " -0.373977\n", + " -0.252336\n", + " -0.227166\n", + " -0.201405\n", + " -0.134818\n", + " -0.322392\n", + " -0.253801\n", + " -0.418224\n", + " -0.256975\n", " T001\n", " \n", " \n", " 1\n", - " -0.181712\n", - " -0.092614\n", - " -0.133803\n", - " -0.029308\n", - " -0.216882\n", - " -0.150235\n", - " -0.404206\n", - " -0.999959\n", - " -0.999958\n", - " -0.999959\n", + " -0.533490\n", + " -0.478056\n", + " -0.490888\n", + " -0.385580\n", + " -0.510372\n", + " -0.443400\n", + " -0.554384\n", + " -0.999830\n", + " -0.999825\n", + " -0.999825\n", " ...\n", - " -0.373708\n", - " 
-0.186207\n", - " -0.117241\n", - " -0.115646\n", - " -0.062069\n", - " -0.232877\n", - " -0.144828\n", - " -0.406897\n", - " -0.170124\n", + " -0.373635\n", + " -0.525701\n", + " -0.470726\n", + " -0.482436\n", + " -0.378664\n", + " -0.507620\n", + " -0.440936\n", + " -0.553738\n", + " -0.480853\n", " T001\n", " \n", " \n", " 2\n", - " -0.038687\n", - " -0.141852\n", - " -0.039906\n", - " -0.071512\n", - " -0.348183\n", - " -0.208920\n", - " -0.355140\n", - " -0.999911\n", - " -0.999917\n", - " -0.999916\n", + " -0.414414\n", + " -0.359718\n", + " -0.346267\n", + " -0.297806\n", + " -0.447358\n", + " -0.360360\n", + " -0.484866\n", + " -0.999670\n", + " -0.999656\n", + " -0.999656\n", " ...\n", - " -0.373627\n", - " -0.048276\n", - " -0.144828\n", - " -0.047619\n", - " -0.075862\n", - " -0.356164\n", - " -0.227586\n", - " -0.365517\n", - " -0.165975\n", + " -0.373286\n", + " -0.408879\n", + " -0.348946\n", + " -0.337237\n", + " -0.294256\n", + " -0.439625\n", + " -0.354386\n", + " -0.483645\n", + " -0.381043\n", " T001\n", " \n", " \n", " 3\n", - " -0.390387\n", - " -0.273154\n", - " -0.255869\n", - " -0.146542\n", - " -0.249707\n", - " -0.164319\n", - " -0.352804\n", - " -0.999881\n", - " -0.999883\n", - " -0.999881\n", + " -0.402664\n", + " -0.396160\n", + " -0.428180\n", + " -0.374608\n", + " -0.499022\n", + " -0.464943\n", + " -0.567663\n", + " -0.999499\n", + " -0.999485\n", + " -0.999494\n", " ...\n", - " -0.373556\n", - " -0.393103\n", - " -0.282759\n", - " -0.238095\n", - " -0.131034\n", - " -0.260274\n", - " -0.158621\n", - " -0.365517\n", - " -0.259336\n", + " -0.372968\n", + " -0.397196\n", + " -0.388759\n", + " -0.421546\n", + " -0.371630\n", + " -0.495897\n", + " -0.461988\n", + " -0.565421\n", + " -0.442731\n", " T001\n", " \n", " \n", " 4\n", - " -0.366940\n", - " -0.268464\n", - " -0.258216\n", - " -0.160610\n", - " -0.378664\n", - " -0.417840\n", - " -0.490654\n", - " -0.999849\n", - " -0.999849\n", - " -0.999846\n", + " -0.280846\n", + " -0.264107\n", + " -0.184793\n", + " -0.222962\n", + " -0.336986\n", + " -0.271837\n", + " -0.354814\n", + " -0.999296\n", + " -0.999295\n", + " -0.999301\n", " ...\n", - " -0.373492\n", - " -0.365517\n", - " -0.282759\n", - " -0.224490\n", - " -0.172414\n", - " -0.397260\n", - " -0.420690\n", - " -0.489655\n", - " -0.331950\n", + " -0.372599\n", + " -0.282710\n", + " -0.271663\n", + " -0.210773\n", + " -0.237984\n", + " -0.343494\n", + " -0.295906\n", + " -0.413551\n", + " -0.289205\n", " T001\n", " \n", " \n", @@ -1748,30 +1874,30 @@ ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", - "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", - "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", - "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", - "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", - "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", "\n", " 7 8 9 ... 17 18 19 20 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.373788 -0.227586 -0.227586 -0.319728 \n", - "1 -0.999959 -0.999958 -0.999959 ... 
-0.373708 -0.186207 -0.117241 -0.115646 \n", - "2 -0.999911 -0.999917 -0.999916 ... -0.373627 -0.048276 -0.144828 -0.047619 \n", - "3 -0.999881 -0.999883 -0.999881 ... -0.373556 -0.393103 -0.282759 -0.238095 \n", - "4 -0.999849 -0.999849 -0.999846 ... -0.373492 -0.365517 -0.282759 -0.224490 \n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373977 -0.252336 -0.227166 -0.201405 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.373635 -0.525701 -0.470726 -0.482436 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.373286 -0.408879 -0.348946 -0.337237 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.372968 -0.397196 -0.388759 -0.421546 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.372599 -0.282710 -0.271663 -0.210773 \n", "\n", " 21 22 23 24 25 turbine_id \n", - "0 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 T001 \n", - "1 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 T001 \n", - "2 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 T001 \n", - "3 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 T001 \n", - "4 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 T001 \n", + "0 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 T001 \n", + "1 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 T001 \n", + "2 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 T001 \n", + "3 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 T001 \n", + "4 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 T001 \n", "\n", "[5 rows x 27 columns]" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1793,17 +1919,17 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "step = 7\n", + "step = 8\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1812,7 +1938,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 31, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1823,7 +1949,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1873,123 +1999,123 @@ " \n", " \n", " 0\n", - " -0.235639\n", - " -0.242673\n", - " -0.328638\n", - " -0.177022\n", - " -0.352872\n", - " -0.192488\n", - " -0.331776\n", + " -0.261261\n", + " -0.237069\n", + " -0.208701\n", + " -0.141066\n", + " -0.323288\n", + " -0.259694\n", + " -0.421988\n", " -1.000000\n", " -1.000000\n", " -1.000000\n", " ...\n", - " -0.227586\n", - " -0.227586\n", - " -0.319728\n", - " -0.186207\n", - " -0.369863\n", - " -0.186207\n", - " -0.337931\n", - " -0.261411\n", + " -0.252336\n", + " -0.227166\n", + " -0.201405\n", + " -0.134818\n", + " -0.322392\n", + " -0.253801\n", + " -0.418224\n", + " -0.256975\n", " T001\n", " 2013-01-10 00:00:00\n", " \n", " \n", " 1\n", - " -0.181712\n", - " -0.092614\n", - " -0.133803\n", - " -0.029308\n", - " -0.216882\n", - " -0.150235\n", - " -0.404206\n", - " -0.999959\n", - " -0.999958\n", - " -0.999959\n", + " -0.533490\n", + " -0.478056\n", + " -0.490888\n", + " -0.385580\n", + " -0.510372\n", + " -0.443400\n", + " -0.554384\n", + " -0.999830\n", + " -0.999825\n", + " -0.999825\n", " ...\n", - " -0.186207\n", - " -0.117241\n", - " -0.115646\n", - " -0.062069\n", - " -0.232877\n", - " -0.144828\n", - " -0.406897\n", - " -0.170124\n", + " -0.525701\n", + " -0.470726\n", + " -0.482436\n", + " -0.378664\n", + " -0.507620\n", + " -0.440936\n", + " 
-0.553738\n", + " -0.480853\n", " T001\n", - " 2013-01-10 00:10:00\n", + " 2013-01-10 01:00:00\n", " \n", " \n", " 2\n", - " -0.038687\n", - " -0.141852\n", - " -0.039906\n", - " -0.071512\n", - " -0.348183\n", - " -0.208920\n", - " -0.355140\n", - " -0.999911\n", - " -0.999917\n", - " -0.999916\n", + " -0.414414\n", + " -0.359718\n", + " -0.346267\n", + " -0.297806\n", + " -0.447358\n", + " -0.360360\n", + " -0.484866\n", + " -0.999670\n", + " -0.999656\n", + " -0.999656\n", " ...\n", - " -0.048276\n", - " -0.144828\n", - " -0.047619\n", - " -0.075862\n", - " -0.356164\n", - " -0.227586\n", - " -0.365517\n", - " -0.165975\n", + " -0.408879\n", + " -0.348946\n", + " -0.337237\n", + " -0.294256\n", + " -0.439625\n", + " -0.354386\n", + " -0.483645\n", + " -0.381043\n", " T001\n", - " 2013-01-10 00:20:00\n", + " 2013-01-10 02:00:00\n", " \n", " \n", " 3\n", - " -0.390387\n", - " -0.273154\n", - " -0.255869\n", - " -0.146542\n", - " -0.249707\n", - " -0.164319\n", - " -0.352804\n", - " -0.999881\n", - " -0.999883\n", - " -0.999881\n", + " -0.402664\n", + " -0.396160\n", + " -0.428180\n", + " -0.374608\n", + " -0.499022\n", + " -0.464943\n", + " -0.567663\n", + " -0.999499\n", + " -0.999485\n", + " -0.999494\n", " ...\n", - " -0.393103\n", - " -0.282759\n", - " -0.238095\n", - " -0.131034\n", - " -0.260274\n", - " -0.158621\n", - " -0.365517\n", - " -0.259336\n", + " -0.397196\n", + " -0.388759\n", + " -0.421546\n", + " -0.371630\n", + " -0.495897\n", + " -0.461988\n", + " -0.565421\n", + " -0.442731\n", " T001\n", - " 2013-01-10 00:30:00\n", + " 2013-01-10 03:00:00\n", " \n", " \n", " 4\n", - " -0.366940\n", - " -0.268464\n", - " -0.258216\n", - " -0.160610\n", - " -0.378664\n", - " -0.417840\n", - " -0.490654\n", - " -0.999849\n", - " -0.999849\n", - " -0.999846\n", + " -0.280846\n", + " -0.264107\n", + " -0.184793\n", + " -0.222962\n", + " -0.336986\n", + " -0.271837\n", + " -0.354814\n", + " -0.999296\n", + " -0.999295\n", + " -0.999301\n", " ...\n", - " -0.365517\n", - " -0.282759\n", - " -0.224490\n", - " -0.172414\n", - " -0.397260\n", - " -0.420690\n", - " -0.489655\n", - " -0.331950\n", + " -0.282710\n", + " -0.271663\n", + " -0.210773\n", + " -0.237984\n", + " -0.343494\n", + " -0.295906\n", + " -0.413551\n", + " -0.289205\n", " T001\n", - " 2013-01-10 00:40:00\n", + " 2013-01-10 04:00:00\n", " \n", " \n", "\n", @@ -1998,30 +2124,30 @@ ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", - "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", - "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", - "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", - "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", - "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", "\n", " 7 8 9 ... 18 19 20 21 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.227586 -0.227586 -0.319728 -0.186207 \n", - "1 -0.999959 -0.999958 -0.999959 ... -0.186207 -0.117241 -0.115646 -0.062069 \n", - "2 -0.999911 -0.999917 -0.999916 ... -0.048276 -0.144828 -0.047619 -0.075862 \n", - "3 -0.999881 -0.999883 -0.999881 ... 
-0.393103 -0.282759 -0.238095 -0.131034 \n", - "4 -0.999849 -0.999849 -0.999846 ... -0.365517 -0.282759 -0.224490 -0.172414 \n", + "0 -1.000000 -1.000000 -1.000000 ... -0.252336 -0.227166 -0.201405 -0.134818 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.525701 -0.470726 -0.482436 -0.378664 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.408879 -0.348946 -0.337237 -0.294256 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.397196 -0.388759 -0.421546 -0.371630 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.282710 -0.271663 -0.210773 -0.237984 \n", "\n", " 22 23 24 25 turbine_id timestamp \n", - "0 -0.369863 -0.186207 -0.337931 -0.261411 T001 2013-01-10 00:00:00 \n", - "1 -0.232877 -0.144828 -0.406897 -0.170124 T001 2013-01-10 00:10:00 \n", - "2 -0.356164 -0.227586 -0.365517 -0.165975 T001 2013-01-10 00:20:00 \n", - "3 -0.260274 -0.158621 -0.365517 -0.259336 T001 2013-01-10 00:30:00 \n", - "4 -0.397260 -0.420690 -0.489655 -0.331950 T001 2013-01-10 00:40:00 \n", + "0 -0.322392 -0.253801 -0.418224 -0.256975 T001 2013-01-10 00:00:00 \n", + "1 -0.507620 -0.440936 -0.553738 -0.480853 T001 2013-01-10 01:00:00 \n", + "2 -0.439625 -0.354386 -0.483645 -0.381043 T001 2013-01-10 02:00:00 \n", + "3 -0.495897 -0.461988 -0.565421 -0.442731 T001 2013-01-10 03:00:00 \n", + "4 -0.343494 -0.295906 -0.413551 -0.289205 T001 2013-01-10 04:00:00 \n", "\n", "[5 rows x 28 columns]" ] }, - "execution_count": 32, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2044,7 +2170,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2053,29 +2179,29 @@ "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" ] }, - "execution_count": 33, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline._pipeline.get_hyperparameters()[\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + " 'mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1']" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "step = 8\n", + "step = 9\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2084,7 +2210,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 35, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2095,16 +2221,16 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(51121, 28)" + "(8521, 28)" ] }, - "execution_count": 36, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2115,7 +2241,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -2124,7 +2250,7 @@ "(353,)" ] }, - "execution_count": 37, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2135,7 +2261,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -2144,7 +2270,7 @@ "(353, 24, 26)" ] }, - "execution_count": 38, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2155,7 +2281,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 42, "metadata": { "scrolled": true }, @@ -2163,153 +2289,153 @@ { 
"data": { "text/plain": [ - "array([[-0.66002345, -0.57327081, -0.64084507, -0.57796014, -0.6014068 ,\n", - " -0.56103286, -0.55140187, -0.9928135 , -0.99291267, -0.99315058,\n", - " -0.99304288, -0.99346346, -0.99352632, -0.99395333, -0.42553191,\n", - " -0.41772152, -0.58730159, -0.35996294, -0.66896552, -0.57241379,\n", - " -0.61904762, -0.5862069 , -0.60273973, -0.55862069, -0.55862069,\n", - " -0.59751037],\n", - " [-0.2989449 , -0.38569754, -0.48591549, -0.47713951, -0.66705744,\n", - " -0.5915493 , -0.77336449, -0.99278389, -0.9928852 , -0.99312701,\n", - " -0.99301988, -0.9934481 , -0.9935075 , -0.9939459 , -0.39361702,\n", - " -0.40506329, -0.54285714, -0.35992014, -0.40689655, -0.42068966,\n", - " -0.46938776, -0.48965517, -0.67123288, -0.5862069 , -0.83448276,\n", - " -0.5560166 ],\n", - " [-0.33645955, -0.40679953, -0.39906103, -0.38569754, -0.56154748,\n", - " -0.43192488, -0.45560748, -0.99275498, -0.9928584 , -0.99310017,\n", - " -0.99299431, -0.99342739, -0.99348349, -0.99392294, -0.29787234,\n", - " -0.3164557 , -0.49206349, -0.35986854, -0.42068966, -0.43448276,\n", - " -0.40136054, -0.43448276, -0.56164384, -0.47586207, -0.51724138,\n", - " -0.46473029],\n", - " [-0.36928488, -0.41148886, -0.51173709, -0.47010551, -0.54982415,\n", - " -0.48122066, -0.51869159, -0.99272467, -0.9928316 , -0.99307791,\n", - " -0.99297067, -0.99340669, -0.99346079, -0.99390066, -0.29787234,\n", - " -0.35443038, -0.49206349, -0.35981854, -0.39310345, -0.43448276,\n", - " -0.49659864, -0.47586207, -0.56164384, -0.50344828, -0.53103448,\n", - " -0.4813278 ],\n", - " [-0.20750293, -0.35287222, -0.37089202, -0.2989449 , -0.32473623,\n", - " -0.37793427, -0.45794393, -0.99269435, -0.99280347, -0.99305173,\n", - " -0.99294447, -0.9933793 , -0.99343419, -0.9938777 , -0.32978723,\n", - " -0.39240506, -0.49206349, -0.35976314, -0.39310345, -0.40689655,\n", - " -0.41496599, -0.42068966, -0.42465753, -0.42068966, -0.51724138,\n", - " -0.4253112 ],\n", - " [-0.55685815, -0.60375147, -0.64084507, -0.54513482, -0.55685815,\n", - " -0.58215962, -0.63785047, -0.99267179, -0.99278404, -0.99303471,\n", - " -0.99292338, -0.99335793, -0.99341472, -0.99386014, -0.29787234,\n", - " -0.34177215, -0.51746032, -0.35972353, -0.54482759, -0.5862069 ,\n", - " -0.60544218, -0.53103448, -0.54794521, -0.57241379, -0.62758621,\n", - " -0.58921162],\n", - " [-0.66705744, -0.67643611, -0.69014085, -0.64361079, -0.74443142,\n", - " -0.7370892 , -0.7546729 , -0.99265487, -0.99276863, -0.99302096,\n", - " -0.9929074 , -0.99334657, -0.9934024 , -0.99384934, -0.39361702,\n", - " -0.48101266, -0.51746032, -0.35969533, -0.65517241, -0.66896552,\n", - " -0.67346939, -0.64137931, -0.75342466, -0.72413793, -0.76551724,\n", - " -0.70746888],\n", - " [-0.53341149, -0.60375147, -0.63849765, -0.61547479, -0.71395076,\n", - " -0.70187793, -0.72897196, -0.99263231, -0.99275054, -0.99300394,\n", - " -0.99289014, -0.99333255, -0.99338877, -0.9938365 , -0.38297872,\n", - " -0.37974684, -0.54920635, -0.35966173, -0.54482759, -0.6137931 ,\n", - " -0.60544218, -0.6137931 , -0.69863014, -0.69655172, -0.72413793,\n", - " -0.65145228],\n", - " [-0.44196952, -0.4021102 , -0.49295775, -0.49355217, -0.62719812,\n", - " -0.62676056, -0.71728972, -0.99260481, -0.99272173, -0.99298103,\n", - " -0.99286777, -0.99331518, -0.9933719 , -0.99382367, -0.38297872,\n", - " -0.4556962 , -0.54285714, -0.35961793, -0.44827586, -0.39310345,\n", - " -0.4829932 , -0.50344828, -0.63013699, -0.62758621, -0.72413793,\n", - " -0.54564315],\n", - " [-0.46307151, -0.38100821, 
-0.35446009, -0.44900352, -0.50293083,\n", - " -0.4741784 , -0.63317757, -0.99257731, -0.99269226, -0.99295157,\n", - " -0.99284285, -0.99329247, -0.9933479 , -0.99380612, -0.28723404,\n", - " -0.3164557 , -0.47301587, -0.35956633, -0.44827586, -0.37931034,\n", - " -0.34693878, -0.44827586, -0.52054795, -0.47586207, -0.62758621,\n", - " -0.46473029],\n", - " [-0.26611958, -0.26611958, -0.29107981, -0.34349355, -0.3950762 ,\n", - " -0.29577465, -0.43925234, -0.9925477 , -0.99266278, -0.99292211,\n", - " -0.99281601, -0.99326575, -0.99331805, -0.99378316, -0.28723404,\n", - " -0.39240506, -0.46031746, -0.35950873, -0.40689655, -0.37931034,\n", - " -0.34693878, -0.40689655, -0.43835616, -0.35172414, -0.51724138,\n", - " -0.40248963],\n", - " [-0.46307151, -0.35990621, -0.43192488, -0.36928488, -0.47245018,\n", - " -0.44600939, -0.41121495, -0.99252091, -0.9926333 , -0.99289592,\n", - " -0.99278789, -0.99324104, -0.99329275, -0.99375547, -0.28723404,\n", - " -0.43037975, -0.46666667, -0.35945292, -0.46206897, -0.37931034,\n", - " -0.41496599, -0.37931034, -0.47945205, -0.44827586, -0.42068966,\n", - " -0.42116183],\n", - " [-0.44431419, -0.4021102 , -0.38732394, -0.3059789 , -0.35990621,\n", - " -0.28403756, -0.40420561, -0.99249341, -0.99260583, -0.99287039,\n", - " -0.99275913, -0.99321298, -0.99326226, -0.99372846, -0.30851064,\n", - " -0.4556962 , -0.46031746, -0.35939572, -0.44827586, -0.42068966,\n", - " -0.42857143, -0.36551724, -0.4109589 , -0.33793103, -0.43448276,\n", - " -0.406639 ],\n", - " [-0.43962485, -0.36459555, -0.35211268, -0.35052755, -0.44665885,\n", - " -0.34741784, -0.44859813, -0.99246592, -0.99257703, -0.99284028,\n", - " -0.99273037, -0.99318693, -0.99323176, -0.99370279, -0.28723404,\n", - " -0.36708861, -0.48571429, -0.35933712, -0.44827586, -0.39310345,\n", - " -0.33333333, -0.36551724, -0.45205479, -0.33793103, -0.46206897,\n", - " -0.39211618],\n", - " [-0.2028136 , -0.25439625, -0.30751174, -0.3130129 , -0.37631887,\n", - " -0.3685446 , -0.46495327, -0.99243067, -0.99254152, -0.9928082 ,\n", - " -0.99269906, -0.99315821, -0.99320322, -0.99367781, -0.27659574,\n", - " -0.32911392, -0.47301587, -0.35927332, -0.29655172, -0.25517241,\n", - " -0.29251701, -0.31034483, -0.39726027, -0.37931034, -0.47586207,\n", - " -0.33817427],\n", - " [-0.23329426, -0.27080891, -0.31924883, -0.24736225, -0.35521688,\n", - " -0.33098592, -0.4182243 , -0.99239753, -0.99250668, -0.99277743,\n", - " -0.99266518, -0.99312815, -0.99317272, -0.99365012, -0.26595745,\n", - " -0.40506329, -0.46666667, -0.35920811, -0.33793103, -0.26896552,\n", - " -0.31972789, -0.25517241, -0.36986301, -0.33793103, -0.42068966,\n", - " -0.32365145],\n", - " [-0.12778429, -0.11137163, -0.10798122, -0.05275498, -0.25439625,\n", - " -0.23474178, -0.28271028, -0.99236228, -0.99247117, -0.99274143,\n", - " -0.99263131, -0.99309876, -0.99314028, -0.99362108, -0.24468085,\n", - " -0.32911392, -0.43492063, -0.35914011, -0.29655172, -0.25517241,\n", - " -0.21088435, -0.25517241, -0.38356164, -0.29655172, -0.39310345,\n", - " -0.29460581],\n", - " [-0.14185229, -0.2028136 , -0.31690141, -0.17467761, -0.24970692,\n", - " -0.25117371, -0.37383178, -0.9923242 , -0.99243567, -0.99271066,\n", - " -0.9925968 , -0.9930667 , -0.99310849, -0.99359204, -0.22340426,\n", - " -0.3164557 , -0.41587302, -0.35907171, -0.24137931, -0.25517241,\n", - " -0.31972789, -0.24137931, -0.32876712, -0.31034483, -0.39310345,\n", - " -0.29045643],\n", - " [-0.4021102 , -0.32708089, -0.33802817, -0.28018757, -0.3950762 ,\n", - " -0.40140845, 
-0.48364486, -0.99229459, -0.99240284, -0.99268055,\n", - " -0.99256421, -0.99303731, -0.99308059, -0.99356773, -0.25531915,\n", - " -0.29113924, -0.40952381, -0.35901131, -0.40689655, -0.31034483,\n", - " -0.33333333, -0.28275862, -0.38356164, -0.39310345, -0.48965517,\n", - " -0.37344398],\n", - " [-0.27549824, -0.3059789 , -0.37089202, -0.20046893, -0.34818288,\n", - " -0.33802817, -0.42056075, -0.99225863, -0.99237068, -0.99265109,\n", - " -0.99252778, -0.99300725, -0.99305075, -0.99354072, -0.28723404,\n", - " -0.41772152, -0.48571429, -0.3589459 , -0.28275862, -0.32413793,\n", - " -0.34693878, -0.2 , -0.36986301, -0.35172414, -0.43448276,\n", - " -0.32157676],\n", - " [-0.30832356, -0.3059789 , -0.3286385 , -0.31066823, -0.32473623,\n", - " -0.34741784, -0.38785047, -0.99222479, -0.99233786, -0.99262032,\n", - " -0.9924971 , -0.99297519, -0.9930209 , -0.99351168, -0.28723404,\n", - " -0.3164557 , -0.47936508, -0.3588813 , -0.32413793, -0.31034483,\n", - " -0.31972789, -0.32413793, -0.32876712, -0.35172414, -0.39310345,\n", - " -0.32987552],\n", - " [-0.33645955, -0.2098476 , -0.24413146, -0.2919109 , -0.41383353,\n", - " -0.41079812, -0.46495327, -0.99219025, -0.99230168, -0.99258563,\n", - " -0.99246579, -0.99294781, -0.99299365, -0.9934867 , -0.24468085,\n", - " -0.29113924, -0.42857143, -0.3588177 , -0.31034483, -0.24137931,\n", - " -0.23809524, -0.31034483, -0.42465753, -0.40689655, -0.47586207,\n", - " -0.34024896],\n", - " [-0.24267292, -0.15357562, -0.19248826, -0.13950762, -0.35052755,\n", - " -0.30046948, -0.37616822, -0.99215358, -0.99226215, -0.99254831,\n", - " -0.99242872, -0.99291708, -0.99296121, -0.99345766, -0.22340426,\n", - " -0.25316456, -0.42857143, -0.3587457 , -0.26896552, -0.17241379,\n", - " -0.18367347, -0.1862069 , -0.35616438, -0.29655172, -0.39310345,\n", - " -0.25311203],\n", - " [-0.2989449 , -0.26377491, -0.27699531, -0.15592028, -0.34583822,\n", - " -0.34976526, -0.48831776, -0.99211763, -0.99222731, -0.99251493,\n", - " -0.99239038, -0.99288636, -0.99293072, -0.99343267, -0.20212766,\n", - " -0.24050633, -0.3968254 , -0.35867929, -0.28275862, -0.26896552,\n", - " -0.26530612, -0.15862069, -0.35616438, -0.33793103, -0.47586207,\n", - " -0.31120332]])" + "array([[-0.58793576, -0.60305643, -0.63981971, -0.61481191, -0.69823875,\n", + " -0.65021543, -0.68912322, -0.99436914, -0.99439755, -0.99454249,\n", + " -0.99446788, -0.99476185, -0.99490997, -0.99529511, -0.34701493,\n", + " -0.33886256, -0.33860532, -0.36301186, -0.57943925, -0.59250585,\n", + " -0.6323185 , -0.60609613, -0.69284877, -0.64444444, -0.68691589,\n", + " -0.63853752],\n", + " [-0.56600078, -0.5846395 , -0.63002156, -0.61559561, -0.70880626,\n", + " -0.66392479, -0.69732474, -0.9942427 , -0.99427986, -0.9944408 ,\n", + " -0.99436498, -0.99468147, -0.99482011, -0.99521249, -0.33955224,\n", + " -0.31516588, -0.38892883, -0.36280656, -0.55841121, -0.57611241,\n", + " -0.62295082, -0.61078546, -0.70222743, -0.65847953, -0.69392523,\n", + " -0.63645815],\n", + " [-0.64081473, -0.64184953, -0.67038997, -0.63597179, -0.71350294,\n", + " -0.65844105, -0.66764304, -0.99412236, -0.99416864, -0.99434228,\n", + " -0.99426059, -0.99459663, -0.99472365, -0.99511795, -0.34328358,\n", + " -0.30094787, -0.36304817, -0.36259859, -0.63317757, -0.6323185 ,\n", + " -0.66042155, -0.62954279, -0.70926143, -0.65380117, -0.66588785,\n", + " -0.66002426],\n", + " [-0.73678026, -0.72139498, -0.72800314, -0.69239812, -0.71350294,\n", + " -0.68233451, -0.69732474, -0.99403811, -0.99408512, -0.9942623 ,\n", + " 
-0.99417111, -0.99451525, -0.99463206, -0.9950315 , -0.40671642,\n", + " -0.36018957, -0.44644141, -0.36242395, -0.72897196, -0.71194379,\n", + " -0.71896956, -0.68347011, -0.70926143, -0.6748538 , -0.69392523,\n", + " -0.71027552],\n", + " [-0.75401488, -0.74333856, -0.75112679, -0.71590909, -0.76555773,\n", + " -0.73599687, -0.75278266, -0.99395808, -0.99400684, -0.99419094,\n", + " -0.99409367, -0.99444556, -0.99455517, -0.99495418, -0.43656716,\n", + " -0.3957346 , -0.465133 , -0.36226933, -0.7453271 , -0.735363 ,\n", + " -0.74004684, -0.70926143, -0.76084408, -0.73099415, -0.75 ,\n", + " -0.7463178 ],\n", + " [-0.79866823, -0.76684953, -0.7558299 , -0.72688088, -0.76125245,\n", + " -0.75714845, -0.78363601, -0.99389098, -0.99393583, -0.99411958,\n", + " -0.99401538, -0.99437709, -0.99448423, -0.99489036, -0.43843284,\n", + " -0.37914692, -0.49388929, -0.36212623, -0.78971963, -0.75644028,\n", + " -0.7470726 , -0.72098476, -0.75615475, -0.7497076 , -0.78037383,\n", + " -0.76572518],\n", + " [-0.84919702, -0.83855799, -0.82245738, -0.78134796, -0.75225049,\n", + " -0.70661966, -0.65787932, -0.99384186, -0.99388279, -0.9940635 ,\n", + " -0.99395157, -0.9943113 , -0.99441264, -0.99481202, -0.51679104,\n", + " -0.50473934, -0.53414809, -0.36199904, -0.8411215 , -0.83138173,\n", + " -0.81264637, -0.77256741, -0.74677608, -0.70292398, -0.65654206,\n", + " -0.77438919],\n", + " [-0.69134352, -0.705721 , -0.73584166, -0.70297806, -0.75225049,\n", + " -0.72659616, -0.71724273, -0.99377229, -0.99381646, -0.99400032,\n", + " -0.99387925, -0.99423682, -0.99433003, -0.99471624, -0.43843284,\n", + " -0.40521327, -0.48094896, -0.36184615, -0.68457944, -0.69555035,\n", + " -0.72599532, -0.6975381 , -0.74677608, -0.71929825, -0.71261682,\n", + " -0.71893953],\n", + " [-0.84488837, -0.82915361, -0.83578287, -0.81896552, -0.86105675,\n", + " -0.8613396 , -0.86330795, -0.99369779, -0.99374656, -0.99393715,\n", + " -0.99381182, -0.99418494, -0.99427639, -0.99466379, -0.49253731,\n", + " -0.48104265, -0.51545651, -0.36172116, -0.8364486 , -0.81967213,\n", + " -0.82435597, -0.81008206, -0.85463072, -0.85497076, -0.86214953,\n", + " -0.84889967],\n", + " [-0.77908343, -0.78761755, -0.78757594, -0.78918495, -0.82348337,\n", + " -0.82491187, -0.85276313, -0.99365725, -0.99370625, -0.99389819,\n", + " -0.99377113, -0.99415254, -0.99424222, -0.99463329, -0.52798507,\n", + " -0.6042654 , -0.51545651, -0.36164779, -0.77336449, -0.77985948,\n", + " -0.78220141, -0.78429074, -0.86635404, -0.82222222, -0.85046729,\n", + " -0.81562987],\n", + " [-0.70544458, -0.64733542, -0.64844209, -0.61833856, -0.6481409 ,\n", + " -0.66392479, -0.71646163, -0.99356747, -0.99360832, -0.99380327,\n", + " -0.99367558, -0.99407272, -0.99415647, -0.99456035, -0.36567164,\n", + " -0.4549763 , -0.34291876, -0.36146698, -0.70560748, -0.63934426,\n", + " -0.63934426, -0.62016413, -0.64830012, -0.65847953, -0.72663551,\n", + " -0.66868827],\n", + " [-0.70387779, -0.67202194, -0.69508132, -0.72413793, -0.73228963,\n", + " -0.72816295, -0.72310096, -0.99348204, -0.99351955, -0.99372023,\n", + " -0.99359367, -0.99399256, -0.99407882, -0.99449203, -0.38432836,\n", + " -0.58530806, -0.33141625, -0.36130226, -0.69392523, -0.66042155,\n", + " -0.68384075, -0.71629543, -0.72801876, -0.72163743, -0.72196262,\n", + " -0.7113152 ],\n", + " [-0.8515472 , -0.81073668, -0.776602 , -0.76724138, -0.78277886,\n", + " -0.75832354, -0.74262839, -0.99341682, -0.99344607, -0.99364669,\n", + " -0.99352762, -0.99392743, -0.99401037, -0.99441763, -0.44029851,\n", + 
" -0.5521327 , -0.38461538, -0.36116102, -0.84345794, -0.80327869,\n", + " -0.76814988, -0.76084408, -0.77725674, -0.75204678, -0.73831776,\n", + " -0.7865188 ],\n", + " [-0.80258519, -0.83659875, -0.83499902, -0.79741379, -0.80821918,\n", + " -0.81629456, -0.79379028, -0.99336347, -0.99339091, -0.99358745,\n", + " -0.99346147, -0.9938642 , -0.99394733, -0.99434605, -0.44962687,\n", + " -0.6563981 , -0.34579439, -0.36103606, -0.79439252, -0.82669789,\n", + " -0.82669789, -0.78898007, -0.80304807, -0.81052632, -0.79205607,\n", + " -0.81632299],\n", + " [-0.83313749, -0.87539185, -0.90241035, -0.88440439, -0.86771037,\n", + " -0.87935762, -0.87580551, -0.99331764, -0.99335898, -0.99355602,\n", + " -0.99342259, -0.99382267, -0.99390959, -0.99430418, -0.54291045,\n", + " -0.72274882, -0.42918763, -0.36096002, -0.82943925, -0.87119438,\n", + " -0.89461358, -0.87573271, -0.86166471, -0.87134503, -0.87383178,\n", + " -0.88078323],\n", + " [-0.56678418, -0.60031348, -0.64295512, -0.78409091, -0.76164384,\n", + " -0.78535057, -0.82464362, -0.99321481, -0.99327557, -0.99349034,\n", + " -0.99337881, -0.9937915 , -0.99387347, -0.99427367, -0.32835821,\n", + " -0.47630332, -0.25808771, -0.36084678, -0.56074766, -0.59250585,\n", + " -0.6323185 , -0.77960141, -0.84759672, -0.78947368, -0.8364486 ,\n", + " -0.72621729],\n", + " [-0.77007442, -0.81230408, -0.83186361, -0.85540752, -0.85870841,\n", + " -0.86486486, -0.847686 , -0.99311634, -0.99319338, -0.99341516,\n", + " -0.99332651, -0.99374196, -0.99381551, -0.99422246, -0.46641791,\n", + " -0.65165877, -0.39324227, -0.36071245, -0.76168224, -0.80093677,\n", + " -0.82201405, -0.84759672, -0.85463072, -0.85730994, -0.84579439,\n", + " -0.83780974],\n", + " [-0.87622405, -0.92163009, -0.91377621, -0.89224138, -0.84540117,\n", + " -0.83431257, -0.82112869, -0.99306816, -0.99315821, -0.99338734,\n", + " -0.99329935, -0.99370611, -0.99377885, -0.9941789 , -0.55783582,\n", + " -0.65402844, -0.50970525, -0.36064058, -0.86682243, -0.91334895,\n", + " -0.90632319, -0.88745604, -0.84056272, -0.82923977, -0.81775701,\n", + " -0.87731762],\n", + " [-0.82843713, -0.83111285, -0.84166177, -0.8322884 , -0.84579256,\n", + " -0.8515472 , -0.86057411, -0.99302656, -0.99312426, -0.99335155,\n", + " -0.99325919, -0.99365991, -0.99373278, -0.99413129, -0.50559701,\n", + " -0.53791469, -0.52120776, -0.36055736, -0.82242991, -0.82201405,\n", + " -0.83138173, -0.82415006, -0.84056272, -0.84327485, -0.85747664,\n", + " -0.84508751],\n", + " [-0.74539757, -0.73824451, -0.76484421, -0.72100313, -0.73228963,\n", + " -0.70975323, -0.739504 , -0.99296569, -0.99306553, -0.99329699,\n", + " -0.9932005 , -0.99360224, -0.99367493, -0.99407862, -0.45149254,\n", + " -0.46208531, -0.48382459, -0.36044105, -0.73598131, -0.73067916,\n", + " -0.75644028, -0.71629543, -0.72801876, -0.70526316, -0.73831776,\n", + " -0.73696067],\n", + " [-0.40814728, -0.4596395 , -0.51087596, -0.46316614, -0.54598826,\n", + " -0.50607129, -0.57039641, -0.99283748, -0.99294147, -0.9931881 ,\n", + " -0.99308418, -0.99349681, -0.99356041, -0.99398047, -0.30597015,\n", + " -0.29383886, -0.34867002, -0.36020709, -0.46728972, -0.470726 ,\n", + " -0.5175644 , -0.48651817, -0.55685815, -0.51812865, -0.59579439,\n", + " -0.5179345 ],\n", + " [-0.47591069, -0.45219436, -0.48579267, -0.48981191, -0.57847358,\n", + " -0.54876616, -0.61882445, -0.99268659, -0.99280044, -0.99306033,\n", + " -0.99295359, -0.99338192, -0.99344287, -0.9938794 , -0.30223881,\n", + " -0.33649289, -0.32278936, -0.35994787, -0.49065421, 
-0.46370023,\n", + " -0.4941452 , -0.49589683, -0.58264947, -0.55321637, -0.62850467,\n", + " -0.53110379],\n", + " [-0.26792009, -0.27115987, -0.30080345, -0.24412226, -0.34246575,\n", + " -0.30434783, -0.40285101, -0.99250927, -0.99261854, -0.99288914,\n", + " -0.99278188, -0.99322495, -0.99327569, -0.9937324 , -0.22947761,\n", + " -0.28909953, -0.26096334, -0.35960139, -0.33878505, -0.29976581,\n", + " -0.32786885, -0.2919109 , -0.38100821, -0.32865497, -0.42523364,\n", + " -0.3394559 ],\n", + " [-0.31374853, -0.26449843, -0.2941407 , -0.23315047, -0.36516634,\n", + " -0.35957697, -0.44112478, -0.9923035 , -0.99241264, -0.99269787,\n", + " -0.99258055, -0.99304482, -0.99309553, -0.99356987, -0.2108209 ,\n", + " -0.21563981, -0.23652049, -0.35921021, -0.30607477, -0.26229508,\n", + " -0.29039813, -0.23563892, -0.35990621, -0.35204678, -0.43925234,\n", + " -0.32004852]])" ] }, - "execution_count": 39, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2331,22 +2457,21 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-01-18 05:32:48.464559: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-01-18 05:32:48.495873: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fba31d9b0c0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", - "2022-01-18 05:32:48.495892: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + "2023-04-13 18:15:22.843587: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", + "2023-04-13 18:15:22.880000: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f80a9a82ef0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", + "2023-04-13 18:15:22.880015: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" ] } ], "source": [ - "step = 9\n", + "step = 10\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] } @@ -2367,7 +2492,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/tutorials/pipelines/lstm_regressor_with_unstack.ipynb b/tutorials/pipelines/lstm_regressor_with_unstack.ipynb index 516c6da..41a5af8 100644 --- a/tutorials/pipelines/lstm_regressor_with_unstack.ipynb +++ b/tutorials/pipelines/lstm_regressor_with_unstack.ipynb @@ -17,12 +17,12 @@ "source": [ "from draco.demo import load_demo\n", "\n", - "train_target_times, test_target_times, readings = load_demo()" + "target_times, readings = load_demo()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "80315927", "metadata": {}, "outputs": [], @@ -32,7 +32,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "1073a88a", "metadata": {}, "outputs": [], @@ -44,10 +44,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "1c6cb15d", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "['pandas.DataFrame.resample',\n", + " 'pandas.DataFrame.unstack',\n", + " 'pandas.DataFrame.pop',\n", + " 'pandas.DataFrame.pop',\n", + " 'sklearn.impute.SimpleImputer',\n", + " 'sklearn.preprocessing.MinMaxScaler',\n", + " 'pandas.DataFrame',\n", + " 'pandas.DataFrame.set',\n", + " 'pandas.DataFrame.set',\n", + " 'mlstars.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'keras.Sequential.LSTMTimeSeriesRegressor']" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "pipeline.template['primitives']" ] @@ -70,7 +91,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "a2396b1c", "metadata": {}, "outputs": [ @@ -96,61 +117,61 @@ " \n", " \n", " turbine_id\n", - " timestamp\n", " signal_id\n", + " timestamp\n", " value\n", " \n", " \n", " \n", " \n", " 0\n", - " 1\n", - " 2013-01-12 00:10:00\n", - " operational setting 1\n", - " -0.0007\n", + " T001\n", + " S01\n", + " 2013-01-10\n", + " 323.0\n", " \n", " \n", " 1\n", - " 1\n", - " 2013-01-12 00:20:00\n", - " operational setting 1\n", - " 0.0019\n", + " T001\n", + " S02\n", + " 2013-01-10\n", + " 320.0\n", " \n", " \n", " 2\n", - " 1\n", - " 2013-01-12 00:30:00\n", - " operational setting 1\n", - " -0.0043\n", + " T001\n", + " S03\n", + " 2013-01-10\n", + " 284.0\n", " \n", " \n", " 3\n", - " 1\n", - " 2013-01-12 00:40:00\n", - " operational setting 1\n", - " 0.0007\n", + " T001\n", + " S04\n", + " 2013-01-10\n", + " 348.0\n", " \n", " \n", " 4\n", - " 1\n", - " 2013-01-12 00:50:00\n", - " operational setting 1\n", - " -0.0019\n", + " T001\n", + " S05\n", + " 2013-01-10\n", + " 273.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id timestamp signal_id value\n", - "0 1 2013-01-12 00:10:00 operational setting 1 -0.0007\n", - "1 1 2013-01-12 00:20:00 operational setting 1 0.0019\n", - "2 1 2013-01-12 00:30:00 operational setting 1 -0.0043\n", - "3 1 2013-01-12 00:40:00 operational setting 1 0.0007\n", - "4 1 2013-01-12 00:50:00 operational setting 1 -0.0019" + " turbine_id signal_id timestamp value\n", + "0 T001 S01 2013-01-10 323.0\n", + "1 T001 S02 2013-01-10 320.0\n", + "2 T001 S03 2013-01-10 
284.0\n", + "3 T001 S04 2013-01-10 348.0\n", + "4 T001 S05 2013-01-10 273.0" ] }, - "execution_count": 2, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -161,7 +182,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 6, "id": "3cd80f1f", "metadata": {}, "outputs": [ @@ -194,60 +215,116 @@ " \n", " \n", " 0\n", - " 1\n", - " 2013-01-12 04:20:00\n", - " 166\n", + " T001\n", + " 2013-01-12\n", + " 0\n", " \n", " \n", " 1\n", - " 1\n", - " 2013-01-12 04:30:00\n", - " 165\n", + " T001\n", + " 2013-01-13\n", + " 0\n", " \n", " \n", " 2\n", - " 1\n", - " 2013-01-12 04:40:00\n", - " 164\n", + " T001\n", + " 2013-01-14\n", + " 0\n", " \n", " \n", " 3\n", + " T001\n", + " 2013-01-15\n", " 1\n", - " 2013-01-12 04:50:00\n", - " 163\n", " \n", " \n", " 4\n", - " 1\n", - " 2013-01-12 05:00:00\n", - " 162\n", + " T001\n", + " 2013-01-16\n", + " 0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id cutoff_time target\n", - "0 1 2013-01-12 04:20:00 166\n", - "1 1 2013-01-12 04:30:00 165\n", - "2 1 2013-01-12 04:40:00 164\n", - "3 1 2013-01-12 04:50:00 163\n", - "4 1 2013-01-12 05:00:00 162" + " turbine_id cutoff_time target\n", + "0 T001 2013-01-12 0\n", + "1 T001 2013-01-13 0\n", + "2 T001 2013-01-14 0\n", + "3 T001 2013-01-15 1\n", + "4 T001 2013-01-16 0" ] }, - "execution_count": 39, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "train_target_times.head()" + "target_times.head()" + ] + }, + { + "cell_type": "markdown", + "id": "a956f746", + "metadata": {}, + "source": [ + "## Data Preparation (part of Draco Pipeline)\n", + "\n", + "* Input: target_times, readings, turbines\n", + "* Output: X, y, readings, turbines\n", + "* Effect: target_times has been split into X and y" + ] + }, + { + "cell_type": "markdown", + "id": "a813a966", + "metadata": {}, + "source": [ + "## pandas.DataFrame.resample\n", + "\n", + "* Input: readings\n", + "* Output: readings (resampled)\n", + "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", + " signal_id and timestamp have been set as a multi-index" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "6a759b57", + "execution_count": 7, + "id": "bb00b3b8", + "metadata": {}, + "outputs": [], + "source": [ + "context = pipeline.fit(target_times, readings, output_=0)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "381e361d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b41f13c1", "metadata": {}, "outputs": [ { @@ -271,111 +348,69 @@ " \n", " \n", " \n", + " \n", + " \n", + " value\n", + " \n", + " \n", " turbine_id\n", - " cutoff_time\n", - " target\n", + " signal_id\n", + " timestamp\n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", - " 1\n", - " 2013-01-13 13:10:00\n", - " 112.0\n", + " T001\n", + " S01\n", + " 2013-01-10 00:00:00\n", + " 323.0\n", " \n", " \n", - " 1\n", - " 2\n", - " 2013-01-14 08:00:00\n", - " 98.0\n", + " 2013-01-10 00:10:00\n", + " 346.0\n", " \n", " \n", - " 2\n", - " 3\n", - " 2013-01-14 02:50:00\n", - " 69.0\n", + " 2013-01-10 00:20:00\n", + " 407.0\n", " \n", " \n", - " 3\n", - " 4\n", - " 2013-01-14 01:10:00\n", - " 82.0\n", + " 2013-01-10 00:30:00\n", + " 257.0\n", " \n", " \n", - " 4\n", - " 5\n", - " 
2013-01-14 13:10:00\n", - " 91.0\n", + " 2013-01-10 00:40:00\n", + " 267.0\n", " \n", " \n", "\n", "" ], "text/plain": [ - " turbine_id cutoff_time target\n", - "0 1 2013-01-13 13:10:00 112.0\n", - "1 2 2013-01-14 08:00:00 98.0\n", - "2 3 2013-01-14 02:50:00 69.0\n", - "3 4 2013-01-14 01:10:00 82.0\n", - "4 5 2013-01-14 13:10:00 91.0" + " value\n", + "turbine_id signal_id timestamp \n", + "T001 S01 2013-01-10 00:00:00 323.0\n", + " 2013-01-10 00:10:00 346.0\n", + " 2013-01-10 00:20:00 407.0\n", + " 2013-01-10 00:30:00 257.0\n", + " 2013-01-10 00:40:00 267.0" ] }, - "execution_count": 40, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "test_target_times.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "feb3daa6", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "training shape (18131, 3)\n", - "testing shape (100, 3)\n" - ] - } - ], - "source": [ - "print(\"training shape\", train_target_times.shape)\n", - "print(\"testing shape\", test_target_times.shape)" - ] - }, - { - "cell_type": "markdown", - "id": "a956f746", - "metadata": {}, - "source": [ - "## Data Preparation (part of Draco Pipeline)\n", - "\n", - "* Input: target_times, readings, turbines\n", - "* Output: X, y, readings, turbines\n", - "* Effect: target_times has been split into X and y" + "context['readings'].head()" ] }, { "cell_type": "markdown", - "id": "a813a966", + "id": "4b46bf1f", "metadata": {}, "source": [ - "## mlblocks.MLPipeline\n", - "\n", - "### pandas.DataFrame.resample\n", - "\n", - "* Input: readings\n", - "* Output: readings (resampled)\n", - "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index\n", - " \n", - "### pandas.DataFrame.unstack\n", + "## pandas.DataFrame.unstack\n", "\n", "* Input: readings (resampled)\n", "* Output: readings (unstacked)\n", @@ -384,18 +419,19 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "bb00b3b8", + "execution_count": 10, + "id": "bb0bac75", "metadata": {}, "outputs": [], "source": [ - "context = pipeline.fit(train_target_times, readings, output_=0)" + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "381e361d", + "execution_count": 11, + "id": "1009407e", "metadata": {}, "outputs": [ { @@ -404,7 +440,7 @@ "dict_keys(['readings', 'turbines', 'X', 'y'])" ] }, - "execution_count": 6, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -415,8 +451,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "b41f13c1", + "execution_count": 12, + "id": "83855579", "metadata": {}, "outputs": [ { @@ -442,228 +478,186 @@ " \n", " turbine_id\n", " timestamp\n", - " value_operational setting 1\n", - " value_operational setting 2\n", - " value_operational setting 3\n", - " value_sensor measurement 1\n", - " value_sensor measurement 10\n", - " value_sensor measurement 11\n", - " value_sensor measurement 12\n", - " value_sensor measurement 13\n", + " value_S01\n", + " value_S02\n", + " value_S03\n", + " value_S04\n", + " value_S05\n", + " value_S06\n", + " value_S07\n", + " value_S08\n", " ...\n", - " value_sensor measurement 2\n", - " value_sensor measurement 20\n", - " value_sensor measurement 21\n", - " value_sensor measurement 3\n", - " value_sensor measurement 4\n", - " value_sensor measurement 5\n", - " value_sensor measurement 6\n", - " 
value_sensor measurement 7\n", - " value_sensor measurement 8\n", - " value_sensor measurement 9\n", + " value_S17\n", + " value_S18\n", + " value_S19\n", + " value_S20\n", + " value_S21\n", + " value_S22\n", + " value_S23\n", + " value_S24\n", + " value_S25\n", + " value_S26\n", " \n", " \n", " \n", " \n", " 0\n", - " 1\n", - " 2013-01-12 00:10:00\n", - " -0.0007\n", - " -0.0004\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.47\n", - " 521.66\n", - " 2388.02\n", + " T001\n", + " 2013-01-10 00:00:00\n", + " 323.0\n", + " 320.0\n", + " 284.0\n", + " 348.0\n", + " 273.0\n", + " 342.0\n", + " 280.0\n", + " 3197842.0\n", " ...\n", - " 641.82\n", - " 39.06\n", - " 23.4190\n", - " 1589.70\n", - " 1400.60\n", - " 14.62\n", - " 21.61\n", - " 554.36\n", - " 2388.06\n", - " 9046.19\n", + " 11.7\n", + " 3131020.0\n", + " 55.0\n", + " 55.0\n", + " 47.0\n", + " 58.0\n", + " 45.0\n", + " 58.0\n", + " 47.0\n", + " 356.0\n", " \n", " \n", " 1\n", - " 1\n", - " 2013-01-12 00:20:00\n", - " 0.0019\n", - " -0.0003\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.49\n", - " 522.28\n", - " 2388.07\n", + " T001\n", + " 2013-01-10 00:10:00\n", + " 346.0\n", + " 384.0\n", + " 367.0\n", + " 411.0\n", + " 331.0\n", + " 360.0\n", + " 249.0\n", + " 3197900.0\n", " ...\n", - " 642.15\n", - " 39.00\n", - " 23.4236\n", - " 1591.82\n", - " 1403.14\n", - " 14.62\n", - " 21.61\n", - " 553.75\n", - " 2388.04\n", - " 9044.07\n", + " 10.2\n", + " 3131420.0\n", + " 58.0\n", + " 63.0\n", + " 62.0\n", + " 67.0\n", + " 55.0\n", + " 61.0\n", + " 42.0\n", + " 400.0\n", " \n", " \n", " 2\n", - " 1\n", - " 2013-01-12 00:30:00\n", - " -0.0043\n", - " 0.0003\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.27\n", - " 522.42\n", - " 2388.03\n", + " T001\n", + " 2013-01-10 00:20:00\n", + " 407.0\n", + " 363.0\n", + " 407.0\n", + " 393.0\n", + " 275.0\n", + " 335.0\n", + " 270.0\n", + " 3197968.0\n", " ...\n", - " 642.35\n", - " 38.95\n", - " 23.3442\n", - " 1587.99\n", - " 1404.20\n", - " 14.62\n", - " 21.61\n", - " 554.26\n", - " 2388.08\n", - " 9052.94\n", + " 9.5\n", + " 3131822.0\n", + " 68.0\n", + " 61.0\n", + " 67.0\n", + " 66.0\n", + " 46.0\n", + " 55.0\n", + " 45.0\n", + " 402.0\n", " \n", " \n", " 3\n", - " 1\n", - " 2013-01-12 00:40:00\n", - " 0.0007\n", - " 0.0000\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.13\n", - " 522.86\n", - " 2388.08\n", + " T001\n", + " 2013-01-10 00:30:00\n", + " 257.0\n", + " 307.0\n", + " 315.0\n", + " 361.0\n", + " 317.0\n", + " 354.0\n", + " 271.0\n", + " 3198011.0\n", " ...\n", - " 642.35\n", - " 38.88\n", - " 23.3739\n", - " 1582.79\n", - " 1401.87\n", - " 14.62\n", - " 21.61\n", - " 554.45\n", - " 2388.11\n", - " 9049.48\n", + " 10.5\n", + " 3132179.0\n", + " 43.0\n", + " 51.0\n", + " 53.0\n", + " 62.0\n", + " 53.0\n", + " 60.0\n", + " 45.0\n", + " 357.0\n", " \n", " \n", " 4\n", - " 1\n", - " 2013-01-12 00:50:00\n", - " -0.0019\n", - " -0.0002\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.28\n", - " 522.19\n", - " 2388.04\n", + " T001\n", + " 2013-01-10 00:40:00\n", + " 267.0\n", + " 309.0\n", + " 314.0\n", + " 355.0\n", + " 262.0\n", + " 246.0\n", + " 212.0\n", + " 3198056.0\n", " ...\n", - " 642.37\n", - " 38.90\n", - " 23.4044\n", - " 1582.85\n", - " 1406.22\n", - " 14.62\n", - " 21.61\n", - " 554.00\n", - " 2388.06\n", - " 9055.15\n", + " 9.6\n", + " 3132501.0\n", + " 45.0\n", + " 51.0\n", + " 54.0\n", + " 59.0\n", + " 43.0\n", + " 41.0\n", + " 36.0\n", + " 322.0\n", " \n", " \n", "\n", - "

5 rows × 26 columns

\n", + "

5 rows × 28 columns

\n", "" ], "text/plain": [ - " turbine_id timestamp value_operational setting 1 \\\n", - "0 1 2013-01-12 00:10:00 -0.0007 \n", - "1 1 2013-01-12 00:20:00 0.0019 \n", - "2 1 2013-01-12 00:30:00 -0.0043 \n", - "3 1 2013-01-12 00:40:00 0.0007 \n", - "4 1 2013-01-12 00:50:00 -0.0019 \n", - "\n", - " value_operational setting 2 value_operational setting 3 \\\n", - "0 -0.0004 100.0 \n", - "1 -0.0003 100.0 \n", - "2 0.0003 100.0 \n", - "3 0.0000 100.0 \n", - "4 -0.0002 100.0 \n", - "\n", - " value_sensor measurement 1 value_sensor measurement 10 \\\n", - "0 518.67 1.3 \n", - "1 518.67 1.3 \n", - "2 518.67 1.3 \n", - "3 518.67 1.3 \n", - "4 518.67 1.3 \n", - "\n", - " value_sensor measurement 11 value_sensor measurement 12 \\\n", - "0 47.47 521.66 \n", - "1 47.49 522.28 \n", - "2 47.27 522.42 \n", - "3 47.13 522.86 \n", - "4 47.28 522.19 \n", + " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", + "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", + "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", + "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", + "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", "\n", - " value_sensor measurement 13 ... value_sensor measurement 2 \\\n", - "0 2388.02 ... 641.82 \n", - "1 2388.07 ... 642.15 \n", - "2 2388.03 ... 642.35 \n", - "3 2388.08 ... 642.35 \n", - "4 2388.04 ... 642.37 \n", + " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", + "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", + "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", + "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", + "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", + "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", "\n", - " value_sensor measurement 20 value_sensor measurement 21 \\\n", - "0 39.06 23.4190 \n", - "1 39.00 23.4236 \n", - "2 38.95 23.3442 \n", - "3 38.88 23.3739 \n", - "4 38.90 23.4044 \n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", "\n", - " value_sensor measurement 3 value_sensor measurement 4 \\\n", - "0 1589.70 1400.60 \n", - "1 1591.82 1403.14 \n", - "2 1587.99 1404.20 \n", - "3 1582.79 1401.87 \n", - "4 1582.85 1406.22 \n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", "\n", - " value_sensor measurement 5 value_sensor measurement 6 \\\n", - "0 14.62 21.61 \n", - "1 14.62 21.61 \n", - "2 14.62 21.61 \n", - "3 14.62 21.61 \n", - "4 14.62 21.61 \n", - "\n", - " value_sensor measurement 7 value_sensor measurement 8 \\\n", - "0 554.36 2388.06 \n", - "1 553.75 2388.04 \n", - "2 554.26 2388.08 \n", - "3 554.45 2388.11 \n", - "4 554.00 2388.06 \n", - "\n", - " value_sensor measurement 9 \n", - "0 9046.19 \n", - "1 9044.07 \n", - "2 9052.94 \n", - "3 9049.48 \n", - "4 9055.15 \n", - "\n", - "[5 rows x 26 columns]" + "[5 rows x 28 columns]" ] }, - "execution_count": 7, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -686,19 +680,19 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "bb0bac75", + "execution_count": 13, + "id": "6a422d33", "metadata": {}, "outputs": [], "source": [ - "step = 1\n", + "step = 2\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { 
"cell_type": "code", - "execution_count": 9, - "id": "1009407e", + "execution_count": 14, + "id": "334867d7", "metadata": {}, "outputs": [ { @@ -707,7 +701,7 @@ "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" ] }, - "execution_count": 9, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -718,22 +712,22 @@ }, { "cell_type": "code", - "execution_count": 10, - "id": "93104c3b", + "execution_count": 15, + "id": "df9dbf59", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 1\n", - "1 1\n", - "2 1\n", - "3 1\n", - "4 1\n", - "Name: turbine_id, dtype: int64" + "0 T001\n", + "1 T001\n", + "2 T001\n", + "3 T001\n", + "4 T001\n", + "Name: turbine_id, dtype: object" ] }, - "execution_count": 10, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -744,8 +738,8 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "83855579", + "execution_count": 16, + "id": "7891112d", "metadata": {}, "outputs": [ { @@ -770,229 +764,187 @@ " \n", " \n", " timestamp\n", - " value_operational setting 1\n", - " value_operational setting 2\n", - " value_operational setting 3\n", - " value_sensor measurement 1\n", - " value_sensor measurement 10\n", - " value_sensor measurement 11\n", - " value_sensor measurement 12\n", - " value_sensor measurement 13\n", - " value_sensor measurement 14\n", + " value_S01\n", + " value_S02\n", + " value_S03\n", + " value_S04\n", + " value_S05\n", + " value_S06\n", + " value_S07\n", + " value_S08\n", + " value_S09\n", " ...\n", - " value_sensor measurement 2\n", - " value_sensor measurement 20\n", - " value_sensor measurement 21\n", - " value_sensor measurement 3\n", - " value_sensor measurement 4\n", - " value_sensor measurement 5\n", - " value_sensor measurement 6\n", - " value_sensor measurement 7\n", - " value_sensor measurement 8\n", - " value_sensor measurement 9\n", + " value_S17\n", + " value_S18\n", + " value_S19\n", + " value_S20\n", + " value_S21\n", + " value_S22\n", + " value_S23\n", + " value_S24\n", + " value_S25\n", + " value_S26\n", " \n", " \n", " \n", " \n", " 0\n", - " 2013-01-12 00:10:00\n", - " -0.0007\n", - " -0.0004\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.47\n", - " 521.66\n", - " 2388.02\n", - " 8138.62\n", + " 2013-01-10 00:00:00\n", + " 323.0\n", + " 320.0\n", + " 284.0\n", + " 348.0\n", + " 273.0\n", + " 342.0\n", + " 280.0\n", + " 3197842.0\n", + " 695000.0\n", " ...\n", - " 641.82\n", - " 39.06\n", - " 23.4190\n", - " 1589.70\n", - " 1400.60\n", - " 14.62\n", - " 21.61\n", - " 554.36\n", - " 2388.06\n", - " 9046.19\n", + " 11.7\n", + " 3131020.0\n", + " 55.0\n", + " 55.0\n", + " 47.0\n", + " 58.0\n", + " 45.0\n", + " 58.0\n", + " 47.0\n", + " 356.0\n", " \n", " \n", " 1\n", - " 2013-01-12 00:20:00\n", - " 0.0019\n", - " -0.0003\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.49\n", - " 522.28\n", - " 2388.07\n", - " 8131.49\n", + " 2013-01-10 00:10:00\n", + " 346.0\n", + " 384.0\n", + " 367.0\n", + " 411.0\n", + " 331.0\n", + " 360.0\n", + " 249.0\n", + " 3197900.0\n", + " 695063.0\n", " ...\n", - " 642.15\n", - " 39.00\n", - " 23.4236\n", - " 1591.82\n", - " 1403.14\n", - " 14.62\n", - " 21.61\n", - " 553.75\n", - " 2388.04\n", - " 9044.07\n", + " 10.2\n", + " 3131420.0\n", + " 58.0\n", + " 63.0\n", + " 62.0\n", + " 67.0\n", + " 55.0\n", + " 61.0\n", + " 42.0\n", + " 400.0\n", " \n", " \n", " 2\n", - " 2013-01-12 00:30:00\n", - " -0.0043\n", - " 0.0003\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.27\n", - " 522.42\n", - " 2388.03\n", - " 8133.23\n", + 
" 2013-01-10 00:20:00\n", + " 407.0\n", + " 363.0\n", + " 407.0\n", + " 393.0\n", + " 275.0\n", + " 335.0\n", + " 270.0\n", + " 3197968.0\n", + " 695124.0\n", " ...\n", - " 642.35\n", - " 38.95\n", - " 23.3442\n", - " 1587.99\n", - " 1404.20\n", - " 14.62\n", - " 21.61\n", - " 554.26\n", - " 2388.08\n", - " 9052.94\n", + " 9.5\n", + " 3131822.0\n", + " 68.0\n", + " 61.0\n", + " 67.0\n", + " 66.0\n", + " 46.0\n", + " 55.0\n", + " 45.0\n", + " 402.0\n", " \n", " \n", " 3\n", - " 2013-01-12 00:40:00\n", - " 0.0007\n", - " 0.0000\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.13\n", - " 522.86\n", - " 2388.08\n", - " 8133.83\n", + " 2013-01-10 00:30:00\n", + " 257.0\n", + " 307.0\n", + " 315.0\n", + " 361.0\n", + " 317.0\n", + " 354.0\n", + " 271.0\n", + " 3198011.0\n", + " 695175.0\n", " ...\n", - " 642.35\n", - " 38.88\n", - " 23.3739\n", - " 1582.79\n", - " 1401.87\n", - " 14.62\n", - " 21.61\n", - " 554.45\n", - " 2388.11\n", - " 9049.48\n", + " 10.5\n", + " 3132179.0\n", + " 43.0\n", + " 51.0\n", + " 53.0\n", + " 62.0\n", + " 53.0\n", + " 60.0\n", + " 45.0\n", + " 357.0\n", " \n", " \n", " 4\n", - " 2013-01-12 00:50:00\n", - " -0.0019\n", - " -0.0002\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.28\n", - " 522.19\n", - " 2388.04\n", - " 8133.80\n", + " 2013-01-10 00:40:00\n", + " 267.0\n", + " 309.0\n", + " 314.0\n", + " 355.0\n", + " 262.0\n", + " 246.0\n", + " 212.0\n", + " 3198056.0\n", + " 695226.0\n", " ...\n", - " 642.37\n", - " 38.90\n", - " 23.4044\n", - " 1582.85\n", - " 1406.22\n", - " 14.62\n", - " 21.61\n", - " 554.00\n", - " 2388.06\n", - " 9055.15\n", + " 9.6\n", + " 3132501.0\n", + " 45.0\n", + " 51.0\n", + " 54.0\n", + " 59.0\n", + " 43.0\n", + " 41.0\n", + " 36.0\n", + " 322.0\n", " \n", " \n", "\n", - "

5 rows × 25 columns

\n", + "

5 rows × 27 columns

\n", "" ], "text/plain": [ - " timestamp value_operational setting 1 \\\n", - "0 2013-01-12 00:10:00 -0.0007 \n", - "1 2013-01-12 00:20:00 0.0019 \n", - "2 2013-01-12 00:30:00 -0.0043 \n", - "3 2013-01-12 00:40:00 0.0007 \n", - "4 2013-01-12 00:50:00 -0.0019 \n", - "\n", - " value_operational setting 2 value_operational setting 3 \\\n", - "0 -0.0004 100.0 \n", - "1 -0.0003 100.0 \n", - "2 0.0003 100.0 \n", - "3 0.0000 100.0 \n", - "4 -0.0002 100.0 \n", - "\n", - " value_sensor measurement 1 value_sensor measurement 10 \\\n", - "0 518.67 1.3 \n", - "1 518.67 1.3 \n", - "2 518.67 1.3 \n", - "3 518.67 1.3 \n", - "4 518.67 1.3 \n", - "\n", - " value_sensor measurement 11 value_sensor measurement 12 \\\n", - "0 47.47 521.66 \n", - "1 47.49 522.28 \n", - "2 47.27 522.42 \n", - "3 47.13 522.86 \n", - "4 47.28 522.19 \n", - "\n", - " value_sensor measurement 13 value_sensor measurement 14 ... \\\n", - "0 2388.02 8138.62 ... \n", - "1 2388.07 8131.49 ... \n", - "2 2388.03 8133.23 ... \n", - "3 2388.08 8133.83 ... \n", - "4 2388.04 8133.80 ... \n", - "\n", - " value_sensor measurement 2 value_sensor measurement 20 \\\n", - "0 641.82 39.06 \n", - "1 642.15 39.00 \n", - "2 642.35 38.95 \n", - "3 642.35 38.88 \n", - "4 642.37 38.90 \n", + " timestamp value_S01 value_S02 value_S03 value_S04 value_S05 \\\n", + "0 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 273.0 \n", + "1 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 331.0 \n", + "2 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 275.0 \n", + "3 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 317.0 \n", + "4 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 262.0 \n", "\n", - " value_sensor measurement 21 value_sensor measurement 3 \\\n", - "0 23.4190 1589.70 \n", - "1 23.4236 1591.82 \n", - "2 23.3442 1587.99 \n", - "3 23.3739 1582.79 \n", - "4 23.4044 1582.85 \n", + " value_S06 value_S07 value_S08 value_S09 ... value_S17 value_S18 \\\n", + "0 342.0 280.0 3197842.0 695000.0 ... 11.7 3131020.0 \n", + "1 360.0 249.0 3197900.0 695063.0 ... 10.2 3131420.0 \n", + "2 335.0 270.0 3197968.0 695124.0 ... 9.5 3131822.0 \n", + "3 354.0 271.0 3198011.0 695175.0 ... 10.5 3132179.0 \n", + "4 246.0 212.0 3198056.0 695226.0 ... 
9.6 3132501.0 \n", "\n", - " value_sensor measurement 4 value_sensor measurement 5 \\\n", - "0 1400.60 14.62 \n", - "1 1403.14 14.62 \n", - "2 1404.20 14.62 \n", - "3 1401.87 14.62 \n", - "4 1406.22 14.62 \n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", "\n", - " value_sensor measurement 6 value_sensor measurement 7 \\\n", - "0 21.61 554.36 \n", - "1 21.61 553.75 \n", - "2 21.61 554.26 \n", - "3 21.61 554.45 \n", - "4 21.61 554.00 \n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", "\n", - " value_sensor measurement 8 value_sensor measurement 9 \n", - "0 2388.06 9046.19 \n", - "1 2388.04 9044.07 \n", - "2 2388.08 9052.94 \n", - "3 2388.11 9049.48 \n", - "4 2388.06 9055.15 \n", - "\n", - "[5 rows x 25 columns]" + "[5 rows x 27 columns]" ] }, - "execution_count": 11, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1015,18 +967,18 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "id": "ebcad5cd", "metadata": {}, "outputs": [], "source": [ - "step = 2\n", + "step = 3\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "id": "d497ab07", "metadata": {}, "outputs": [ @@ -1036,7 +988,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" ] }, - "execution_count": 13, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1047,22 +999,22 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "id": "2c3bfa0b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0 2013-01-12 00:10:00\n", - "1 2013-01-12 00:20:00\n", - "2 2013-01-12 00:30:00\n", - "3 2013-01-12 00:40:00\n", - "4 2013-01-12 00:50:00\n", + "0 2013-01-10 00:00:00\n", + "1 2013-01-10 00:10:00\n", + "2 2013-01-10 00:20:00\n", + "3 2013-01-10 00:30:00\n", + "4 2013-01-10 00:40:00\n", "Name: timestamp, dtype: datetime64[ns]" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -1073,7 +1025,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 20, "id": "3c837b44", "metadata": {}, "outputs": [ @@ -1098,230 +1050,188 @@ " \n", " \n", " \n", - " value_operational setting 1\n", - " value_operational setting 2\n", - " value_operational setting 3\n", - " value_sensor measurement 1\n", - " value_sensor measurement 10\n", - " value_sensor measurement 11\n", - " value_sensor measurement 12\n", - " value_sensor measurement 13\n", - " value_sensor measurement 14\n", - " value_sensor measurement 15\n", + " value_S01\n", + " value_S02\n", + " value_S03\n", + " value_S04\n", + " value_S05\n", + " value_S06\n", + " value_S07\n", + " value_S08\n", + " value_S09\n", + " value_S10\n", " ...\n", - " value_sensor measurement 2\n", - " value_sensor measurement 20\n", - " value_sensor measurement 21\n", - " value_sensor measurement 3\n", - " value_sensor measurement 4\n", - " value_sensor measurement 5\n", - " value_sensor measurement 6\n", - " value_sensor measurement 7\n", - " value_sensor measurement 8\n", - " value_sensor measurement 9\n", + " value_S17\n", + " value_S18\n", + " value_S19\n", + " value_S20\n", + " value_S21\n", + " 
value_S22\n", + " value_S23\n", + " value_S24\n", + " value_S25\n", + " value_S26\n", " \n", " \n", " \n", " \n", " 0\n", - " -0.0007\n", - " -0.0004\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.47\n", - " 521.66\n", - " 2388.02\n", - " 8138.62\n", - " 8.4195\n", + " 323.0\n", + " 320.0\n", + " 284.0\n", + " 348.0\n", + " 273.0\n", + " 342.0\n", + " 280.0\n", + " 3197842.0\n", + " 695000.0\n", + " 3348234.0\n", " ...\n", - " 641.82\n", - " 39.06\n", - " 23.4190\n", - " 1589.70\n", - " 1400.60\n", - " 14.62\n", - " 21.61\n", - " 554.36\n", - " 2388.06\n", - " 9046.19\n", + " 11.7\n", + " 3131020.0\n", + " 55.0\n", + " 55.0\n", + " 47.0\n", + " 58.0\n", + " 45.0\n", + " 58.0\n", + " 47.0\n", + " 356.0\n", " \n", " \n", " 1\n", - " 0.0019\n", - " -0.0003\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.49\n", - " 522.28\n", - " 2388.07\n", - " 8131.49\n", - " 8.4318\n", + " 346.0\n", + " 384.0\n", + " 367.0\n", + " 411.0\n", + " 331.0\n", + " 360.0\n", + " 249.0\n", + " 3197900.0\n", + " 695063.0\n", + " 3348296.0\n", " ...\n", - " 642.15\n", - " 39.00\n", - " 23.4236\n", - " 1591.82\n", - " 1403.14\n", - " 14.62\n", - " 21.61\n", - " 553.75\n", - " 2388.04\n", - " 9044.07\n", + " 10.2\n", + " 3131420.0\n", + " 58.0\n", + " 63.0\n", + " 62.0\n", + " 67.0\n", + " 55.0\n", + " 61.0\n", + " 42.0\n", + " 400.0\n", " \n", " \n", " 2\n", - " -0.0043\n", - " 0.0003\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.27\n", - " 522.42\n", - " 2388.03\n", - " 8133.23\n", - " 8.4178\n", + " 407.0\n", + " 363.0\n", + " 407.0\n", + " 393.0\n", + " 275.0\n", + " 335.0\n", + " 270.0\n", + " 3197968.0\n", + " 695124.0\n", + " 3348363.0\n", " ...\n", - " 642.35\n", - " 38.95\n", - " 23.3442\n", - " 1587.99\n", - " 1404.20\n", - " 14.62\n", - " 21.61\n", - " 554.26\n", - " 2388.08\n", - " 9052.94\n", + " 9.5\n", + " 3131822.0\n", + " 68.0\n", + " 61.0\n", + " 67.0\n", + " 66.0\n", + " 46.0\n", + " 55.0\n", + " 45.0\n", + " 402.0\n", " \n", " \n", " 3\n", - " 0.0007\n", - " 0.0000\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.13\n", - " 522.86\n", - " 2388.08\n", - " 8133.83\n", - " 8.3682\n", + " 257.0\n", + " 307.0\n", + " 315.0\n", + " 361.0\n", + " 317.0\n", + " 354.0\n", + " 271.0\n", + " 3198011.0\n", + " 695175.0\n", + " 3348416.0\n", " ...\n", - " 642.35\n", - " 38.88\n", - " 23.3739\n", - " 1582.79\n", - " 1401.87\n", - " 14.62\n", - " 21.61\n", - " 554.45\n", - " 2388.11\n", - " 9049.48\n", + " 10.5\n", + " 3132179.0\n", + " 43.0\n", + " 51.0\n", + " 53.0\n", + " 62.0\n", + " 53.0\n", + " 60.0\n", + " 45.0\n", + " 357.0\n", " \n", " \n", " 4\n", - " -0.0019\n", - " -0.0002\n", - " 100.0\n", - " 518.67\n", - " 1.3\n", - " 47.28\n", - " 522.19\n", - " 2388.04\n", - " 8133.80\n", - " 8.4294\n", + " 267.0\n", + " 309.0\n", + " 314.0\n", + " 355.0\n", + " 262.0\n", + " 246.0\n", + " 212.0\n", + " 3198056.0\n", + " 695226.0\n", + " 3348470.0\n", " ...\n", - " 642.37\n", - " 38.90\n", - " 23.4044\n", - " 1582.85\n", - " 1406.22\n", - " 14.62\n", - " 21.61\n", - " 554.00\n", - " 2388.06\n", - " 9055.15\n", + " 9.6\n", + " 3132501.0\n", + " 45.0\n", + " 51.0\n", + " 54.0\n", + " 59.0\n", + " 43.0\n", + " 41.0\n", + " 36.0\n", + " 322.0\n", " \n", " \n", "\n", - "

5 rows × 24 columns

\n", + "

5 rows × 26 columns

\n", "" ], "text/plain": [ - " value_operational setting 1 value_operational setting 2 \\\n", - "0 -0.0007 -0.0004 \n", - "1 0.0019 -0.0003 \n", - "2 -0.0043 0.0003 \n", - "3 0.0007 0.0000 \n", - "4 -0.0019 -0.0002 \n", - "\n", - " value_operational setting 3 value_sensor measurement 1 \\\n", - "0 100.0 518.67 \n", - "1 100.0 518.67 \n", - "2 100.0 518.67 \n", - "3 100.0 518.67 \n", - "4 100.0 518.67 \n", - "\n", - " value_sensor measurement 10 value_sensor measurement 11 \\\n", - "0 1.3 47.47 \n", - "1 1.3 47.49 \n", - "2 1.3 47.27 \n", - "3 1.3 47.13 \n", - "4 1.3 47.28 \n", - "\n", - " value_sensor measurement 12 value_sensor measurement 13 \\\n", - "0 521.66 2388.02 \n", - "1 522.28 2388.07 \n", - "2 522.42 2388.03 \n", - "3 522.86 2388.08 \n", - "4 522.19 2388.04 \n", - "\n", - " value_sensor measurement 14 value_sensor measurement 15 ... \\\n", - "0 8138.62 8.4195 ... \n", - "1 8131.49 8.4318 ... \n", - "2 8133.23 8.4178 ... \n", - "3 8133.83 8.3682 ... \n", - "4 8133.80 8.4294 ... \n", - "\n", - " value_sensor measurement 2 value_sensor measurement 20 \\\n", - "0 641.82 39.06 \n", - "1 642.15 39.00 \n", - "2 642.35 38.95 \n", - "3 642.35 38.88 \n", - "4 642.37 38.90 \n", + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 323.0 320.0 284.0 348.0 273.0 342.0 \n", + "1 346.0 384.0 367.0 411.0 331.0 360.0 \n", + "2 407.0 363.0 407.0 393.0 275.0 335.0 \n", + "3 257.0 307.0 315.0 361.0 317.0 354.0 \n", + "4 267.0 309.0 314.0 355.0 262.0 246.0 \n", "\n", - " value_sensor measurement 21 value_sensor measurement 3 \\\n", - "0 23.4190 1589.70 \n", - "1 23.4236 1591.82 \n", - "2 23.3442 1587.99 \n", - "3 23.3739 1582.79 \n", - "4 23.4044 1582.85 \n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 value_S18 \\\n", + "0 280.0 3197842.0 695000.0 3348234.0 ... 11.7 3131020.0 \n", + "1 249.0 3197900.0 695063.0 3348296.0 ... 10.2 3131420.0 \n", + "2 270.0 3197968.0 695124.0 3348363.0 ... 9.5 3131822.0 \n", + "3 271.0 3198011.0 695175.0 3348416.0 ... 10.5 3132179.0 \n", + "4 212.0 3198056.0 695226.0 3348470.0 ... 
9.6 3132501.0 \n", "\n", - " value_sensor measurement 4 value_sensor measurement 5 \\\n", - "0 1400.60 14.62 \n", - "1 1403.14 14.62 \n", - "2 1404.20 14.62 \n", - "3 1401.87 14.62 \n", - "4 1406.22 14.62 \n", + " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", + "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", + "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", + "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", + "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", + "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", "\n", - " value_sensor measurement 6 value_sensor measurement 7 \\\n", - "0 21.61 554.36 \n", - "1 21.61 553.75 \n", - "2 21.61 554.26 \n", - "3 21.61 554.45 \n", - "4 21.61 554.00 \n", + " value_S25 value_S26 \n", + "0 47.0 356.0 \n", + "1 42.0 400.0 \n", + "2 45.0 402.0 \n", + "3 45.0 357.0 \n", + "4 36.0 322.0 \n", "\n", - " value_sensor measurement 8 value_sensor measurement 9 \n", - "0 2388.06 9046.19 \n", - "1 2388.04 9044.07 \n", - "2 2388.08 9052.94 \n", - "3 2388.11 9049.48 \n", - "4 2388.06 9055.15 \n", - "\n", - "[5 rows x 24 columns]" + "[5 rows x 26 columns]" ] }, - "execution_count": 15, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1344,18 +1254,27 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 21, "id": "3ad08e01", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/sklearn/impute/_base.py:356: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "step = 3\n", + "step = 4\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 22, "id": "19c4ee50", "metadata": {}, "outputs": [ @@ -1365,7 +1284,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 17, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1376,46 +1295,51 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "id": "af5f9dc1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[-7.00000e-04, -4.00000e-04, 1.00000e+02, 5.18670e+02,\n", - " 1.30000e+00, 4.74700e+01, 5.21660e+02, 2.38802e+03,\n", - " 8.13862e+03, 8.41950e+00, 3.00000e-02, 3.92000e+02,\n", - " 2.38800e+03, 1.00000e+02, 6.41820e+02, 3.90600e+01,\n", - " 2.34190e+01, 1.58970e+03, 1.40060e+03, 1.46200e+01,\n", - " 2.16100e+01, 5.54360e+02, 2.38806e+03, 9.04619e+03],\n", - " [ 1.90000e-03, -3.00000e-04, 1.00000e+02, 5.18670e+02,\n", - " 1.30000e+00, 4.74900e+01, 5.22280e+02, 2.38807e+03,\n", - " 8.13149e+03, 8.43180e+00, 3.00000e-02, 3.92000e+02,\n", - " 2.38800e+03, 1.00000e+02, 6.42150e+02, 3.90000e+01,\n", - " 2.34236e+01, 1.59182e+03, 1.40314e+03, 1.46200e+01,\n", - " 2.16100e+01, 5.53750e+02, 2.38804e+03, 9.04407e+03],\n", - " [-4.30000e-03, 3.00000e-04, 1.00000e+02, 5.18670e+02,\n", - " 1.30000e+00, 4.72700e+01, 5.22420e+02, 2.38803e+03,\n", - " 8.13323e+03, 8.41780e+00, 3.00000e-02, 3.90000e+02,\n", - " 2.38800e+03, 1.00000e+02, 6.42350e+02, 3.89500e+01,\n", - " 2.33442e+01, 1.58799e+03, 1.40420e+03, 1.46200e+01,\n", - " 2.16100e+01, 5.54260e+02, 2.38808e+03, 9.05294e+03],\n", - " [ 7.00000e-04, 0.00000e+00, 1.00000e+02, 5.18670e+02,\n", - " 1.30000e+00, 4.71300e+01, 5.22860e+02, 2.38808e+03,\n", - " 
8.13383e+03, 8.36820e+00, 3.00000e-02, 3.92000e+02,\n", - " 2.38800e+03, 1.00000e+02, 6.42350e+02, 3.88800e+01,\n", - " 2.33739e+01, 1.58279e+03, 1.40187e+03, 1.46200e+01,\n", - " 2.16100e+01, 5.54450e+02, 2.38811e+03, 9.04948e+03],\n", - " [-1.90000e-03, -2.00000e-04, 1.00000e+02, 5.18670e+02,\n", - " 1.30000e+00, 4.72800e+01, 5.22190e+02, 2.38804e+03,\n", - " 8.13380e+03, 8.42940e+00, 3.00000e-02, 3.93000e+02,\n", - " 2.38800e+03, 1.00000e+02, 6.42370e+02, 3.89000e+01,\n", - " 2.34044e+01, 1.58285e+03, 1.40622e+03, 1.46200e+01,\n", - " 2.16100e+01, 5.54000e+02, 2.38806e+03, 9.05515e+03]])" + "array([[3.230000e+02, 3.200000e+02, 2.840000e+02, 3.480000e+02,\n", + " 2.730000e+02, 3.420000e+02, 2.800000e+02, 3.197842e+06,\n", + " 6.950000e+05, 3.348234e+06, 3.436762e+06, 3.322362e+06,\n", + " 3.357952e+06, 3.223797e+06, 8.300000e+00, 6.000000e+00,\n", + " 1.170000e+01, 3.131020e+06, 5.500000e+01, 5.500000e+01,\n", + " 4.700000e+01, 5.800000e+01, 4.500000e+01, 5.800000e+01,\n", + " 4.700000e+01, 3.560000e+02],\n", + " [3.460000e+02, 3.840000e+02, 3.670000e+02, 4.110000e+02,\n", + " 3.310000e+02, 3.600000e+02, 2.490000e+02, 3.197900e+06,\n", + " 6.950630e+05, 3.348296e+06, 3.436829e+06, 3.322417e+06,\n", + " 3.358013e+06, 3.223839e+06, 7.600000e+00, 5.000000e+00,\n", + " 1.020000e+01, 3.131420e+06, 5.800000e+01, 6.300000e+01,\n", + " 6.200000e+01, 6.700000e+01, 5.500000e+01, 6.100000e+01,\n", + " 4.200000e+01, 4.000000e+02],\n", + " [4.070000e+02, 3.630000e+02, 4.070000e+02, 3.930000e+02,\n", + " 2.750000e+02, 3.350000e+02, 2.700000e+02, 3.197968e+06,\n", + " 6.951240e+05, 3.348363e+06, 3.436895e+06, 3.322463e+06,\n", + " 3.358068e+06, 3.223884e+06, 7.800000e+00, 5.700000e+00,\n", + " 9.500000e+00, 3.131822e+06, 6.800000e+01, 6.100000e+01,\n", + " 6.700000e+01, 6.600000e+01, 4.600000e+01, 5.500000e+01,\n", + " 4.500000e+01, 4.020000e+02],\n", + " [2.570000e+02, 3.070000e+02, 3.150000e+02, 3.610000e+02,\n", + " 3.170000e+02, 3.540000e+02, 2.710000e+02, 3.198011e+06,\n", + " 6.951750e+05, 3.348416e+06, 3.436957e+06, 3.322516e+06,\n", + " 3.358128e+06, 3.223929e+06, 8.600000e+00, 6.600000e+00,\n", + " 1.050000e+01, 3.132179e+06, 4.300000e+01, 5.100000e+01,\n", + " 5.300000e+01, 6.200000e+01, 5.300000e+01, 6.000000e+01,\n", + " 4.500000e+01, 3.570000e+02],\n", + " [2.670000e+02, 3.090000e+02, 3.140000e+02, 3.550000e+02,\n", + " 2.620000e+02, 2.460000e+02, 2.120000e+02, 3.198056e+06,\n", + " 6.952260e+05, 3.348470e+06, 3.437016e+06, 3.322559e+06,\n", + " 3.358169e+06, 3.223965e+06, 7.500000e+00, 5.900000e+00,\n", + " 9.600000e+00, 3.132501e+06, 4.500000e+01, 5.100000e+01,\n", + " 5.400000e+01, 5.900000e+01, 4.300000e+01, 4.100000e+01,\n", + " 3.600000e+01, 3.220000e+02]])" ] }, - "execution_count": 18, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1438,18 +1362,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 24, "id": "f50662d2", "metadata": {}, "outputs": [], "source": [ - "step = 4\n", + "step = 5\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 25, "id": "37bf8d65", "metadata": {}, "outputs": [ @@ -1459,7 +1383,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 20, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1470,41 +1394,46 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 26, "id": "73c5d941", "metadata": {}, "outputs": [ { 
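The array displayed next is the readings matrix after imputation and rescaling into [-1, 1]. As a standalone sketch of what the two sklearn steps do (SimpleImputer at step 4, MinMaxScaler at step 5) — toy values for illustration, not the pipeline's exact code:

    import numpy as np
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import MinMaxScaler

    # Stand-in for X.values: a small readings matrix with one missing entry.
    X = np.array([[323.0, np.nan],
                  [346.0, 3197900.0],
                  [407.0, 3197968.0]])

    X = SimpleImputer().fit_transform(X)                      # fills the NaN with the column mean
    X = MinMaxScaler(feature_range=(-1, 1)).fit_transform(X)  # maps every column onto [-1, 1]
    print(X)  # each column now spans exactly [-1, 1], as in the output below
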
"data": { "text/plain": [ - "array([[-0.08045977, -0.69230769, -1. , -1. , -1. ,\n", - " -0.22543353, 0.17159763, -0.58823529, -0.60078439, -0.2720277 ,\n", - " -1. , -0.33333333, -1. , -1. , -0.59411765,\n", - " 0.42635659, 0.40377157, -0.13682891, -0.38048616, -1. ,\n", - " 1. , 0.45249597, -0.49253731, -0.78048999],\n", - " [ 0.2183908 , -0.53846154, -1. , -1. , -1. ,\n", - " -0.20231214, 0.41617357, -0.44117647, -0.674373 , -0.17737591,\n", - " -1. , -0.33333333, -1. , -1. , -0.4 ,\n", - " 0.33333333, 0.41607597, -0.04825569, -0.29473329, -1. ,\n", - " 1. , 0.25603865, -0.55223881, -0.79951539],\n", - " [-0.49425287, 0.38461538, -1. , -1. , -1. ,\n", - " -0.4566474 , 0.47140039, -0.55882353, -0.65641449, -0.28510966,\n", - " -1. , -0.66666667, -1. , -1. , -0.28235294,\n", - " 0.25581395, 0.20369132, -0.2082724 , -0.25894666, -1. ,\n", - " 1. , 0.42028986, -0.43283582, -0.71991385],\n", - " [ 0.08045977, -0.07692308, -1. , -1. , -1. ,\n", - " -0.61849711, 0.64497041, -0.41176471, -0.6502219 , -0.66679492,\n", - " -1. , -0.33333333, -1. , -1. , -0.28235294,\n", - " 0.14728682, 0.28313495, -0.42552747, -0.33760972, -1. ,\n", - " 1. , 0.48148148, -0.34328358, -0.75096473],\n", - " [-0.2183908 , -0.38461538, -1. , -1. , -1. ,\n", - " -0.44508671, 0.38067061, -0.52941176, -0.65053153, -0.19584456,\n", - " -1. , -0.16666667, -1. , -1. , -0.27058824,\n", - " 0.17829457, 0.36471847, -0.42302068, -0.19074949, -1. ,\n", - " 1. , 0.33655395, -0.49253731, -0.70008077]])" + "array([[-0.23563892, -0.24267292, -0.3286385 , -0.17702227, -0.35287222,\n", + " -0.19248826, -0.3317757 , -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. , -0.11702128,\n", + " -0.24050633, -0.25714286, -0.37378787, -0.22758621, -0.22758621,\n", + " -0.31972789, -0.1862069 , -0.36986301, -0.1862069 , -0.33793103,\n", + " -0.26141079],\n", + " [-0.18171161, -0.0926143 , -0.13380282, -0.02930832, -0.21688159,\n", + " -0.15023474, -0.40420561, -0.99995911, -0.99995779, -0.99995941,\n", + " -0.99995718, -0.99996326, -0.99996042, -0.99997164, -0.19148936,\n", + " -0.36708861, -0.35238095, -0.37370786, -0.1862069 , -0.11724138,\n", + " -0.11564626, -0.06206897, -0.23287671, -0.14482759, -0.40689655,\n", + " -0.17012448],\n", + " [-0.03868699, -0.14185229, -0.0399061 , -0.07151231, -0.34818288,\n", + " -0.20892019, -0.35514019, -0.99991116, -0.99991693, -0.99991555,\n", + " -0.999915 , -0.99993254, -0.99992474, -0.99994125, -0.17021277,\n", + " -0.27848101, -0.3968254 , -0.37362746, -0.04827586, -0.14482759,\n", + " -0.04761905, -0.07586207, -0.35616438, -0.22758621, -0.36551724,\n", + " -0.1659751 ],\n", + " [-0.39038687, -0.27315358, -0.25586854, -0.14654162, -0.24970692,\n", + " -0.16431925, -0.35280374, -0.99988085, -0.99988276, -0.99988086,\n", + " -0.99987538, -0.99989714, -0.99988581, -0.99991086, -0.08510638,\n", + " -0.16455696, -0.33333333, -0.37355606, -0.39310345, -0.28275862,\n", + " -0.23809524, -0.13103448, -0.26027397, -0.15862069, -0.36551724,\n", + " -0.2593361 ],\n", + " [-0.36694021, -0.26846424, -0.25821596, -0.16060961, -0.37866354,\n", + " -0.41784038, -0.49065421, -0.99984912, -0.99984859, -0.99984551,\n", + " -0.99983767, -0.99986841, -0.99985921, -0.99988655, -0.20212766,\n", + " -0.25316456, -0.39047619, -0.37349166, -0.36551724, -0.28275862,\n", + " -0.2244898 , -0.17241379, -0.39726027, -0.42068966, -0.48965517,\n", + " -0.33195021]])" ] }, - "execution_count": 21, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1527,18 +1456,18 @@ }, { "cell_type": "code", - 
"execution_count": 22, + "execution_count": 27, "id": "4722001e", "metadata": {}, "outputs": [], "source": [ - "step = 5\n", + "step = 6\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 28, "id": "34b5d2ca", "metadata": {}, "outputs": [ @@ -1548,7 +1477,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 23, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1559,7 +1488,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 29, "id": "011b9c51", "metadata": {}, "outputs": [ @@ -1595,8 +1524,6 @@ " 8\n", " 9\n", " ...\n", - " 14\n", - " 15\n", " 16\n", " 17\n", " 18\n", @@ -1605,160 +1532,162 @@ " 21\n", " 22\n", " 23\n", + " 24\n", + " 25\n", " \n", " \n", " \n", " \n", " 0\n", - " -0.080460\n", - " -0.692308\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.225434\n", - " 0.171598\n", - " -0.588235\n", - " -0.600784\n", - " -0.272028\n", + " -0.235639\n", + " -0.242673\n", + " -0.328638\n", + " -0.177022\n", + " -0.352872\n", + " -0.192488\n", + " -0.331776\n", + " -1.000000\n", + " -1.000000\n", + " -1.000000\n", " ...\n", - " -0.594118\n", - " 0.426357\n", - " 0.403772\n", - " -0.136829\n", - " -0.380486\n", - " -1.0\n", - " 1.0\n", - " 0.452496\n", - " -0.492537\n", - " -0.780490\n", + " -0.257143\n", + " -0.373788\n", + " -0.227586\n", + " -0.227586\n", + " -0.319728\n", + " -0.186207\n", + " -0.369863\n", + " -0.186207\n", + " -0.337931\n", + " -0.261411\n", " \n", " \n", " 1\n", - " 0.218391\n", - " -0.538462\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.202312\n", - " 0.416174\n", - " -0.441176\n", - " -0.674373\n", - " -0.177376\n", + " -0.181712\n", + " -0.092614\n", + " -0.133803\n", + " -0.029308\n", + " -0.216882\n", + " -0.150235\n", + " -0.404206\n", + " -0.999959\n", + " -0.999958\n", + " -0.999959\n", " ...\n", - " -0.400000\n", - " 0.333333\n", - " 0.416076\n", - " -0.048256\n", - " -0.294733\n", - " -1.0\n", - " 1.0\n", - " 0.256039\n", - " -0.552239\n", - " -0.799515\n", + " -0.352381\n", + " -0.373708\n", + " -0.186207\n", + " -0.117241\n", + " -0.115646\n", + " -0.062069\n", + " -0.232877\n", + " -0.144828\n", + " -0.406897\n", + " -0.170124\n", " \n", " \n", " 2\n", - " -0.494253\n", - " 0.384615\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.456647\n", - " 0.471400\n", - " -0.558824\n", - " -0.656414\n", - " -0.285110\n", + " -0.038687\n", + " -0.141852\n", + " -0.039906\n", + " -0.071512\n", + " -0.348183\n", + " -0.208920\n", + " -0.355140\n", + " -0.999911\n", + " -0.999917\n", + " -0.999916\n", " ...\n", - " -0.282353\n", - " 0.255814\n", - " 0.203691\n", - " -0.208272\n", - " -0.258947\n", - " -1.0\n", - " 1.0\n", - " 0.420290\n", - " -0.432836\n", - " -0.719914\n", + " -0.396825\n", + " -0.373627\n", + " -0.048276\n", + " -0.144828\n", + " -0.047619\n", + " -0.075862\n", + " -0.356164\n", + " -0.227586\n", + " -0.365517\n", + " -0.165975\n", " \n", " \n", " 3\n", - " 0.080460\n", - " -0.076923\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.618497\n", - " 0.644970\n", - " -0.411765\n", - " -0.650222\n", - " -0.666795\n", + " -0.390387\n", + " -0.273154\n", + " -0.255869\n", + " -0.146542\n", + " -0.249707\n", + " -0.164319\n", + " -0.352804\n", + " -0.999881\n", + " -0.999883\n", + " -0.999881\n", " ...\n", - " -0.282353\n", - " 0.147287\n", - " 0.283135\n", - " -0.425527\n", - " -0.337610\n", - " -1.0\n", - " 1.0\n", - " 0.481481\n", - " -0.343284\n", - " 
-0.750965\n", + " -0.333333\n", + " -0.373556\n", + " -0.393103\n", + " -0.282759\n", + " -0.238095\n", + " -0.131034\n", + " -0.260274\n", + " -0.158621\n", + " -0.365517\n", + " -0.259336\n", " \n", " \n", " 4\n", - " -0.218391\n", - " -0.384615\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.445087\n", - " 0.380671\n", - " -0.529412\n", - " -0.650532\n", - " -0.195845\n", + " -0.366940\n", + " -0.268464\n", + " -0.258216\n", + " -0.160610\n", + " -0.378664\n", + " -0.417840\n", + " -0.490654\n", + " -0.999849\n", + " -0.999849\n", + " -0.999846\n", " ...\n", - " -0.270588\n", - " 0.178295\n", - " 0.364718\n", - " -0.423021\n", - " -0.190749\n", - " -1.0\n", - " 1.0\n", - " 0.336554\n", - " -0.492537\n", - " -0.700081\n", + " -0.390476\n", + " -0.373492\n", + " -0.365517\n", + " -0.282759\n", + " -0.224490\n", + " -0.172414\n", + " -0.397260\n", + " -0.420690\n", + " -0.489655\n", + " -0.331950\n", " \n", " \n", "\n", - "

5 rows × 24 columns

\n", + "

5 rows × 26 columns

\n", "" ], "text/plain": [ - " 0 1 2 3 4 5 6 7 8 \\\n", - "0 -0.080460 -0.692308 -1.0 -1.0 -1.0 -0.225434 0.171598 -0.588235 -0.600784 \n", - "1 0.218391 -0.538462 -1.0 -1.0 -1.0 -0.202312 0.416174 -0.441176 -0.674373 \n", - "2 -0.494253 0.384615 -1.0 -1.0 -1.0 -0.456647 0.471400 -0.558824 -0.656414 \n", - "3 0.080460 -0.076923 -1.0 -1.0 -1.0 -0.618497 0.644970 -0.411765 -0.650222 \n", - "4 -0.218391 -0.384615 -1.0 -1.0 -1.0 -0.445087 0.380671 -0.529412 -0.650532 \n", + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", "\n", - " 9 ... 14 15 16 17 18 19 20 \\\n", - "0 -0.272028 ... -0.594118 0.426357 0.403772 -0.136829 -0.380486 -1.0 1.0 \n", - "1 -0.177376 ... -0.400000 0.333333 0.416076 -0.048256 -0.294733 -1.0 1.0 \n", - "2 -0.285110 ... -0.282353 0.255814 0.203691 -0.208272 -0.258947 -1.0 1.0 \n", - "3 -0.666795 ... -0.282353 0.147287 0.283135 -0.425527 -0.337610 -1.0 1.0 \n", - "4 -0.195845 ... -0.270588 0.178295 0.364718 -0.423021 -0.190749 -1.0 1.0 \n", + " 7 8 9 ... 16 17 18 19 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.257143 -0.373788 -0.227586 -0.227586 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.352381 -0.373708 -0.186207 -0.117241 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.396825 -0.373627 -0.048276 -0.144828 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.333333 -0.373556 -0.393103 -0.282759 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.390476 -0.373492 -0.365517 -0.282759 \n", "\n", - " 21 22 23 \n", - "0 0.452496 -0.492537 -0.780490 \n", - "1 0.256039 -0.552239 -0.799515 \n", - "2 0.420290 -0.432836 -0.719914 \n", - "3 0.481481 -0.343284 -0.750965 \n", - "4 0.336554 -0.492537 -0.700081 \n", + " 20 21 22 23 24 25 \n", + "0 -0.319728 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 \n", + "1 -0.115646 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 \n", + "2 -0.047619 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 \n", + "3 -0.238095 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 \n", + "4 -0.224490 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 \n", "\n", - "[5 rows x 24 columns]" + "[5 rows x 26 columns]" ] }, - "execution_count": 24, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1781,18 +1710,18 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 30, "id": "d58c17c1", "metadata": {}, "outputs": [], "source": [ - "step = 6\n", + "step = 7\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 31, "id": "b5b62c52", "metadata": {}, "outputs": [ @@ -1802,7 +1731,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 26, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1813,7 +1742,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 32, "id": "8bedb44e", "metadata": {}, "outputs": [ @@ -1849,8 +1778,6 @@ " 8\n", " 9\n", " ...\n", - " 15\n", - " 16\n", " 17\n", " 18\n", " 19\n", @@ -1858,161 +1785,163 @@ " 21\n", " 22\n", " 23\n", + " 24\n", + " 25\n", " turbine_id\n", " \n", " \n", " \n", " \n", " 0\n", - " -0.080460\n", - " -0.692308\n", - " 
-1.0\n", - " -1.0\n", - " -1.0\n", - " -0.225434\n", - " 0.171598\n", - " -0.588235\n", - " -0.600784\n", - " -0.272028\n", + " -0.235639\n", + " -0.242673\n", + " -0.328638\n", + " -0.177022\n", + " -0.352872\n", + " -0.192488\n", + " -0.331776\n", + " -1.000000\n", + " -1.000000\n", + " -1.000000\n", " ...\n", - " 0.426357\n", - " 0.403772\n", - " -0.136829\n", - " -0.380486\n", - " -1.0\n", - " 1.0\n", - " 0.452496\n", - " -0.492537\n", - " -0.780490\n", - " 1\n", + " -0.373788\n", + " -0.227586\n", + " -0.227586\n", + " -0.319728\n", + " -0.186207\n", + " -0.369863\n", + " -0.186207\n", + " -0.337931\n", + " -0.261411\n", + " T001\n", " \n", " \n", " 1\n", - " 0.218391\n", - " -0.538462\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.202312\n", - " 0.416174\n", - " -0.441176\n", - " -0.674373\n", - " -0.177376\n", + " -0.181712\n", + " -0.092614\n", + " -0.133803\n", + " -0.029308\n", + " -0.216882\n", + " -0.150235\n", + " -0.404206\n", + " -0.999959\n", + " -0.999958\n", + " -0.999959\n", " ...\n", - " 0.333333\n", - " 0.416076\n", - " -0.048256\n", - " -0.294733\n", - " -1.0\n", - " 1.0\n", - " 0.256039\n", - " -0.552239\n", - " -0.799515\n", - " 1\n", + " -0.373708\n", + " -0.186207\n", + " -0.117241\n", + " -0.115646\n", + " -0.062069\n", + " -0.232877\n", + " -0.144828\n", + " -0.406897\n", + " -0.170124\n", + " T001\n", " \n", " \n", " 2\n", - " -0.494253\n", - " 0.384615\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.456647\n", - " 0.471400\n", - " -0.558824\n", - " -0.656414\n", - " -0.285110\n", + " -0.038687\n", + " -0.141852\n", + " -0.039906\n", + " -0.071512\n", + " -0.348183\n", + " -0.208920\n", + " -0.355140\n", + " -0.999911\n", + " -0.999917\n", + " -0.999916\n", " ...\n", - " 0.255814\n", - " 0.203691\n", - " -0.208272\n", - " -0.258947\n", - " -1.0\n", - " 1.0\n", - " 0.420290\n", - " -0.432836\n", - " -0.719914\n", - " 1\n", + " -0.373627\n", + " -0.048276\n", + " -0.144828\n", + " -0.047619\n", + " -0.075862\n", + " -0.356164\n", + " -0.227586\n", + " -0.365517\n", + " -0.165975\n", + " T001\n", " \n", " \n", " 3\n", - " 0.080460\n", - " -0.076923\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.618497\n", - " 0.644970\n", - " -0.411765\n", - " -0.650222\n", - " -0.666795\n", + " -0.390387\n", + " -0.273154\n", + " -0.255869\n", + " -0.146542\n", + " -0.249707\n", + " -0.164319\n", + " -0.352804\n", + " -0.999881\n", + " -0.999883\n", + " -0.999881\n", " ...\n", - " 0.147287\n", - " 0.283135\n", - " -0.425527\n", - " -0.337610\n", - " -1.0\n", - " 1.0\n", - " 0.481481\n", - " -0.343284\n", - " -0.750965\n", - " 1\n", + " -0.373556\n", + " -0.393103\n", + " -0.282759\n", + " -0.238095\n", + " -0.131034\n", + " -0.260274\n", + " -0.158621\n", + " -0.365517\n", + " -0.259336\n", + " T001\n", " \n", " \n", " 4\n", - " -0.218391\n", - " -0.384615\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.445087\n", - " 0.380671\n", - " -0.529412\n", - " -0.650532\n", - " -0.195845\n", + " -0.366940\n", + " -0.268464\n", + " -0.258216\n", + " -0.160610\n", + " -0.378664\n", + " -0.417840\n", + " -0.490654\n", + " -0.999849\n", + " -0.999849\n", + " -0.999846\n", " ...\n", - " 0.178295\n", - " 0.364718\n", - " -0.423021\n", - " -0.190749\n", - " -1.0\n", - " 1.0\n", - " 0.336554\n", - " -0.492537\n", - " -0.700081\n", - " 1\n", + " -0.373492\n", + " -0.365517\n", + " -0.282759\n", + " -0.224490\n", + " -0.172414\n", + " -0.397260\n", + " -0.420690\n", + " -0.489655\n", + " -0.331950\n", + " T001\n", " \n", " \n", "\n", - "

5 rows × 25 columns

\n", + "

5 rows × 27 columns

\n", "" ], "text/plain": [ - " 0 1 2 3 4 5 6 7 8 \\\n", - "0 -0.080460 -0.692308 -1.0 -1.0 -1.0 -0.225434 0.171598 -0.588235 -0.600784 \n", - "1 0.218391 -0.538462 -1.0 -1.0 -1.0 -0.202312 0.416174 -0.441176 -0.674373 \n", - "2 -0.494253 0.384615 -1.0 -1.0 -1.0 -0.456647 0.471400 -0.558824 -0.656414 \n", - "3 0.080460 -0.076923 -1.0 -1.0 -1.0 -0.618497 0.644970 -0.411765 -0.650222 \n", - "4 -0.218391 -0.384615 -1.0 -1.0 -1.0 -0.445087 0.380671 -0.529412 -0.650532 \n", + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", "\n", - " 9 ... 15 16 17 18 19 20 21 \\\n", - "0 -0.272028 ... 0.426357 0.403772 -0.136829 -0.380486 -1.0 1.0 0.452496 \n", - "1 -0.177376 ... 0.333333 0.416076 -0.048256 -0.294733 -1.0 1.0 0.256039 \n", - "2 -0.285110 ... 0.255814 0.203691 -0.208272 -0.258947 -1.0 1.0 0.420290 \n", - "3 -0.666795 ... 0.147287 0.283135 -0.425527 -0.337610 -1.0 1.0 0.481481 \n", - "4 -0.195845 ... 0.178295 0.364718 -0.423021 -0.190749 -1.0 1.0 0.336554 \n", + " 7 8 9 ... 17 18 19 20 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373788 -0.227586 -0.227586 -0.319728 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.373708 -0.186207 -0.117241 -0.115646 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.373627 -0.048276 -0.144828 -0.047619 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.373556 -0.393103 -0.282759 -0.238095 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.373492 -0.365517 -0.282759 -0.224490 \n", "\n", - " 22 23 turbine_id \n", - "0 -0.492537 -0.780490 1 \n", - "1 -0.552239 -0.799515 1 \n", - "2 -0.432836 -0.719914 1 \n", - "3 -0.343284 -0.750965 1 \n", - "4 -0.492537 -0.700081 1 \n", + " 21 22 23 24 25 turbine_id \n", + "0 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 T001 \n", + "1 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 T001 \n", + "2 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 T001 \n", + "3 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 T001 \n", + "4 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 T001 \n", "\n", - "[5 rows x 25 columns]" + "[5 rows x 27 columns]" ] }, - "execution_count": 27, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -2035,18 +1964,18 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 33, "id": "9896ef19", "metadata": {}, "outputs": [], "source": [ - "step = 7\n", + "step = 8\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 34, "id": "384e4e91", "metadata": {}, "outputs": [ @@ -2056,7 +1985,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 29, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -2067,7 +1996,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 35, "id": "7dcc2b2c", "metadata": {}, "outputs": [ @@ -2103,14 +2032,14 @@ " 8\n", " 9\n", " ...\n", - " 16\n", - " 17\n", " 18\n", " 19\n", " 20\n", " 21\n", " 22\n", " 23\n", + " 24\n", + " 25\n", " turbine_id\n", " timestamp\n", " \n", @@ -2118,155 +2047,155 @@ " \n", " \n", " 0\n", - " -0.080460\n", - " -0.692308\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " 
-0.225434\n", - " 0.171598\n", - " -0.588235\n", - " -0.600784\n", - " -0.272028\n", + " -0.235639\n", + " -0.242673\n", + " -0.328638\n", + " -0.177022\n", + " -0.352872\n", + " -0.192488\n", + " -0.331776\n", + " -1.000000\n", + " -1.000000\n", + " -1.000000\n", " ...\n", - " 0.403772\n", - " -0.136829\n", - " -0.380486\n", - " -1.0\n", - " 1.0\n", - " 0.452496\n", - " -0.492537\n", - " -0.780490\n", - " 1\n", - " 2013-01-12 00:10:00\n", + " -0.227586\n", + " -0.227586\n", + " -0.319728\n", + " -0.186207\n", + " -0.369863\n", + " -0.186207\n", + " -0.337931\n", + " -0.261411\n", + " T001\n", + " 2013-01-10 00:00:00\n", " \n", " \n", " 1\n", - " 0.218391\n", - " -0.538462\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.202312\n", - " 0.416174\n", - " -0.441176\n", - " -0.674373\n", - " -0.177376\n", + " -0.181712\n", + " -0.092614\n", + " -0.133803\n", + " -0.029308\n", + " -0.216882\n", + " -0.150235\n", + " -0.404206\n", + " -0.999959\n", + " -0.999958\n", + " -0.999959\n", " ...\n", - " 0.416076\n", - " -0.048256\n", - " -0.294733\n", - " -1.0\n", - " 1.0\n", - " 0.256039\n", - " -0.552239\n", - " -0.799515\n", - " 1\n", - " 2013-01-12 00:20:00\n", + " -0.186207\n", + " -0.117241\n", + " -0.115646\n", + " -0.062069\n", + " -0.232877\n", + " -0.144828\n", + " -0.406897\n", + " -0.170124\n", + " T001\n", + " 2013-01-10 00:10:00\n", " \n", " \n", " 2\n", - " -0.494253\n", - " 0.384615\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.456647\n", - " 0.471400\n", - " -0.558824\n", - " -0.656414\n", - " -0.285110\n", + " -0.038687\n", + " -0.141852\n", + " -0.039906\n", + " -0.071512\n", + " -0.348183\n", + " -0.208920\n", + " -0.355140\n", + " -0.999911\n", + " -0.999917\n", + " -0.999916\n", " ...\n", - " 0.203691\n", - " -0.208272\n", - " -0.258947\n", - " -1.0\n", - " 1.0\n", - " 0.420290\n", - " -0.432836\n", - " -0.719914\n", - " 1\n", - " 2013-01-12 00:30:00\n", + " -0.048276\n", + " -0.144828\n", + " -0.047619\n", + " -0.075862\n", + " -0.356164\n", + " -0.227586\n", + " -0.365517\n", + " -0.165975\n", + " T001\n", + " 2013-01-10 00:20:00\n", " \n", " \n", " 3\n", - " 0.080460\n", - " -0.076923\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.618497\n", - " 0.644970\n", - " -0.411765\n", - " -0.650222\n", - " -0.666795\n", + " -0.390387\n", + " -0.273154\n", + " -0.255869\n", + " -0.146542\n", + " -0.249707\n", + " -0.164319\n", + " -0.352804\n", + " -0.999881\n", + " -0.999883\n", + " -0.999881\n", " ...\n", - " 0.283135\n", - " -0.425527\n", - " -0.337610\n", - " -1.0\n", - " 1.0\n", - " 0.481481\n", - " -0.343284\n", - " -0.750965\n", - " 1\n", - " 2013-01-12 00:40:00\n", + " -0.393103\n", + " -0.282759\n", + " -0.238095\n", + " -0.131034\n", + " -0.260274\n", + " -0.158621\n", + " -0.365517\n", + " -0.259336\n", + " T001\n", + " 2013-01-10 00:30:00\n", " \n", " \n", " 4\n", - " -0.218391\n", - " -0.384615\n", - " -1.0\n", - " -1.0\n", - " -1.0\n", - " -0.445087\n", - " 0.380671\n", - " -0.529412\n", - " -0.650532\n", - " -0.195845\n", + " -0.366940\n", + " -0.268464\n", + " -0.258216\n", + " -0.160610\n", + " -0.378664\n", + " -0.417840\n", + " -0.490654\n", + " -0.999849\n", + " -0.999849\n", + " -0.999846\n", " ...\n", - " 0.364718\n", - " -0.423021\n", - " -0.190749\n", - " -1.0\n", - " 1.0\n", - " 0.336554\n", - " -0.492537\n", - " -0.700081\n", - " 1\n", - " 2013-01-12 00:50:00\n", + " -0.365517\n", + " -0.282759\n", + " -0.224490\n", + " -0.172414\n", + " -0.397260\n", + " -0.420690\n", + " -0.489655\n", + " -0.331950\n", + " T001\n", + " 2013-01-10 00:40:00\n", " \n", " 
\n", "\n", - "

5 rows × 26 columns

\n", + "

5 rows × 28 columns

\n", "" ], "text/plain": [ - " 0 1 2 3 4 5 6 7 8 \\\n", - "0 -0.080460 -0.692308 -1.0 -1.0 -1.0 -0.225434 0.171598 -0.588235 -0.600784 \n", - "1 0.218391 -0.538462 -1.0 -1.0 -1.0 -0.202312 0.416174 -0.441176 -0.674373 \n", - "2 -0.494253 0.384615 -1.0 -1.0 -1.0 -0.456647 0.471400 -0.558824 -0.656414 \n", - "3 0.080460 -0.076923 -1.0 -1.0 -1.0 -0.618497 0.644970 -0.411765 -0.650222 \n", - "4 -0.218391 -0.384615 -1.0 -1.0 -1.0 -0.445087 0.380671 -0.529412 -0.650532 \n", + " 0 1 2 3 4 5 6 \\\n", + "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", + "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", + "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", + "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", + "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", "\n", - " 9 ... 16 17 18 19 20 21 22 \\\n", - "0 -0.272028 ... 0.403772 -0.136829 -0.380486 -1.0 1.0 0.452496 -0.492537 \n", - "1 -0.177376 ... 0.416076 -0.048256 -0.294733 -1.0 1.0 0.256039 -0.552239 \n", - "2 -0.285110 ... 0.203691 -0.208272 -0.258947 -1.0 1.0 0.420290 -0.432836 \n", - "3 -0.666795 ... 0.283135 -0.425527 -0.337610 -1.0 1.0 0.481481 -0.343284 \n", - "4 -0.195845 ... 0.364718 -0.423021 -0.190749 -1.0 1.0 0.336554 -0.492537 \n", + " 7 8 9 ... 18 19 20 21 \\\n", + "0 -1.000000 -1.000000 -1.000000 ... -0.227586 -0.227586 -0.319728 -0.186207 \n", + "1 -0.999959 -0.999958 -0.999959 ... -0.186207 -0.117241 -0.115646 -0.062069 \n", + "2 -0.999911 -0.999917 -0.999916 ... -0.048276 -0.144828 -0.047619 -0.075862 \n", + "3 -0.999881 -0.999883 -0.999881 ... -0.393103 -0.282759 -0.238095 -0.131034 \n", + "4 -0.999849 -0.999849 -0.999846 ... -0.365517 -0.282759 -0.224490 -0.172414 \n", "\n", - " 23 turbine_id timestamp \n", - "0 -0.780490 1 2013-01-12 00:10:00 \n", - "1 -0.799515 1 2013-01-12 00:20:00 \n", - "2 -0.719914 1 2013-01-12 00:30:00 \n", - "3 -0.750965 1 2013-01-12 00:40:00 \n", - "4 -0.700081 1 2013-01-12 00:50:00 \n", + " 22 23 24 25 turbine_id timestamp \n", + "0 -0.369863 -0.186207 -0.337931 -0.261411 T001 2013-01-10 00:00:00 \n", + "1 -0.232877 -0.144828 -0.406897 -0.170124 T001 2013-01-10 00:10:00 \n", + "2 -0.356164 -0.227586 -0.365517 -0.165975 T001 2013-01-10 00:20:00 \n", + "3 -0.260274 -0.158621 -0.365517 -0.259336 T001 2013-01-10 00:30:00 \n", + "4 -0.397260 -0.420690 -0.489655 -0.331950 T001 2013-01-10 00:40:00 \n", "\n", - "[5 rows x 26 columns]" + "[5 rows x 28 columns]" ] }, - "execution_count": 30, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2290,7 +2219,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 36, "id": "b4ff2d0a", "metadata": {}, "outputs": [ @@ -2300,30 +2229,30 @@ "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" ] }, - "execution_count": 31, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline._pipeline.get_hyperparameters()[\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + " 'mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1']" ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 37, "id": "2c8fd174", "metadata": {}, "outputs": [], "source": [ - "step = 8\n", + "step = 9\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 38, "id": "b051da01", "metadata": {}, "outputs": 
[ @@ -2333,7 +2262,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 33, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2344,17 +2273,17 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 39, "id": "a802d22b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(33727, 26)" + "(51121, 28)" ] }, - "execution_count": 34, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2365,17 +2294,17 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 40, "id": "cc53012b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(18131,)" + "(353,)" ] }, - "execution_count": 35, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2386,17 +2315,17 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 41, "id": "b1212aaf", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(18131, 24, 24)" + "(353, 24, 26)" ] }, - "execution_count": 36, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2407,31 +2336,34 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 42, "id": "87abb56d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[ 0.2183908 , -0.53846154, -1. , -1. , -1. ,\n", - " -0.20231214, 0.41617357, -0.44117647, -0.674373 , -0.17737591,\n", - " -1. , -0.33333333, -1. , -1. , -0.4 ,\n", - " 0.33333333, 0.41607597, -0.04825569, -0.29473329, -1. ,\n", - " 1. , 0.25603865, -0.55223881, -0.79951539],\n", - " [-0.49425287, 0.38461538, -1. , -1. , -1. ,\n", - " -0.4566474 , 0.47140039, -0.55882353, -0.65641449, -0.28510966,\n", - " -1. , -0.66666667, -1. , -1. , -0.28235294,\n", - " 0.25581395, 0.20369132, -0.2082724 , -0.25894666, -1. ,\n", - " 1. , 0.42028986, -0.43283582, -0.71991385],\n", - " [ 0.08045977, -0.07692308, -1. , -1. , -1. ,\n", - " -0.61849711, 0.64497041, -0.41176471, -0.6502219 , -0.66679492,\n", - " -1. , -0.33333333, -1. , -1. , -0.28235294,\n", - " 0.14728682, 0.28313495, -0.42552747, -0.33760972, -1. ,\n", - " 1. 
, 0.48148148, -0.34328358, -0.75096473]])" + "array([[-0.66002345, -0.57327081, -0.64084507, -0.57796014, -0.6014068 ,\n", + " -0.56103286, -0.55140187, -0.9928135 , -0.99291267, -0.99315058,\n", + " -0.99304288, -0.99346346, -0.99352632, -0.99395333, -0.42553191,\n", + " -0.41772152, -0.58730159, -0.35996294, -0.66896552, -0.57241379,\n", + " -0.61904762, -0.5862069 , -0.60273973, -0.55862069, -0.55862069,\n", + " -0.59751037],\n", + " [-0.2989449 , -0.38569754, -0.48591549, -0.47713951, -0.66705744,\n", + " -0.5915493 , -0.77336449, -0.99278389, -0.9928852 , -0.99312701,\n", + " -0.99301988, -0.9934481 , -0.9935075 , -0.9939459 , -0.39361702,\n", + " -0.40506329, -0.54285714, -0.35992014, -0.40689655, -0.42068966,\n", + " -0.46938776, -0.48965517, -0.67123288, -0.5862069 , -0.83448276,\n", + " -0.5560166 ],\n", + " [-0.33645955, -0.40679953, -0.39906103, -0.38569754, -0.56154748,\n", + " -0.43192488, -0.45560748, -0.99275498, -0.9928584 , -0.99310017,\n", + " -0.99299431, -0.99342739, -0.99348349, -0.99392294, -0.29787234,\n", + " -0.3164557 , -0.49206349, -0.35986854, -0.42068966, -0.43448276,\n", + " -0.40136054, -0.43448276, -0.56164384, -0.47586207, -0.51724138,\n", + " -0.46473029]])" ] }, - "execution_count": 37, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2454,7 +2386,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 43, "id": "561c3e09", "metadata": {}, "outputs": [ @@ -2462,15 +2394,49 @@ "name": "stderr", "output_type": "stream", "text": [ - "2022-02-01 10:08:21.044547: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-02-01 10:08:21.080727: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7f8579596430 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", - "2022-02-01 10:08:21.080742: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + "2023-04-13 18:20:05.852611: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", + "2023-04-13 18:20:05.887442: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fc91ac474f0 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", + "2023-04-13 18:20:05.887460: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/35\n", + "5/5 [==============================] - 1s 123ms/step - loss: 0.2339 - mse: 0.2339 - val_loss: 0.1641 - val_mse: 0.1641\n", + "Epoch 2/35\n", + "5/5 [==============================] - 0s 22ms/step - loss: 0.1780 - mse: 0.1780 - val_loss: 0.1505 - val_mse: 0.1505\n", + "Epoch 3/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1540 - mse: 0.1540 - val_loss: 0.1559 - val_mse: 0.1559\n", + "Epoch 4/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1532 - mse: 0.1532 - val_loss: 0.1446 - val_mse: 0.1446\n", + "Epoch 5/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1438 - mse: 0.1438 - val_loss: 0.1511 - val_mse: 0.1511\n", + "Epoch 6/35\n", + "5/5 [==============================] - 0s 24ms/step - loss: 0.1449 - mse: 0.1449 - val_loss: 0.1470 - val_mse: 0.1470\n", + "Epoch 7/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1437 - mse: 0.1437 - val_loss: 0.1537 - val_mse: 0.1537\n", + "Epoch 8/35\n", + "5/5 [==============================] - 0s 25ms/step - loss: 0.1529 - mse: 0.1529 - val_loss: 0.1910 - val_mse: 0.1910\n", + "Epoch 9/35\n", + "5/5 [==============================] - 0s 25ms/step - loss: 0.1406 - mse: 0.1406 - val_loss: 0.1551 - val_mse: 0.1551\n", + "Epoch 10/35\n", + "5/5 [==============================] - 0s 22ms/step - loss: 0.1360 - mse: 0.1360 - val_loss: 0.1745 - val_mse: 0.1745\n", + "Epoch 11/35\n", + "5/5 [==============================] - 0s 22ms/step - loss: 0.1314 - mse: 0.1314 - val_loss: 0.1848 - val_mse: 0.1848\n", + "Epoch 12/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1306 - mse: 0.1306 - val_loss: 0.1734 - val_mse: 0.1734\n", + "Epoch 13/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1258 - mse: 0.1258 - val_loss: 0.1816 - val_mse: 0.1816\n", + "Epoch 14/35\n", + "5/5 [==============================] - 0s 21ms/step - loss: 0.1230 - mse: 0.1230 - val_loss: 0.1820 - val_mse: 0.1820\n", + "6/6 [==============================] - 0s 5ms/step\n" ] } ], "source": [ - "step = 9\n", + "step = 10\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] } @@ -2491,7 +2457,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.16" } }, "nbformat": 4, diff --git a/tutorials/pipelines/lstm_with_unstack.ipynb b/tutorials/pipelines/lstm_with_unstack.ipynb index 799b90e..3793f21 100644 --- a/tutorials/pipelines/lstm_with_unstack.ipynb +++ b/tutorials/pipelines/lstm_with_unstack.ipynb @@ -46,7 +46,8 @@ { "data": { "text/plain": [ - "['mlblocks.MLPipeline',\n", + "['pandas.DataFrame.resample',\n", + " 'pandas.DataFrame.unstack',\n", " 'pandas.DataFrame.pop',\n", " 'pandas.DataFrame.pop',\n", " 'sklearn.impute.SimpleImputer',\n", @@ -54,7 +55,7 @@ " 'pandas.DataFrame',\n", " 'pandas.DataFrame.set',\n", " 'pandas.DataFrame.set',\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences',\n", + " 'mlstars.custom.timeseries_preprocessing.cutoff_window_sequences',\n", " 'keras.Sequential.LSTMTimeSeriesClassifier']" ] }, @@ -270,20 +271,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## mlblocks.MLPipeline\n", - "\n", - "### pandas.DataFrame.resample\n", + "## pandas.DataFrame.resample\n", "\n", 
"* Input: readings\n", "* Output: readings (resampled)\n", "* Effect: readings have been resampled to the indicated resample rule and turbine_id,\n", - " signal_id and timestamp have been set as a multi-index\n", - " \n", - "### pandas.DataFrame.unstack\n", - "\n", - "* Input: readings (resampled)\n", - "* Output: readings (unstacked)\n", - "* Effect: readings have been unstacked" + " signal_id and timestamp have been set as a multi-index" ] }, { @@ -319,6 +312,130 @@ "cell_type": "code", "execution_count": 9, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
turbine_idsignal_idtimestamp
T001S012013-01-10 00:00:00313.333333
2013-01-10 01:00:00197.500000
2013-01-10 02:00:00248.166667
2013-01-10 03:00:00253.166667
2013-01-10 04:00:00305.000000
\n", + "
" + ], + "text/plain": [ + " value\n", + "turbine_id signal_id timestamp \n", + "T001 S01 2013-01-10 00:00:00 313.333333\n", + " 2013-01-10 01:00:00 197.500000\n", + " 2013-01-10 02:00:00 248.166667\n", + " 2013-01-10 03:00:00 253.166667\n", + " 2013-01-10 04:00:00 305.000000" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context['readings'].head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## pandas.DataFrame.unstack\n", + "\n", + "* Input: readings (resampled)\n", + "* Output: readings (unstacked)\n", + "* Effect: readings have been unstacked" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "step = 1\n", + "context = pipeline.fit(**context, output_=step, start_=step)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "dict_keys(['readings', 'turbines', 'X', 'y'])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "context.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, "outputs": [ { "data": { @@ -369,121 +486,121 @@ " 0\n", " T001\n", " 2013-01-10 00:00:00\n", - " 323.0\n", - " 320.0\n", - " 284.0\n", - " 348.0\n", - " 273.0\n", - " 342.0\n", - " 280.0\n", - " 3197842.0\n", + " 313.333333\n", + " 323.833333\n", + " 336.000000\n", + " 364.666667\n", + " 286.500000\n", + " 314.000000\n", + " 243.166667\n", + " 3.197980e+06\n", " ...\n", - " 11.7\n", - " 3131020.0\n", - " 55.0\n", - " 55.0\n", - " 47.0\n", - " 58.0\n", - " 45.0\n", - " 58.0\n", - " 47.0\n", - " 356.0\n", + " 10.383333\n", + " 3.131958e+06\n", + " 52.666667\n", + " 54.333333\n", + " 56.166667\n", + " 61.000000\n", + " 47.666667\n", + " 52.666667\n", + " 40.833333\n", + " 357.333333\n", " \n", " \n", " 1\n", " T001\n", - " 2013-01-10 00:10:00\n", - " 346.0\n", - " 384.0\n", - " 367.0\n", - " 411.0\n", - " 331.0\n", - " 360.0\n", - " 249.0\n", - " 3197900.0\n", + " 2013-01-10 01:00:00\n", + " 197.500000\n", + " 221.333333\n", + " 216.000000\n", + " 260.666667\n", + " 206.833333\n", + " 235.833333\n", + " 186.666667\n", + " 3.198221e+06\n", " ...\n", - " 10.2\n", - " 3131420.0\n", - " 58.0\n", - " 63.0\n", - " 62.0\n", - " 67.0\n", - " 55.0\n", - " 61.0\n", - " 42.0\n", - " 400.0\n", + " 8.666667\n", + " 3.133668e+06\n", + " 33.166667\n", + " 37.000000\n", + " 36.166667\n", + " 43.666667\n", + " 34.500000\n", + " 39.333333\n", + " 31.166667\n", + " 249.666667\n", " \n", " \n", " 2\n", " T001\n", - " 2013-01-10 00:20:00\n", - " 407.0\n", - " 363.0\n", - " 407.0\n", - " 393.0\n", - " 275.0\n", - " 335.0\n", - " 270.0\n", - " 3197968.0\n", + " 2013-01-10 02:00:00\n", + " 248.166667\n", + " 271.666667\n", + " 277.500000\n", + " 298.000000\n", + " 233.666667\n", + " 271.166667\n", + " 216.333333\n", + " 3.198448e+06\n", " ...\n", - " 9.5\n", - " 3131822.0\n", - " 68.0\n", - " 61.0\n", - " 67.0\n", - " 66.0\n", - " 46.0\n", - " 55.0\n", - " 45.0\n", - " 402.0\n", + " 8.833333\n", + " 3.135413e+06\n", + " 41.500000\n", + " 45.666667\n", + " 46.500000\n", + " 49.666667\n", + " 39.333333\n", + " 45.500000\n", + " 36.166667\n", + " 297.666667\n", " \n", " \n", " 3\n", " T001\n", - " 2013-01-10 00:30:00\n", - " 257.0\n", - " 307.0\n", - " 315.0\n", - " 361.0\n", - " 317.0\n", - " 354.0\n", - " 271.0\n", - " 3198011.0\n", + " 2013-01-10 03:00:00\n", + " 253.166667\n", + " 256.166667\n", + " 
242.666667\n", + " 265.333333\n", + " 211.666667\n", + " 226.666667\n", + " 181.000000\n", + " 3.198691e+06\n", " ...\n", - " 10.5\n", - " 3132179.0\n", - " 43.0\n", - " 51.0\n", - " 53.0\n", - " 62.0\n", - " 53.0\n", - " 60.0\n", - " 45.0\n", - " 357.0\n", + " 8.433333\n", + " 3.137001e+06\n", + " 42.333333\n", + " 42.833333\n", + " 40.500000\n", + " 44.166667\n", + " 35.333333\n", + " 37.833333\n", + " 30.333333\n", + " 268.000000\n", " \n", " \n", " 4\n", " T001\n", - " 2013-01-10 00:40:00\n", - " 267.0\n", - " 309.0\n", - " 314.0\n", - " 355.0\n", - " 262.0\n", - " 246.0\n", - " 212.0\n", - " 3198056.0\n", + " 2013-01-10 04:00:00\n", + " 305.000000\n", + " 312.333333\n", + " 346.166667\n", + " 329.833333\n", + " 280.666667\n", + " 308.833333\n", + " 271.833333\n", + " 3.198978e+06\n", " ...\n", - " 9.6\n", - " 3132501.0\n", - " 45.0\n", - " 51.0\n", - " 54.0\n", - " 59.0\n", - " 43.0\n", - " 41.0\n", - " 36.0\n", - " 322.0\n", + " 9.083333\n", + " 3.138843e+06\n", + " 50.500000\n", + " 51.166667\n", + " 55.500000\n", + " 53.666667\n", + " 46.166667\n", + " 49.666667\n", + " 41.166667\n", + " 341.833333\n", " \n", " \n", "\n", @@ -491,38 +608,38 @@ "" ], "text/plain": [ - " turbine_id timestamp value_S01 value_S02 value_S03 value_S04 \\\n", - "0 T001 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 \n", - "1 T001 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 \n", - "2 T001 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 \n", - "3 T001 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 \n", - "4 T001 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 \n", + " turbine_id timestamp value_S01 value_S02 value_S03 \\\n", + "0 T001 2013-01-10 00:00:00 313.333333 323.833333 336.000000 \n", + "1 T001 2013-01-10 01:00:00 197.500000 221.333333 216.000000 \n", + "2 T001 2013-01-10 02:00:00 248.166667 271.666667 277.500000 \n", + "3 T001 2013-01-10 03:00:00 253.166667 256.166667 242.666667 \n", + "4 T001 2013-01-10 04:00:00 305.000000 312.333333 346.166667 \n", "\n", - " value_S05 value_S06 value_S07 value_S08 ... value_S17 value_S18 \\\n", - "0 273.0 342.0 280.0 3197842.0 ... 11.7 3131020.0 \n", - "1 331.0 360.0 249.0 3197900.0 ... 10.2 3131420.0 \n", - "2 275.0 335.0 270.0 3197968.0 ... 9.5 3131822.0 \n", - "3 317.0 354.0 271.0 3198011.0 ... 10.5 3132179.0 \n", - "4 262.0 246.0 212.0 3198056.0 ... 9.6 3132501.0 \n", + " value_S04 value_S05 value_S06 value_S07 value_S08 ... \\\n", + "0 364.666667 286.500000 314.000000 243.166667 3.197980e+06 ... \n", + "1 260.666667 206.833333 235.833333 186.666667 3.198221e+06 ... \n", + "2 298.000000 233.666667 271.166667 216.333333 3.198448e+06 ... \n", + "3 265.333333 211.666667 226.666667 181.000000 3.198691e+06 ... \n", + "4 329.833333 280.666667 308.833333 271.833333 3.198978e+06 ... 
\n", "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", "\n", "[5 rows x 28 columns]" ] }, - "execution_count": 9, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -544,17 +661,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "step = 1\n", + "step = 2\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -563,7 +680,7 @@ "dict_keys(['readings', 'turbines', 'X', 'y', 'turbine_id'])" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -574,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -588,7 +705,7 @@ "Name: turbine_id, dtype: object" ] }, - "execution_count": 12, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -599,7 +716,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -650,122 +767,122 @@ " \n", " 0\n", " 2013-01-10 00:00:00\n", - " 323.0\n", - " 320.0\n", - " 284.0\n", - " 348.0\n", - " 273.0\n", - " 342.0\n", - " 280.0\n", - " 3197842.0\n", - " 695000.0\n", + " 313.333333\n", + " 323.833333\n", + " 336.000000\n", + " 364.666667\n", + " 286.500000\n", + " 314.000000\n", + " 243.166667\n", + " 3.197980e+06\n", + " 695143.166667\n", " ...\n", - " 11.7\n", - " 3131020.0\n", - " 55.0\n", - " 55.0\n", - " 47.0\n", - " 58.0\n", - " 45.0\n", - " 58.0\n", - " 47.0\n", - " 356.0\n", + " 10.383333\n", + " 3.131958e+06\n", + " 52.666667\n", + " 54.333333\n", + " 56.166667\n", + " 61.000000\n", + " 47.666667\n", + " 52.666667\n", + " 40.833333\n", + " 357.333333\n", " \n", " \n", " 1\n", - " 2013-01-10 00:10:00\n", - " 346.0\n", - " 384.0\n", - " 367.0\n", - " 411.0\n", - " 331.0\n", - " 360.0\n", - " 249.0\n", - " 3197900.0\n", - " 695063.0\n", + " 2013-01-10 01:00:00\n", + " 197.500000\n", + " 221.333333\n", + " 216.000000\n", + " 260.666667\n", + " 206.833333\n", + " 235.833333\n", + " 186.666667\n", + " 3.198221e+06\n", + " 695403.666667\n", " ...\n", - " 10.2\n", - " 3131420.0\n", - " 58.0\n", - " 63.0\n", - " 62.0\n", - " 67.0\n", - " 55.0\n", - " 61.0\n", - " 42.0\n", - " 400.0\n", + " 8.666667\n", + " 3.133668e+06\n", + " 33.166667\n", + " 37.000000\n", + " 
36.166667\n", + " 43.666667\n", + " 34.500000\n", + " 39.333333\n", + " 31.166667\n", + " 249.666667\n", " \n", " \n", " 2\n", - " 2013-01-10 00:20:00\n", - " 407.0\n", - " 363.0\n", - " 407.0\n", - " 393.0\n", - " 275.0\n", - " 335.0\n", - " 270.0\n", - " 3197968.0\n", - " 695124.0\n", + " 2013-01-10 02:00:00\n", + " 248.166667\n", + " 271.666667\n", + " 277.500000\n", + " 298.000000\n", + " 233.666667\n", + " 271.166667\n", + " 216.333333\n", + " 3.198448e+06\n", + " 695656.500000\n", " ...\n", - " 9.5\n", - " 3131822.0\n", - " 68.0\n", - " 61.0\n", - " 67.0\n", - " 66.0\n", - " 46.0\n", - " 55.0\n", - " 45.0\n", - " 402.0\n", + " 8.833333\n", + " 3.135413e+06\n", + " 41.500000\n", + " 45.666667\n", + " 46.500000\n", + " 49.666667\n", + " 39.333333\n", + " 45.500000\n", + " 36.166667\n", + " 297.666667\n", " \n", " \n", " 3\n", - " 2013-01-10 00:30:00\n", - " 257.0\n", - " 307.0\n", - " 315.0\n", - " 361.0\n", - " 317.0\n", - " 354.0\n", - " 271.0\n", - " 3198011.0\n", - " 695175.0\n", + " 2013-01-10 03:00:00\n", + " 253.166667\n", + " 256.166667\n", + " 242.666667\n", + " 265.333333\n", + " 211.666667\n", + " 226.666667\n", + " 181.000000\n", + " 3.198691e+06\n", + " 695911.333333\n", " ...\n", - " 10.5\n", - " 3132179.0\n", - " 43.0\n", - " 51.0\n", - " 53.0\n", - " 62.0\n", - " 53.0\n", - " 60.0\n", - " 45.0\n", - " 357.0\n", + " 8.433333\n", + " 3.137001e+06\n", + " 42.333333\n", + " 42.833333\n", + " 40.500000\n", + " 44.166667\n", + " 35.333333\n", + " 37.833333\n", + " 30.333333\n", + " 268.000000\n", " \n", " \n", " 4\n", - " 2013-01-10 00:40:00\n", - " 267.0\n", - " 309.0\n", - " 314.0\n", - " 355.0\n", - " 262.0\n", - " 246.0\n", - " 212.0\n", - " 3198056.0\n", - " 695226.0\n", + " 2013-01-10 04:00:00\n", + " 305.000000\n", + " 312.333333\n", + " 346.166667\n", + " 329.833333\n", + " 280.666667\n", + " 308.833333\n", + " 271.833333\n", + " 3.198978e+06\n", + " 696195.833333\n", " ...\n", - " 9.6\n", - " 3132501.0\n", - " 45.0\n", - " 51.0\n", - " 54.0\n", - " 59.0\n", - " 43.0\n", - " 41.0\n", - " 36.0\n", - " 322.0\n", + " 9.083333\n", + " 3.138843e+06\n", + " 50.500000\n", + " 51.166667\n", + " 55.500000\n", + " 53.666667\n", + " 46.166667\n", + " 49.666667\n", + " 41.166667\n", + " 341.833333\n", " \n", " \n", "\n", @@ -773,38 +890,38 @@ "" ], "text/plain": [ - " timestamp value_S01 value_S02 value_S03 value_S04 value_S05 \\\n", - "0 2013-01-10 00:00:00 323.0 320.0 284.0 348.0 273.0 \n", - "1 2013-01-10 00:10:00 346.0 384.0 367.0 411.0 331.0 \n", - "2 2013-01-10 00:20:00 407.0 363.0 407.0 393.0 275.0 \n", - "3 2013-01-10 00:30:00 257.0 307.0 315.0 361.0 317.0 \n", - "4 2013-01-10 00:40:00 267.0 309.0 314.0 355.0 262.0 \n", + " timestamp value_S01 value_S02 value_S03 value_S04 \\\n", + "0 2013-01-10 00:00:00 313.333333 323.833333 336.000000 364.666667 \n", + "1 2013-01-10 01:00:00 197.500000 221.333333 216.000000 260.666667 \n", + "2 2013-01-10 02:00:00 248.166667 271.666667 277.500000 298.000000 \n", + "3 2013-01-10 03:00:00 253.166667 256.166667 242.666667 265.333333 \n", + "4 2013-01-10 04:00:00 305.000000 312.333333 346.166667 329.833333 \n", "\n", - " value_S06 value_S07 value_S08 value_S09 ... value_S17 value_S18 \\\n", - "0 342.0 280.0 3197842.0 695000.0 ... 11.7 3131020.0 \n", - "1 360.0 249.0 3197900.0 695063.0 ... 10.2 3131420.0 \n", - "2 335.0 270.0 3197968.0 695124.0 ... 9.5 3131822.0 \n", - "3 354.0 271.0 3198011.0 695175.0 ... 10.5 3132179.0 \n", - "4 246.0 212.0 3198056.0 695226.0 ... 9.6 3132501.0 \n", + " value_S05 value_S06 value_S07 value_S08 value_S09 ... 
\\\n", + "0 286.500000 314.000000 243.166667 3.197980e+06 695143.166667 ... \n", + "1 206.833333 235.833333 186.666667 3.198221e+06 695403.666667 ... \n", + "2 233.666667 271.166667 216.333333 3.198448e+06 695656.500000 ... \n", + "3 211.666667 226.666667 181.000000 3.198691e+06 695911.333333 ... \n", + "4 280.666667 308.833333 271.833333 3.198978e+06 696195.833333 ... \n", "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + " value_S17 value_S18 value_S19 value_S20 value_S21 value_S22 \\\n", + "0 10.383333 3.131958e+06 52.666667 54.333333 56.166667 61.000000 \n", + "1 8.666667 3.133668e+06 33.166667 37.000000 36.166667 43.666667 \n", + "2 8.833333 3.135413e+06 41.500000 45.666667 46.500000 49.666667 \n", + "3 8.433333 3.137001e+06 42.333333 42.833333 40.500000 44.166667 \n", + "4 9.083333 3.138843e+06 50.500000 51.166667 55.500000 53.666667 \n", "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", + " value_S23 value_S24 value_S25 value_S26 \n", + "0 47.666667 52.666667 40.833333 357.333333 \n", + "1 34.500000 39.333333 31.166667 249.666667 \n", + "2 39.333333 45.500000 36.166667 297.666667 \n", + "3 35.333333 37.833333 30.333333 268.000000 \n", + "4 46.166667 49.666667 41.166667 341.833333 \n", "\n", "[5 rows x 27 columns]" ] }, - "execution_count": 13, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -826,17 +943,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ - "step = 2\n", + "step = 3\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -845,7 +962,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'X', 'y', 'timestamp'])" ] }, - "execution_count": 15, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -856,21 +973,21 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 2013-01-10 00:00:00\n", - "1 2013-01-10 00:10:00\n", - "2 2013-01-10 00:20:00\n", - "3 2013-01-10 00:30:00\n", - "4 2013-01-10 00:40:00\n", + "1 2013-01-10 01:00:00\n", + "2 2013-01-10 02:00:00\n", + "3 2013-01-10 03:00:00\n", + "4 2013-01-10 04:00:00\n", "Name: timestamp, dtype: datetime64[ns]" ] }, - "execution_count": 16, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -881,7 +998,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -931,123 +1048,123 @@ " \n", " \n", " 0\n", - " 323.0\n", - " 320.0\n", - " 284.0\n", - " 348.0\n", - " 273.0\n", - " 342.0\n", - " 280.0\n", - " 3197842.0\n", - " 695000.0\n", - " 3348234.0\n", + " 313.333333\n", + " 323.833333\n", + " 336.000000\n", + " 364.666667\n", + " 286.500000\n", + " 314.000000\n", + " 243.166667\n", + " 3.197980e+06\n", + " 695143.166667\n", + " 3.348384e+06\n", " ...\n", - " 11.7\n", - " 3131020.0\n", - " 55.0\n", - " 55.0\n", - " 47.0\n", - " 58.0\n", - " 45.0\n", - " 58.0\n", - " 47.0\n", - " 356.0\n", + " 10.383333\n", + " 3.131958e+06\n", + " 52.666667\n", + " 54.333333\n", + " 56.166667\n", + " 61.000000\n", + " 
47.666667\n", + " 52.666667\n", + " 40.833333\n", + " 357.333333\n", " \n", " \n", " 1\n", - " 346.0\n", - " 384.0\n", - " 367.0\n", - " 411.0\n", - " 331.0\n", - " 360.0\n", - " 249.0\n", - " 3197900.0\n", - " 695063.0\n", - " 3348296.0\n", + " 197.500000\n", + " 221.333333\n", + " 216.000000\n", + " 260.666667\n", + " 206.833333\n", + " 235.833333\n", + " 186.666667\n", + " 3.198221e+06\n", + " 695403.666667\n", + " 3.348651e+06\n", " ...\n", - " 10.2\n", - " 3131420.0\n", - " 58.0\n", - " 63.0\n", - " 62.0\n", - " 67.0\n", - " 55.0\n", - " 61.0\n", - " 42.0\n", - " 400.0\n", + " 8.666667\n", + " 3.133668e+06\n", + " 33.166667\n", + " 37.000000\n", + " 36.166667\n", + " 43.666667\n", + " 34.500000\n", + " 39.333333\n", + " 31.166667\n", + " 249.666667\n", " \n", " \n", " 2\n", - " 407.0\n", - " 363.0\n", - " 407.0\n", - " 393.0\n", - " 275.0\n", - " 335.0\n", - " 270.0\n", - " 3197968.0\n", - " 695124.0\n", - " 3348363.0\n", + " 248.166667\n", + " 271.666667\n", + " 277.500000\n", + " 298.000000\n", + " 233.666667\n", + " 271.166667\n", + " 216.333333\n", + " 3.198448e+06\n", + " 695656.500000\n", + " 3.348910e+06\n", " ...\n", - " 9.5\n", - " 3131822.0\n", - " 68.0\n", - " 61.0\n", - " 67.0\n", - " 66.0\n", - " 46.0\n", - " 55.0\n", - " 45.0\n", - " 402.0\n", + " 8.833333\n", + " 3.135413e+06\n", + " 41.500000\n", + " 45.666667\n", + " 46.500000\n", + " 49.666667\n", + " 39.333333\n", + " 45.500000\n", + " 36.166667\n", + " 297.666667\n", " \n", " \n", " 3\n", - " 257.0\n", - " 307.0\n", - " 315.0\n", - " 361.0\n", - " 317.0\n", - " 354.0\n", - " 271.0\n", - " 3198011.0\n", - " 695175.0\n", - " 3348416.0\n", + " 253.166667\n", + " 256.166667\n", + " 242.666667\n", + " 265.333333\n", + " 211.666667\n", + " 226.666667\n", + " 181.000000\n", + " 3.198691e+06\n", + " 695911.333333\n", + " 3.349157e+06\n", " ...\n", - " 10.5\n", - " 3132179.0\n", - " 43.0\n", - " 51.0\n", - " 53.0\n", - " 62.0\n", - " 53.0\n", - " 60.0\n", - " 45.0\n", - " 357.0\n", + " 8.433333\n", + " 3.137001e+06\n", + " 42.333333\n", + " 42.833333\n", + " 40.500000\n", + " 44.166667\n", + " 35.333333\n", + " 37.833333\n", + " 30.333333\n", + " 268.000000\n", " \n", " \n", " 4\n", - " 267.0\n", - " 309.0\n", - " 314.0\n", - " 355.0\n", - " 262.0\n", - " 246.0\n", - " 212.0\n", - " 3198056.0\n", - " 695226.0\n", - " 3348470.0\n", + " 305.000000\n", + " 312.333333\n", + " 346.166667\n", + " 329.833333\n", + " 280.666667\n", + " 308.833333\n", + " 271.833333\n", + " 3.198978e+06\n", + " 696195.833333\n", + " 3.349452e+06\n", " ...\n", - " 9.6\n", - " 3132501.0\n", - " 45.0\n", - " 51.0\n", - " 54.0\n", - " 59.0\n", - " 43.0\n", - " 41.0\n", - " 36.0\n", - " 322.0\n", + " 9.083333\n", + " 3.138843e+06\n", + " 50.500000\n", + " 51.166667\n", + " 55.500000\n", + " 53.666667\n", + " 46.166667\n", + " 49.666667\n", + " 41.166667\n", + " 341.833333\n", " \n", " \n", "\n", @@ -1055,38 +1172,38 @@ "" ], "text/plain": [ - " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", - "0 323.0 320.0 284.0 348.0 273.0 342.0 \n", - "1 346.0 384.0 367.0 411.0 331.0 360.0 \n", - "2 407.0 363.0 407.0 393.0 275.0 335.0 \n", - "3 257.0 307.0 315.0 361.0 317.0 354.0 \n", - "4 267.0 309.0 314.0 355.0 262.0 246.0 \n", + " value_S01 value_S02 value_S03 value_S04 value_S05 value_S06 \\\n", + "0 313.333333 323.833333 336.000000 364.666667 286.500000 314.000000 \n", + "1 197.500000 221.333333 216.000000 260.666667 206.833333 235.833333 \n", + "2 248.166667 271.666667 277.500000 298.000000 233.666667 271.166667 \n", + "3 253.166667 
256.166667 242.666667 265.333333 211.666667 226.666667 \n", + "4 305.000000 312.333333 346.166667 329.833333 280.666667 308.833333 \n", "\n", - " value_S07 value_S08 value_S09 value_S10 ... value_S17 value_S18 \\\n", - "0 280.0 3197842.0 695000.0 3348234.0 ... 11.7 3131020.0 \n", - "1 249.0 3197900.0 695063.0 3348296.0 ... 10.2 3131420.0 \n", - "2 270.0 3197968.0 695124.0 3348363.0 ... 9.5 3131822.0 \n", - "3 271.0 3198011.0 695175.0 3348416.0 ... 10.5 3132179.0 \n", - "4 212.0 3198056.0 695226.0 3348470.0 ... 9.6 3132501.0 \n", + " value_S07 value_S08 value_S09 value_S10 ... value_S17 \\\n", + "0 243.166667 3.197980e+06 695143.166667 3.348384e+06 ... 10.383333 \n", + "1 186.666667 3.198221e+06 695403.666667 3.348651e+06 ... 8.666667 \n", + "2 216.333333 3.198448e+06 695656.500000 3.348910e+06 ... 8.833333 \n", + "3 181.000000 3.198691e+06 695911.333333 3.349157e+06 ... 8.433333 \n", + "4 271.833333 3.198978e+06 696195.833333 3.349452e+06 ... 9.083333 \n", "\n", - " value_S19 value_S20 value_S21 value_S22 value_S23 value_S24 \\\n", - "0 55.0 55.0 47.0 58.0 45.0 58.0 \n", - "1 58.0 63.0 62.0 67.0 55.0 61.0 \n", - "2 68.0 61.0 67.0 66.0 46.0 55.0 \n", - "3 43.0 51.0 53.0 62.0 53.0 60.0 \n", - "4 45.0 51.0 54.0 59.0 43.0 41.0 \n", + " value_S18 value_S19 value_S20 value_S21 value_S22 value_S23 \\\n", + "0 3.131958e+06 52.666667 54.333333 56.166667 61.000000 47.666667 \n", + "1 3.133668e+06 33.166667 37.000000 36.166667 43.666667 34.500000 \n", + "2 3.135413e+06 41.500000 45.666667 46.500000 49.666667 39.333333 \n", + "3 3.137001e+06 42.333333 42.833333 40.500000 44.166667 35.333333 \n", + "4 3.138843e+06 50.500000 51.166667 55.500000 53.666667 46.166667 \n", "\n", - " value_S25 value_S26 \n", - "0 47.0 356.0 \n", - "1 42.0 400.0 \n", - "2 45.0 402.0 \n", - "3 45.0 357.0 \n", - "4 36.0 322.0 \n", + " value_S24 value_S25 value_S26 \n", + "0 52.666667 40.833333 357.333333 \n", + "1 39.333333 31.166667 249.666667 \n", + "2 45.500000 36.166667 297.666667 \n", + "3 37.833333 30.333333 268.000000 \n", + "4 49.666667 41.166667 341.833333 \n", "\n", "[5 rows x 26 columns]" ] }, - "execution_count": 17, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1108,17 +1225,26 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/sarah/anaconda3/envs/draco/lib/python3.8/site-packages/sklearn/impute/_base.py:356: FutureWarning: The 'verbose' parameter was deprecated in version 1.1 and will be removed in 1.3. 
A warning will always be raised upon the removal of empty columns in the future version.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "step = 3\n", + "step = 4\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1127,7 +1253,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1138,50 +1264,50 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[3.230000e+02, 3.200000e+02, 2.840000e+02, 3.480000e+02,\n", - " 2.730000e+02, 3.420000e+02, 2.800000e+02, 3.197842e+06,\n", - " 6.950000e+05, 3.348234e+06, 3.436762e+06, 3.322362e+06,\n", - " 3.357952e+06, 3.223797e+06, 8.300000e+00, 6.000000e+00,\n", - " 1.170000e+01, 3.131020e+06, 5.500000e+01, 5.500000e+01,\n", - " 4.700000e+01, 5.800000e+01, 4.500000e+01, 5.800000e+01,\n", - " 4.700000e+01, 3.560000e+02],\n", - " [3.460000e+02, 3.840000e+02, 3.670000e+02, 4.110000e+02,\n", - " 3.310000e+02, 3.600000e+02, 2.490000e+02, 3.197900e+06,\n", - " 6.950630e+05, 3.348296e+06, 3.436829e+06, 3.322417e+06,\n", - " 3.358013e+06, 3.223839e+06, 7.600000e+00, 5.000000e+00,\n", - " 1.020000e+01, 3.131420e+06, 5.800000e+01, 6.300000e+01,\n", - " 6.200000e+01, 6.700000e+01, 5.500000e+01, 6.100000e+01,\n", - " 4.200000e+01, 4.000000e+02],\n", - " [4.070000e+02, 3.630000e+02, 4.070000e+02, 3.930000e+02,\n", - " 2.750000e+02, 3.350000e+02, 2.700000e+02, 3.197968e+06,\n", - " 6.951240e+05, 3.348363e+06, 3.436895e+06, 3.322463e+06,\n", - " 3.358068e+06, 3.223884e+06, 7.800000e+00, 5.700000e+00,\n", - " 9.500000e+00, 3.131822e+06, 6.800000e+01, 6.100000e+01,\n", - " 6.700000e+01, 6.600000e+01, 4.600000e+01, 5.500000e+01,\n", - " 4.500000e+01, 4.020000e+02],\n", - " [2.570000e+02, 3.070000e+02, 3.150000e+02, 3.610000e+02,\n", - " 3.170000e+02, 3.540000e+02, 2.710000e+02, 3.198011e+06,\n", - " 6.951750e+05, 3.348416e+06, 3.436957e+06, 3.322516e+06,\n", - " 3.358128e+06, 3.223929e+06, 8.600000e+00, 6.600000e+00,\n", - " 1.050000e+01, 3.132179e+06, 4.300000e+01, 5.100000e+01,\n", - " 5.300000e+01, 6.200000e+01, 5.300000e+01, 6.000000e+01,\n", - " 4.500000e+01, 3.570000e+02],\n", - " [2.670000e+02, 3.090000e+02, 3.140000e+02, 3.550000e+02,\n", - " 2.620000e+02, 2.460000e+02, 2.120000e+02, 3.198056e+06,\n", - " 6.952260e+05, 3.348470e+06, 3.437016e+06, 3.322559e+06,\n", - " 3.358169e+06, 3.223965e+06, 7.500000e+00, 5.900000e+00,\n", - " 9.600000e+00, 3.132501e+06, 4.500000e+01, 5.100000e+01,\n", - " 5.400000e+01, 5.900000e+01, 4.300000e+01, 4.100000e+01,\n", - " 3.600000e+01, 3.220000e+02]])" + "array([[3.13333333e+02, 3.23833333e+02, 3.36000000e+02, 3.64666667e+02,\n", + " 2.86500000e+02, 3.14000000e+02, 2.43166667e+02, 3.19798000e+06,\n", + " 6.95143167e+05, 3.34838383e+06, 3.43692150e+06, 3.32248667e+06,\n", + " 3.35809000e+06, 3.22390150e+06, 7.95000000e+00, 5.85000000e+00,\n", + " 1.03833333e+01, 3.13195833e+06, 5.26666667e+01, 5.43333333e+01,\n", + " 5.61666667e+01, 6.10000000e+01, 4.76666667e+01, 5.26666667e+01,\n", + " 4.08333333e+01, 3.57333333e+02],\n", + " [1.97500000e+02, 2.21333333e+02, 2.16000000e+02, 2.60666667e+02,\n", + " 2.06833333e+02, 2.35833333e+02, 1.86666667e+02, 3.19822067e+06,\n", + " 6.95403667e+05, 3.34865117e+06, 3.43722283e+06, 3.32272200e+06,\n", + " 3.35834000e+06, 3.22409567e+06, 
6.83333333e+00, 5.15000000e+00,\n", + " 8.66666667e+00, 3.13366817e+06, 3.31666667e+01, 3.70000000e+01,\n", + " 3.61666667e+01, 4.36666667e+01, 3.45000000e+01, 3.93333333e+01,\n", + " 3.11666667e+01, 2.49666667e+02],\n", + " [2.48166667e+02, 2.71666667e+02, 2.77500000e+02, 2.98000000e+02,\n", + " 2.33666667e+02, 2.71166667e+02, 2.16333333e+02, 3.19844767e+06,\n", + " 6.95656500e+05, 3.34890967e+06, 3.43751900e+06, 3.32295950e+06,\n", + " 3.35862067e+06, 3.22432333e+06, 7.11666667e+00, 5.56666667e+00,\n", + " 8.83333333e+00, 3.13541283e+06, 4.15000000e+01, 4.56666667e+01,\n", + " 4.65000000e+01, 4.96666667e+01, 3.93333333e+01, 4.55000000e+01,\n", + " 3.61666667e+01, 2.97666667e+02],\n", + " [2.53166667e+02, 2.56166667e+02, 2.42666667e+02, 2.65333333e+02,\n", + " 2.11666667e+02, 2.26666667e+02, 1.81000000e+02, 3.19869117e+06,\n", + " 6.95911333e+05, 3.34915717e+06, 3.43778050e+06, 3.32316850e+06,\n", + " 3.35884883e+06, 3.22450217e+06, 6.71666667e+00, 5.16666667e+00,\n", + " 8.43333333e+00, 3.13700133e+06, 4.23333333e+01, 4.28333333e+01,\n", + " 4.05000000e+01, 4.41666667e+01, 3.53333333e+01, 3.78333333e+01,\n", + " 3.03333333e+01, 2.68000000e+02],\n", + " [3.05000000e+02, 3.12333333e+02, 3.46166667e+02, 3.29833333e+02,\n", + " 2.80666667e+02, 3.08833333e+02, 2.71833333e+02, 3.19897850e+06,\n", + " 6.96195833e+05, 3.34945200e+06, 3.43807767e+06, 3.32340933e+06,\n", + " 3.35910983e+06, 3.22471400e+06, 7.20000000e+00, 5.28333333e+00,\n", + " 9.08333333e+00, 3.13884333e+06, 5.05000000e+01, 5.11666667e+01,\n", + " 5.55000000e+01, 5.36666667e+01, 4.61666667e+01, 4.96666667e+01,\n", + " 4.11666667e+01, 3.41833333e+02]])" ] }, - "execution_count": 20, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1203,17 +1329,17 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "metadata": {}, "outputs": [], "source": [ - "step = 4\n", + "step = 5\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "metadata": {}, "outputs": [ { @@ -1222,7 +1348,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 22, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1233,45 +1359,45 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([[-0.23563892, -0.24267292, -0.3286385 , -0.17702227, -0.35287222,\n", - " -0.19248826, -0.3317757 , -1. , -1. , -1. ,\n", - " -1. , -1. , -1. , -1. 
, -0.11702128,\n", - " -0.24050633, -0.25714286, -0.37378787, -0.22758621, -0.22758621,\n", - " -0.31972789, -0.1862069 , -0.36986301, -0.1862069 , -0.33793103,\n", - " -0.26141079],\n", - " [-0.18171161, -0.0926143 , -0.13380282, -0.02930832, -0.21688159,\n", - " -0.15023474, -0.40420561, -0.99995911, -0.99995779, -0.99995941,\n", - " -0.99995718, -0.99996326, -0.99996042, -0.99997164, -0.19148936,\n", - " -0.36708861, -0.35238095, -0.37370786, -0.1862069 , -0.11724138,\n", - " -0.11564626, -0.06206897, -0.23287671, -0.14482759, -0.40689655,\n", - " -0.17012448],\n", - " [-0.03868699, -0.14185229, -0.0399061 , -0.07151231, -0.34818288,\n", - " -0.20892019, -0.35514019, -0.99991116, -0.99991693, -0.99991555,\n", - " -0.999915 , -0.99993254, -0.99992474, -0.99994125, -0.17021277,\n", - " -0.27848101, -0.3968254 , -0.37362746, -0.04827586, -0.14482759,\n", - " -0.04761905, -0.07586207, -0.35616438, -0.22758621, -0.36551724,\n", - " -0.1659751 ],\n", - " [-0.39038687, -0.27315358, -0.25586854, -0.14654162, -0.24970692,\n", - " -0.16431925, -0.35280374, -0.99988085, -0.99988276, -0.99988086,\n", - " -0.99987538, -0.99989714, -0.99988581, -0.99991086, -0.08510638,\n", - " -0.16455696, -0.33333333, -0.37355606, -0.39310345, -0.28275862,\n", - " -0.23809524, -0.13103448, -0.26027397, -0.15862069, -0.36551724,\n", - " -0.2593361 ],\n", - " [-0.36694021, -0.26846424, -0.25821596, -0.16060961, -0.37866354,\n", - " -0.41784038, -0.49065421, -0.99984912, -0.99984859, -0.99984551,\n", - " -0.99983767, -0.99986841, -0.99985921, -0.99988655, -0.20212766,\n", - " -0.25316456, -0.39047619, -0.37349166, -0.36551724, -0.28275862,\n", - " -0.2244898 , -0.17241379, -0.39726027, -0.42068966, -0.48965517,\n", - " -0.33195021]])" + "array([[-0.26126126, -0.23706897, -0.20870076, -0.14106583, -0.32328767,\n", + " -0.25969448, -0.42198789, -1. , -1. , -1. ,\n", + " -1. , -1. , -1. , -1. 
, -0.11007463,\n", + " -0.16824645, -0.10424155, -0.37397741, -0.25233645, -0.22716628,\n", + " -0.20140515, -0.13481829, -0.32239156, -0.25380117, -0.4182243 ,\n", + " -0.25697453],\n", + " [-0.53349001, -0.47805643, -0.49088771, -0.38557994, -0.51037182,\n", + " -0.44339992, -0.55438391, -0.99983031, -0.99982547, -0.99982499,\n", + " -0.99980741, -0.9998428 , -0.99983779, -0.99986887, -0.23507463,\n", + " -0.26777251, -0.25233645, -0.37363511, -0.52570093, -0.470726 ,\n", + " -0.4824356 , -0.37866354, -0.50762016, -0.44093567, -0.55373832,\n", + " -0.48085254],\n", + " [-0.41441441, -0.35971787, -0.3462669 , -0.29780564, -0.44735812,\n", + " -0.36036036, -0.48486624, -0.99967026, -0.99965608, -0.99965576,\n", + " -0.99961813, -0.99968416, -0.99965569, -0.99971512, -0.20335821,\n", + " -0.20853081, -0.2379583 , -0.37328583, -0.4088785 , -0.34894614,\n", + " -0.33723653, -0.29425557, -0.43962485, -0.35438596, -0.48364486,\n", + " -0.38104315],\n", + " [-0.40266353, -0.39615987, -0.4281795 , -0.37460815, -0.49902153,\n", + " -0.4649432 , -0.56766257, -0.99949857, -0.99948535, -0.99949373,\n", + " -0.999451 , -0.99954455, -0.99950765, -0.99959435, -0.24813433,\n", + " -0.26540284, -0.27246585, -0.37296782, -0.39719626, -0.38875878,\n", + " -0.42154567, -0.37162954, -0.49589683, -0.4619883 , -0.56542056,\n", + " -0.4427309 ],\n", + " [-0.28084606, -0.26410658, -0.18479326, -0.22296238, -0.3369863 ,\n", + " -0.27183705, -0.35481351, -0.99929598, -0.99929474, -0.99930071,\n", + " -0.99926107, -0.99938368, -0.99933831, -0.9994513 , -0.19402985,\n", + " -0.24881517, -0.21639109, -0.37259906, -0.28271028, -0.27166276,\n", + " -0.21077283, -0.23798359, -0.34349355, -0.29590643, -0.4135514 ,\n", + " -0.28920464]])" ] }, - "execution_count": 23, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1293,17 +1419,17 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "step = 5\n", + "step = 6\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1312,7 +1438,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 25, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -1323,7 +1449,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -1373,123 +1499,123 @@ " \n", " \n", " 0\n", - " -0.235639\n", - " -0.242673\n", - " -0.328638\n", - " -0.177022\n", - " -0.352872\n", - " -0.192488\n", - " -0.331776\n", + " -0.261261\n", + " -0.237069\n", + " -0.208701\n", + " -0.141066\n", + " -0.323288\n", + " -0.259694\n", + " -0.421988\n", " -1.000000\n", " -1.000000\n", " -1.000000\n", " ...\n", - " -0.257143\n", - " -0.373788\n", - " -0.227586\n", - " -0.227586\n", - " -0.319728\n", - " -0.186207\n", - " -0.369863\n", - " -0.186207\n", - " -0.337931\n", - " -0.261411\n", + " -0.104242\n", + " -0.373977\n", + " -0.252336\n", + " -0.227166\n", + " -0.201405\n", + " -0.134818\n", + " -0.322392\n", + " -0.253801\n", + " -0.418224\n", + " -0.256975\n", " \n", " \n", " 1\n", - " -0.181712\n", - " -0.092614\n", - " -0.133803\n", - " -0.029308\n", - " -0.216882\n", - " -0.150235\n", - " -0.404206\n", - " -0.999959\n", - " -0.999958\n", - " -0.999959\n", + " -0.533490\n", + " -0.478056\n", + " -0.490888\n", + " -0.385580\n", + " -0.510372\n", + " -0.443400\n", + " 
-0.554384\n", + " -0.999830\n", + " -0.999825\n", + " -0.999825\n", " ...\n", - " -0.352381\n", - " -0.373708\n", - " -0.186207\n", - " -0.117241\n", - " -0.115646\n", - " -0.062069\n", - " -0.232877\n", - " -0.144828\n", - " -0.406897\n", - " -0.170124\n", + " -0.252336\n", + " -0.373635\n", + " -0.525701\n", + " -0.470726\n", + " -0.482436\n", + " -0.378664\n", + " -0.507620\n", + " -0.440936\n", + " -0.553738\n", + " -0.480853\n", " \n", " \n", " 2\n", - " -0.038687\n", - " -0.141852\n", - " -0.039906\n", - " -0.071512\n", - " -0.348183\n", - " -0.208920\n", - " -0.355140\n", - " -0.999911\n", - " -0.999917\n", - " -0.999916\n", + " -0.414414\n", + " -0.359718\n", + " -0.346267\n", + " -0.297806\n", + " -0.447358\n", + " -0.360360\n", + " -0.484866\n", + " -0.999670\n", + " -0.999656\n", + " -0.999656\n", " ...\n", - " -0.396825\n", - " -0.373627\n", - " -0.048276\n", - " -0.144828\n", - " -0.047619\n", - " -0.075862\n", - " -0.356164\n", - " -0.227586\n", - " -0.365517\n", - " -0.165975\n", + " -0.237958\n", + " -0.373286\n", + " -0.408879\n", + " -0.348946\n", + " -0.337237\n", + " -0.294256\n", + " -0.439625\n", + " -0.354386\n", + " -0.483645\n", + " -0.381043\n", " \n", " \n", " 3\n", - " -0.390387\n", - " -0.273154\n", - " -0.255869\n", - " -0.146542\n", - " -0.249707\n", - " -0.164319\n", - " -0.352804\n", - " -0.999881\n", - " -0.999883\n", - " -0.999881\n", + " -0.402664\n", + " -0.396160\n", + " -0.428180\n", + " -0.374608\n", + " -0.499022\n", + " -0.464943\n", + " -0.567663\n", + " -0.999499\n", + " -0.999485\n", + " -0.999494\n", " ...\n", - " -0.333333\n", - " -0.373556\n", - " -0.393103\n", - " -0.282759\n", - " -0.238095\n", - " -0.131034\n", - " -0.260274\n", - " -0.158621\n", - " -0.365517\n", - " -0.259336\n", + " -0.272466\n", + " -0.372968\n", + " -0.397196\n", + " -0.388759\n", + " -0.421546\n", + " -0.371630\n", + " -0.495897\n", + " -0.461988\n", + " -0.565421\n", + " -0.442731\n", " \n", " \n", " 4\n", - " -0.366940\n", - " -0.268464\n", - " -0.258216\n", - " -0.160610\n", - " -0.378664\n", - " -0.417840\n", - " -0.490654\n", - " -0.999849\n", - " -0.999849\n", - " -0.999846\n", + " -0.280846\n", + " -0.264107\n", + " -0.184793\n", + " -0.222962\n", + " -0.336986\n", + " -0.271837\n", + " -0.354814\n", + " -0.999296\n", + " -0.999295\n", + " -0.999301\n", " ...\n", - " -0.390476\n", - " -0.373492\n", - " -0.365517\n", - " -0.282759\n", - " -0.224490\n", - " -0.172414\n", - " -0.397260\n", - " -0.420690\n", - " -0.489655\n", - " -0.331950\n", + " -0.216391\n", + " -0.372599\n", + " -0.282710\n", + " -0.271663\n", + " -0.210773\n", + " -0.237984\n", + " -0.343494\n", + " -0.295906\n", + " -0.413551\n", + " -0.289205\n", " \n", " \n", "\n", @@ -1498,30 +1624,30 @@ ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", - "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", - "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", - "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", - "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", - "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 
-0.271837 -0.354814 \n", "\n", " 7 8 9 ... 16 17 18 19 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.257143 -0.373788 -0.227586 -0.227586 \n", - "1 -0.999959 -0.999958 -0.999959 ... -0.352381 -0.373708 -0.186207 -0.117241 \n", - "2 -0.999911 -0.999917 -0.999916 ... -0.396825 -0.373627 -0.048276 -0.144828 \n", - "3 -0.999881 -0.999883 -0.999881 ... -0.333333 -0.373556 -0.393103 -0.282759 \n", - "4 -0.999849 -0.999849 -0.999846 ... -0.390476 -0.373492 -0.365517 -0.282759 \n", + "0 -1.000000 -1.000000 -1.000000 ... -0.104242 -0.373977 -0.252336 -0.227166 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.252336 -0.373635 -0.525701 -0.470726 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.237958 -0.373286 -0.408879 -0.348946 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.272466 -0.372968 -0.397196 -0.388759 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.216391 -0.372599 -0.282710 -0.271663 \n", "\n", " 20 21 22 23 24 25 \n", - "0 -0.319728 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 \n", - "1 -0.115646 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 \n", - "2 -0.047619 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 \n", - "3 -0.238095 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 \n", - "4 -0.224490 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 \n", + "0 -0.201405 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 \n", + "1 -0.482436 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 \n", + "2 -0.337237 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 \n", + "3 -0.421546 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 \n", + "4 -0.210773 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 \n", "\n", "[5 rows x 26 columns]" ] }, - "execution_count": 26, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1543,17 +1669,17 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 30, "metadata": {}, "outputs": [], "source": [ - "step = 6\n", + "step = 7\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "metadata": {}, "outputs": [ { @@ -1562,7 +1688,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 28, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1573,7 +1699,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": {}, "outputs": [ { @@ -1623,122 +1749,122 @@ " \n", " \n", " 0\n", - " -0.235639\n", - " -0.242673\n", - " -0.328638\n", - " -0.177022\n", - " -0.352872\n", - " -0.192488\n", - " -0.331776\n", + " -0.261261\n", + " -0.237069\n", + " -0.208701\n", + " -0.141066\n", + " -0.323288\n", + " -0.259694\n", + " -0.421988\n", " -1.000000\n", " -1.000000\n", " -1.000000\n", " ...\n", - " -0.373788\n", - " -0.227586\n", - " -0.227586\n", - " -0.319728\n", - " -0.186207\n", - " -0.369863\n", - " -0.186207\n", - " -0.337931\n", - " -0.261411\n", + " -0.373977\n", + " -0.252336\n", + " -0.227166\n", + " -0.201405\n", + " -0.134818\n", + " -0.322392\n", + " -0.253801\n", + " -0.418224\n", + " -0.256975\n", " T001\n", " \n", " \n", " 1\n", - " -0.181712\n", - " -0.092614\n", - " -0.133803\n", - " -0.029308\n", - " -0.216882\n", - " -0.150235\n", - " -0.404206\n", - " -0.999959\n", - " -0.999958\n", - " -0.999959\n", + " -0.533490\n", + " -0.478056\n", + " -0.490888\n", + " -0.385580\n", + " -0.510372\n", + " -0.443400\n", + " -0.554384\n", + " -0.999830\n", + " -0.999825\n", + " -0.999825\n", " ...\n", - " -0.373708\n", - " 
-0.186207\n", - " -0.117241\n", - " -0.115646\n", - " -0.062069\n", - " -0.232877\n", - " -0.144828\n", - " -0.406897\n", - " -0.170124\n", + " -0.373635\n", + " -0.525701\n", + " -0.470726\n", + " -0.482436\n", + " -0.378664\n", + " -0.507620\n", + " -0.440936\n", + " -0.553738\n", + " -0.480853\n", " T001\n", " \n", " \n", " 2\n", - " -0.038687\n", - " -0.141852\n", - " -0.039906\n", - " -0.071512\n", - " -0.348183\n", - " -0.208920\n", - " -0.355140\n", - " -0.999911\n", - " -0.999917\n", - " -0.999916\n", + " -0.414414\n", + " -0.359718\n", + " -0.346267\n", + " -0.297806\n", + " -0.447358\n", + " -0.360360\n", + " -0.484866\n", + " -0.999670\n", + " -0.999656\n", + " -0.999656\n", " ...\n", - " -0.373627\n", - " -0.048276\n", - " -0.144828\n", - " -0.047619\n", - " -0.075862\n", - " -0.356164\n", - " -0.227586\n", - " -0.365517\n", - " -0.165975\n", + " -0.373286\n", + " -0.408879\n", + " -0.348946\n", + " -0.337237\n", + " -0.294256\n", + " -0.439625\n", + " -0.354386\n", + " -0.483645\n", + " -0.381043\n", " T001\n", " \n", " \n", " 3\n", - " -0.390387\n", - " -0.273154\n", - " -0.255869\n", - " -0.146542\n", - " -0.249707\n", - " -0.164319\n", - " -0.352804\n", - " -0.999881\n", - " -0.999883\n", - " -0.999881\n", + " -0.402664\n", + " -0.396160\n", + " -0.428180\n", + " -0.374608\n", + " -0.499022\n", + " -0.464943\n", + " -0.567663\n", + " -0.999499\n", + " -0.999485\n", + " -0.999494\n", " ...\n", - " -0.373556\n", - " -0.393103\n", - " -0.282759\n", - " -0.238095\n", - " -0.131034\n", - " -0.260274\n", - " -0.158621\n", - " -0.365517\n", - " -0.259336\n", + " -0.372968\n", + " -0.397196\n", + " -0.388759\n", + " -0.421546\n", + " -0.371630\n", + " -0.495897\n", + " -0.461988\n", + " -0.565421\n", + " -0.442731\n", " T001\n", " \n", " \n", " 4\n", - " -0.366940\n", - " -0.268464\n", - " -0.258216\n", - " -0.160610\n", - " -0.378664\n", - " -0.417840\n", - " -0.490654\n", - " -0.999849\n", - " -0.999849\n", - " -0.999846\n", + " -0.280846\n", + " -0.264107\n", + " -0.184793\n", + " -0.222962\n", + " -0.336986\n", + " -0.271837\n", + " -0.354814\n", + " -0.999296\n", + " -0.999295\n", + " -0.999301\n", " ...\n", - " -0.373492\n", - " -0.365517\n", - " -0.282759\n", - " -0.224490\n", - " -0.172414\n", - " -0.397260\n", - " -0.420690\n", - " -0.489655\n", - " -0.331950\n", + " -0.372599\n", + " -0.282710\n", + " -0.271663\n", + " -0.210773\n", + " -0.237984\n", + " -0.343494\n", + " -0.295906\n", + " -0.413551\n", + " -0.289205\n", " T001\n", " \n", " \n", @@ -1748,30 +1874,30 @@ ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", - "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", - "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", - "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", - "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", - "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", "\n", " 7 8 9 ... 17 18 19 20 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.373788 -0.227586 -0.227586 -0.319728 \n", - "1 -0.999959 -0.999958 -0.999959 ... 
-0.373708 -0.186207 -0.117241 -0.115646 \n", - "2 -0.999911 -0.999917 -0.999916 ... -0.373627 -0.048276 -0.144828 -0.047619 \n", - "3 -0.999881 -0.999883 -0.999881 ... -0.373556 -0.393103 -0.282759 -0.238095 \n", - "4 -0.999849 -0.999849 -0.999846 ... -0.373492 -0.365517 -0.282759 -0.224490 \n", + "0 -1.000000 -1.000000 -1.000000 ... -0.373977 -0.252336 -0.227166 -0.201405 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.373635 -0.525701 -0.470726 -0.482436 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.373286 -0.408879 -0.348946 -0.337237 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.372968 -0.397196 -0.388759 -0.421546 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.372599 -0.282710 -0.271663 -0.210773 \n", "\n", " 21 22 23 24 25 turbine_id \n", - "0 -0.186207 -0.369863 -0.186207 -0.337931 -0.261411 T001 \n", - "1 -0.062069 -0.232877 -0.144828 -0.406897 -0.170124 T001 \n", - "2 -0.075862 -0.356164 -0.227586 -0.365517 -0.165975 T001 \n", - "3 -0.131034 -0.260274 -0.158621 -0.365517 -0.259336 T001 \n", - "4 -0.172414 -0.397260 -0.420690 -0.489655 -0.331950 T001 \n", + "0 -0.134818 -0.322392 -0.253801 -0.418224 -0.256975 T001 \n", + "1 -0.378664 -0.507620 -0.440936 -0.553738 -0.480853 T001 \n", + "2 -0.294256 -0.439625 -0.354386 -0.483645 -0.381043 T001 \n", + "3 -0.371630 -0.495897 -0.461988 -0.565421 -0.442731 T001 \n", + "4 -0.237984 -0.343494 -0.295906 -0.413551 -0.289205 T001 \n", "\n", "[5 rows x 27 columns]" ] }, - "execution_count": 29, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1793,17 +1919,17 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 33, "metadata": {}, "outputs": [], "source": [ - "step = 7\n", + "step = 8\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 34, "metadata": {}, "outputs": [ { @@ -1812,7 +1938,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 31, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } @@ -1823,7 +1949,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 35, "metadata": {}, "outputs": [ { @@ -1873,123 +1999,123 @@ " \n", " \n", " 0\n", - " -0.235639\n", - " -0.242673\n", - " -0.328638\n", - " -0.177022\n", - " -0.352872\n", - " -0.192488\n", - " -0.331776\n", + " -0.261261\n", + " -0.237069\n", + " -0.208701\n", + " -0.141066\n", + " -0.323288\n", + " -0.259694\n", + " -0.421988\n", " -1.000000\n", " -1.000000\n", " -1.000000\n", " ...\n", - " -0.227586\n", - " -0.227586\n", - " -0.319728\n", - " -0.186207\n", - " -0.369863\n", - " -0.186207\n", - " -0.337931\n", - " -0.261411\n", + " -0.252336\n", + " -0.227166\n", + " -0.201405\n", + " -0.134818\n", + " -0.322392\n", + " -0.253801\n", + " -0.418224\n", + " -0.256975\n", " T001\n", " 2013-01-10 00:00:00\n", " \n", " \n", " 1\n", - " -0.181712\n", - " -0.092614\n", - " -0.133803\n", - " -0.029308\n", - " -0.216882\n", - " -0.150235\n", - " -0.404206\n", - " -0.999959\n", - " -0.999958\n", - " -0.999959\n", + " -0.533490\n", + " -0.478056\n", + " -0.490888\n", + " -0.385580\n", + " -0.510372\n", + " -0.443400\n", + " -0.554384\n", + " -0.999830\n", + " -0.999825\n", + " -0.999825\n", " ...\n", - " -0.186207\n", - " -0.117241\n", - " -0.115646\n", - " -0.062069\n", - " -0.232877\n", - " -0.144828\n", - " -0.406897\n", - " -0.170124\n", + " -0.525701\n", + " -0.470726\n", + " -0.482436\n", + " -0.378664\n", + " -0.507620\n", + " -0.440936\n", + " 
-0.553738\n", + " -0.480853\n", " T001\n", - " 2013-01-10 00:10:00\n", + " 2013-01-10 01:00:00\n", " \n", " \n", " 2\n", - " -0.038687\n", - " -0.141852\n", - " -0.039906\n", - " -0.071512\n", - " -0.348183\n", - " -0.208920\n", - " -0.355140\n", - " -0.999911\n", - " -0.999917\n", - " -0.999916\n", + " -0.414414\n", + " -0.359718\n", + " -0.346267\n", + " -0.297806\n", + " -0.447358\n", + " -0.360360\n", + " -0.484866\n", + " -0.999670\n", + " -0.999656\n", + " -0.999656\n", " ...\n", - " -0.048276\n", - " -0.144828\n", - " -0.047619\n", - " -0.075862\n", - " -0.356164\n", - " -0.227586\n", - " -0.365517\n", - " -0.165975\n", + " -0.408879\n", + " -0.348946\n", + " -0.337237\n", + " -0.294256\n", + " -0.439625\n", + " -0.354386\n", + " -0.483645\n", + " -0.381043\n", " T001\n", - " 2013-01-10 00:20:00\n", + " 2013-01-10 02:00:00\n", " \n", " \n", " 3\n", - " -0.390387\n", - " -0.273154\n", - " -0.255869\n", - " -0.146542\n", - " -0.249707\n", - " -0.164319\n", - " -0.352804\n", - " -0.999881\n", - " -0.999883\n", - " -0.999881\n", + " -0.402664\n", + " -0.396160\n", + " -0.428180\n", + " -0.374608\n", + " -0.499022\n", + " -0.464943\n", + " -0.567663\n", + " -0.999499\n", + " -0.999485\n", + " -0.999494\n", " ...\n", - " -0.393103\n", - " -0.282759\n", - " -0.238095\n", - " -0.131034\n", - " -0.260274\n", - " -0.158621\n", - " -0.365517\n", - " -0.259336\n", + " -0.397196\n", + " -0.388759\n", + " -0.421546\n", + " -0.371630\n", + " -0.495897\n", + " -0.461988\n", + " -0.565421\n", + " -0.442731\n", " T001\n", - " 2013-01-10 00:30:00\n", + " 2013-01-10 03:00:00\n", " \n", " \n", " 4\n", - " -0.366940\n", - " -0.268464\n", - " -0.258216\n", - " -0.160610\n", - " -0.378664\n", - " -0.417840\n", - " -0.490654\n", - " -0.999849\n", - " -0.999849\n", - " -0.999846\n", + " -0.280846\n", + " -0.264107\n", + " -0.184793\n", + " -0.222962\n", + " -0.336986\n", + " -0.271837\n", + " -0.354814\n", + " -0.999296\n", + " -0.999295\n", + " -0.999301\n", " ...\n", - " -0.365517\n", - " -0.282759\n", - " -0.224490\n", - " -0.172414\n", - " -0.397260\n", - " -0.420690\n", - " -0.489655\n", - " -0.331950\n", + " -0.282710\n", + " -0.271663\n", + " -0.210773\n", + " -0.237984\n", + " -0.343494\n", + " -0.295906\n", + " -0.413551\n", + " -0.289205\n", " T001\n", - " 2013-01-10 00:40:00\n", + " 2013-01-10 04:00:00\n", " \n", " \n", "\n", @@ -1998,30 +2124,30 @@ ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", - "0 -0.235639 -0.242673 -0.328638 -0.177022 -0.352872 -0.192488 -0.331776 \n", - "1 -0.181712 -0.092614 -0.133803 -0.029308 -0.216882 -0.150235 -0.404206 \n", - "2 -0.038687 -0.141852 -0.039906 -0.071512 -0.348183 -0.208920 -0.355140 \n", - "3 -0.390387 -0.273154 -0.255869 -0.146542 -0.249707 -0.164319 -0.352804 \n", - "4 -0.366940 -0.268464 -0.258216 -0.160610 -0.378664 -0.417840 -0.490654 \n", + "0 -0.261261 -0.237069 -0.208701 -0.141066 -0.323288 -0.259694 -0.421988 \n", + "1 -0.533490 -0.478056 -0.490888 -0.385580 -0.510372 -0.443400 -0.554384 \n", + "2 -0.414414 -0.359718 -0.346267 -0.297806 -0.447358 -0.360360 -0.484866 \n", + "3 -0.402664 -0.396160 -0.428180 -0.374608 -0.499022 -0.464943 -0.567663 \n", + "4 -0.280846 -0.264107 -0.184793 -0.222962 -0.336986 -0.271837 -0.354814 \n", "\n", " 7 8 9 ... 18 19 20 21 \\\n", - "0 -1.000000 -1.000000 -1.000000 ... -0.227586 -0.227586 -0.319728 -0.186207 \n", - "1 -0.999959 -0.999958 -0.999959 ... -0.186207 -0.117241 -0.115646 -0.062069 \n", - "2 -0.999911 -0.999917 -0.999916 ... -0.048276 -0.144828 -0.047619 -0.075862 \n", - "3 -0.999881 -0.999883 -0.999881 ... 
-0.393103 -0.282759 -0.238095 -0.131034 \n", - "4 -0.999849 -0.999849 -0.999846 ... -0.365517 -0.282759 -0.224490 -0.172414 \n", + "0 -1.000000 -1.000000 -1.000000 ... -0.252336 -0.227166 -0.201405 -0.134818 \n", + "1 -0.999830 -0.999825 -0.999825 ... -0.525701 -0.470726 -0.482436 -0.378664 \n", + "2 -0.999670 -0.999656 -0.999656 ... -0.408879 -0.348946 -0.337237 -0.294256 \n", + "3 -0.999499 -0.999485 -0.999494 ... -0.397196 -0.388759 -0.421546 -0.371630 \n", + "4 -0.999296 -0.999295 -0.999301 ... -0.282710 -0.271663 -0.210773 -0.237984 \n", "\n", " 22 23 24 25 turbine_id timestamp \n", - "0 -0.369863 -0.186207 -0.337931 -0.261411 T001 2013-01-10 00:00:00 \n", - "1 -0.232877 -0.144828 -0.406897 -0.170124 T001 2013-01-10 00:10:00 \n", - "2 -0.356164 -0.227586 -0.365517 -0.165975 T001 2013-01-10 00:20:00 \n", - "3 -0.260274 -0.158621 -0.365517 -0.259336 T001 2013-01-10 00:30:00 \n", - "4 -0.397260 -0.420690 -0.489655 -0.331950 T001 2013-01-10 00:40:00 \n", + "0 -0.322392 -0.253801 -0.418224 -0.256975 T001 2013-01-10 00:00:00 \n", + "1 -0.507620 -0.440936 -0.553738 -0.480853 T001 2013-01-10 01:00:00 \n", + "2 -0.439625 -0.354386 -0.483645 -0.381043 T001 2013-01-10 02:00:00 \n", + "3 -0.495897 -0.461988 -0.565421 -0.442731 T001 2013-01-10 03:00:00 \n", + "4 -0.343494 -0.295906 -0.413551 -0.289205 T001 2013-01-10 04:00:00 \n", "\n", "[5 rows x 28 columns]" ] }, - "execution_count": 32, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -2044,7 +2170,7 @@ }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 36, "metadata": {}, "outputs": [ { @@ -2053,29 +2179,29 @@ "{'window_size': 24, 'cutoff_time': 'cutoff_time', 'time_index': 'timestamp'}" ] }, - "execution_count": 33, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipeline._pipeline.get_hyperparameters()[\n", - " 'mlprimitives.custom.timeseries_preprocessing.cutoff_window_sequences#1']" + " 'mlstars.custom.timeseries_preprocessing.cutoff_window_sequences#1']" ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 37, "metadata": {}, "outputs": [], "source": [ - "step = 8\n", + "step = 9\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 38, "metadata": {}, "outputs": [ { @@ -2084,7 +2210,7 @@ "dict_keys(['readings', 'turbines', 'turbine_id', 'timestamp', 'X', 'y'])" ] }, - "execution_count": 35, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -2095,16 +2221,16 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(51121, 28)" + "(8521, 28)" ] }, - "execution_count": 36, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -2115,7 +2241,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 40, "metadata": {}, "outputs": [ { @@ -2124,7 +2250,7 @@ "(353,)" ] }, - "execution_count": 37, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2135,7 +2261,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -2144,7 +2270,7 @@ "(353, 24, 26)" ] }, - "execution_count": 38, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } @@ -2155,7 +2281,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 42, "metadata": { "scrolled": false }, @@ -2163,27 +2289,27 @@ { 
"data": { "text/plain": [ - "array([[-0.66002345, -0.57327081, -0.64084507, -0.57796014, -0.6014068 ,\n", - " -0.56103286, -0.55140187, -0.9928135 , -0.99291267, -0.99315058,\n", - " -0.99304288, -0.99346346, -0.99352632, -0.99395333, -0.42553191,\n", - " -0.41772152, -0.58730159, -0.35996294, -0.66896552, -0.57241379,\n", - " -0.61904762, -0.5862069 , -0.60273973, -0.55862069, -0.55862069,\n", - " -0.59751037],\n", - " [-0.2989449 , -0.38569754, -0.48591549, -0.47713951, -0.66705744,\n", - " -0.5915493 , -0.77336449, -0.99278389, -0.9928852 , -0.99312701,\n", - " -0.99301988, -0.9934481 , -0.9935075 , -0.9939459 , -0.39361702,\n", - " -0.40506329, -0.54285714, -0.35992014, -0.40689655, -0.42068966,\n", - " -0.46938776, -0.48965517, -0.67123288, -0.5862069 , -0.83448276,\n", - " -0.5560166 ],\n", - " [-0.33645955, -0.40679953, -0.39906103, -0.38569754, -0.56154748,\n", - " -0.43192488, -0.45560748, -0.99275498, -0.9928584 , -0.99310017,\n", - " -0.99299431, -0.99342739, -0.99348349, -0.99392294, -0.29787234,\n", - " -0.3164557 , -0.49206349, -0.35986854, -0.42068966, -0.43448276,\n", - " -0.40136054, -0.43448276, -0.56164384, -0.47586207, -0.51724138,\n", - " -0.46473029]])" + "array([[-0.58793576, -0.60305643, -0.63981971, -0.61481191, -0.69823875,\n", + " -0.65021543, -0.68912322, -0.99436914, -0.99439755, -0.99454249,\n", + " -0.99446788, -0.99476185, -0.99490997, -0.99529511, -0.34701493,\n", + " -0.33886256, -0.33860532, -0.36301186, -0.57943925, -0.59250585,\n", + " -0.6323185 , -0.60609613, -0.69284877, -0.64444444, -0.68691589,\n", + " -0.63853752],\n", + " [-0.56600078, -0.5846395 , -0.63002156, -0.61559561, -0.70880626,\n", + " -0.66392479, -0.69732474, -0.9942427 , -0.99427986, -0.9944408 ,\n", + " -0.99436498, -0.99468147, -0.99482011, -0.99521249, -0.33955224,\n", + " -0.31516588, -0.38892883, -0.36280656, -0.55841121, -0.57611241,\n", + " -0.62295082, -0.61078546, -0.70222743, -0.65847953, -0.69392523,\n", + " -0.63645815],\n", + " [-0.64081473, -0.64184953, -0.67038997, -0.63597179, -0.71350294,\n", + " -0.65844105, -0.66764304, -0.99412236, -0.99416864, -0.99434228,\n", + " -0.99426059, -0.99459663, -0.99472365, -0.99511795, -0.34328358,\n", + " -0.30094787, -0.36304817, -0.36259859, -0.63317757, -0.6323185 ,\n", + " -0.66042155, -0.62954279, -0.70926143, -0.65380117, -0.66588785,\n", + " -0.66002426]])" ] }, - "execution_count": 39, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2205,22 +2331,21 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 43, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2022-01-18 07:34:41.001707: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", - "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", - "2022-01-18 07:34:41.024991: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fac7ea34260 initialized for platform Host (this does not guarantee that XLA will be used). 
Devices:\n", - "2022-01-18 07:34:41.025038: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" + "2023-04-13 18:22:35.422060: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA\n", + "2023-04-13 18:22:35.434959: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7fbf6980d6b0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", + "2023-04-13 18:22:35.434974: I tensorflow/compiler/xla/service/service.cc:176] StreamExecutor device (0): Host, Default Version\n" ] } ], "source": [ - "step = 9\n", + "step = 10\n", "context = pipeline.fit(**context, output_=step, start_=step)" ] } @@ -2241,7 +2366,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.11" + "version": "3.8.16" } }, "nbformat": 4, From c7ce06771980fe5a5516fc9c9d5e759984b03677 Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Thu, 20 Jul 2023 18:13:12 -0400 Subject: [PATCH 165/171] Update demo bucket (#76) * update bucket * update macos version --- .github/workflows/tests.yml | 2 +- draco/demo.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 17d140f..69cf892 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -70,7 +70,7 @@ jobs: strategy: matrix: python-version: [3.6, 3.7, 3.8] - os: [ubuntu-20.04, macos-10.15] + os: [ubuntu-20.04, macos-latest] steps: - uses: actions/checkout@v1 - name: Set up Python ${{ matrix.python-version }} diff --git a/draco/demo.py b/draco/demo.py index dcd4126..ef91fc5 100644 --- a/draco/demo.py +++ b/draco/demo.py @@ -7,7 +7,7 @@ LOGGER = logging.getLogger(__name__) -S3_URL = 'https://d3-ai-greenguard.s3.amazonaws.com/' +S3_URL = 'https://sintel-draco.s3.amazonaws.com/' DEMO_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'demo') _FILES = { @@ -46,7 +46,7 @@ def load_demo(name='default', load_readings=True): Subsequent calls will load the cached data instead of downloading it again. Args: - rul (str): + name (str): Name of the dataset to load. If "RUL", load NASA's CMAPSS dataset https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/#turbofan. If "default" then load default demo. From 4fcdccf22cb5980f119eba52eb220c34565d772f Mon Sep 17 00:00:00 2001 From: Sarah Alnegheimish <40212131+sarahmish@users.noreply.github.com> Date: Mon, 24 Jul 2023 08:11:59 -0400 Subject: [PATCH 166/171] Release 0.3 (#77) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Bump version: 0.2.1.dev0 → 0.2.1.dev1 * add release notes * update history --- HISTORY.md | 9 +++++++++ draco/__init__.py | 2 +- setup.cfg | 2 +- setup.py | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/HISTORY.md b/HISTORY.md index 539ca0e..a1cef30 100644 --- a/HISTORY.md +++ b/HISTORY.md @@ -1,5 +1,14 @@ # History +## 0.3.0 - 2022-07-20 + +This release switches from ``MLPrimitives`` to ``ml-stars``. +Moreover, we remove all pipelines using deep feature synthesis. 
+
+* Update demo bucket - [Issue #76](https://github.com/sintel-dev/Draco/issues/76) by @sarahmish
+* Remove ``dfs`` based pipelines - [Issue #73](https://github.com/sintel-dev/Draco/issues/73) by @sarahmish
+* Move from ``MLPrimitives`` to ``ml-stars`` - [Issue #72](https://github.com/sintel-dev/Draco/issues/72) by @sarahmish
+
 ## 0.2.0 - 2022-04-12
 
diff --git a/draco/__init__.py b/draco/__init__.py
index e134da2..b54bb16 100644
--- a/draco/__init__.py
+++ b/draco/__init__.py
@@ -4,7 +4,7 @@
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.1.dev0'
+__version__ = '0.2.1.dev1'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index e78faaa..97a6f92 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.1.dev0
+current_version = 0.2.1.dev1
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 9087746..dba269e 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sintel-dev/Draco',
-    version='0.2.1.dev0',
+    version='0.2.1.dev1',
     zip_safe=False,
 )

From 7b00501a8acb66197715b8a9125b54cff98aec99 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 31 Jul 2023 17:34:42 +0300
Subject: =?UTF-8?q?Bump=20version:=200.2.1.dev1=20?=
 =?UTF-8?q?=E2=86=92=200.2.1.dev2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 draco/__init__.py | 2 +-
 setup.cfg         | 2 +-
 setup.py          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/draco/__init__.py b/draco/__init__.py
index b54bb16..2b4fb3b 100644
--- a/draco/__init__.py
+++ b/draco/__init__.py
@@ -4,7 +4,7 @@
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.1.dev1'
+__version__ = '0.2.1.dev2'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index 97a6f92..7c17e7e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.1.dev1
+current_version = 0.2.1.dev2
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index dba269e..a229a7d 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sintel-dev/Draco',
-    version='0.2.1.dev1',
+    version='0.2.1.dev2',
     zip_safe=False,
 )

From 3fa39d7a4103f247ccd76c68f5476c017db017b1 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 31 Jul 2023 18:34:55 +0300
Subject: [PATCH 168/171] prepare release

---
 HISTORY.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/HISTORY.md b/HISTORY.md
index a1cef30..ebd216a 100644
--- a/HISTORY.md
+++ b/HISTORY.md
@@ -1,6 +1,6 @@
 # History
 
-## 0.3.0 - 2022-07-20
+## 0.3.0 - 2022-07-31
 
 This release switches from ``MLPrimitives`` to ``ml-stars``.
 Moreover, we remove all pipelines using deep feature synthesis.
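
Note on the demo loader patched in #76 above: the diff points S3_URL at the new sintel-draco bucket and documents the `name` and `load_readings` arguments. The following is a minimal usage sketch, assuming draco 0.3 or later is installed; the lowercase 'rul' key and the targets-only return value for load_readings=False are inferences from the diff, not verified behavior of the released API.

    # Minimal sketch, assuming draco >= 0.3 with the patched demo module.
    from draco.demo import load_demo

    # Default demo: the first call downloads target_times and readings
    # from the sintel-draco S3 bucket and caches them under draco/demo/;
    # subsequent calls read the cached copies instead.
    target_times, readings = load_demo()

    # NASA CMAPSS (RUL) variant named in the docstring; the exact key
    # casing and the targets-only return shape are assumptions here.
    rul_target_times = load_demo(name='rul', load_readings=False)
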
From 122ee46f9a2376bb4f009f593f2c4d031a5f3bb5 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 31 Jul 2023 18:35:21 +0300
Subject: =?UTF-8?q?Bump=20version:=200.2.1.dev2=20?=
 =?UTF-8?q?=E2=86=92=200.3.0.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 draco/__init__.py | 2 +-
 setup.cfg         | 2 +-
 setup.py          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/draco/__init__.py b/draco/__init__.py
index 2b4fb3b..8d60420 100644
--- a/draco/__init__.py
+++ b/draco/__init__.py
@@ -4,7 +4,7 @@
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.2.1.dev2'
+__version__ = '0.3.0.dev0'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index 7c17e7e..b8b1363 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.2.1.dev2
+current_version = 0.3.0.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index a229a7d..48f9ac6 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sintel-dev/Draco',
-    version='0.2.1.dev2',
+    version='0.3.0.dev0',
     zip_safe=False,
 )

From a574fe24543ee8f9cdb929407fc94432894651a7 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 31 Jul 2023 18:35:22 +0300
Subject: =?UTF-8?q?Bump=20version:=200.3.0.dev0=20?=
 =?UTF-8?q?=E2=86=92=200.3.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 draco/__init__.py | 2 +-
 setup.cfg         | 2 +-
 setup.py          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/draco/__init__.py b/draco/__init__.py
index 8d60420..b042dce 100644
--- a/draco/__init__.py
+++ b/draco/__init__.py
@@ -4,7 +4,7 @@
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.3.0.dev0'
+__version__ = '0.3.0'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index b8b1363..6445146 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0.dev0
+current_version = 0.3.0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py
index 48f9ac6..569e9ae 100644
--- a/setup.py
+++ b/setup.py
@@ -118,6 +118,6 @@
     test_suite='tests',
     tests_require=tests_require,
     url='https://github.com/sintel-dev/Draco',
-    version='0.3.0.dev0',
+    version='0.3.0',
     zip_safe=False,
 )

From 113e14fddb3b31570537aaf011b0e95255511855 Mon Sep 17 00:00:00 2001
From: Sarah Alnegheimish
Date: Mon, 31 Jul 2023 18:36:01 +0300
Subject: =?UTF-8?q?Bump=20version:=200.3.0=20=E2=86=92=200?=
 =?UTF-8?q?.3.1.dev0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 draco/__init__.py | 2 +-
 setup.cfg         | 2 +-
 setup.py          | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/draco/__init__.py b/draco/__init__.py
index b042dce..9b2e05b 100644
--- a/draco/__init__.py
+++ b/draco/__init__.py
@@ -4,7 +4,7 @@
 __author__ = """MIT Data To AI Lab"""
 __email__ = 'dailabmit@gmail.com'
-__version__ = '0.3.0'
+__version__ = '0.3.1.dev0'
 
 import os
 
diff --git a/setup.cfg b/setup.cfg
index 6445146..e595904 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.3.0
+current_version = 0.3.1.dev0
 commit = True
 tag = True
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<candidate>\d+))?
diff --git a/setup.py b/setup.py index 569e9ae..5d475a5 100644 --- a/setup.py +++ b/setup.py @@ -118,6 +118,6 @@ test_suite='tests', tests_require=tests_require, url='https://github.com/sintel-dev/Draco', - version='0.3.0', + version='0.3.1.dev0', zip_safe=False, )
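
The [bumpversion] section that each of these release commits rewrites keeps the same `parse` pattern as a context line in every setup.cfg hunk. Purely as an illustration (this snippet belongs to no patch), the regex can be exercised on the two version shapes that appear in the series:

    # Illustration only: the bumpversion `parse` regex from setup.cfg,
    # applied to the final-release and dev-build version strings above.
    import re

    PARSE = re.compile(
        r'(?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)'
        r'(\.(?P<release>[a-z]+)(?P<candidate>\d+))?'
    )

    for version in ('0.3.0', '0.3.1.dev0'):
        print(version, PARSE.match(version).groupdict())
    # 0.3.0      -> release and candidate are None (a final release)
    # 0.3.1.dev0 -> release='dev', candidate='0' (a development build)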