From 3bd48f9ac68feb088b1243db656aec21bc7e9fb5 Mon Sep 17 00:00:00 2001
From: CristianCuadrado
Date: Thu, 3 Sep 2020 17:52:47 +0100
Subject: [PATCH 1/8] add utils for deepecho models

---
 deepecho/models/utils.py | 232 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 232 insertions(+)
 create mode 100644 deepecho/models/utils.py

diff --git a/deepecho/models/utils.py b/deepecho/models/utils.py
new file mode 100644
index 0000000..fa5fdfd
--- /dev/null
+++ b/deepecho/models/utils.py
@@ -0,0 +1,232 @@
+"""Utils for models."""
+# pylint: disable-all
+
+import numpy as np
+import pandas as pd
+import torch
+
+
+def context_to_tensor_gan(context, context_size, context_map):
+    """Convert the input context to the corresponding tensor."""
+    tensor = torch.zeros(context_size)
+    for column, properties in context_map.items():
+        value = context[column]
+        value_to_tensor(tensor, value, properties)
+
+    return tensor
+
+
+def data_to_tensor_gan(data, model_data_size, data_map, max_sequence_length, fixed_length=None):
+    """Convert the input data to the corresponding tensor.
+
+    If ``self._fixed_length`` is ``False``, add a 1.0 to indicate
+    the sequence end and pad the rest of the sequence with 0.0s.
+    """
+    tensors = []
+    num_rows = len(data[0])
+    for row in range(num_rows):
+        tensor = torch.zeros(model_data_size)
+        for column, properties in data_map.items():
+            value = data[column][row]
+            value_to_tensor(tensor, value, properties)
+
+        tensors.append(tensor)
+
+    if not fixed_length:
+        tensors[-1][-1] = 1.0
+
+    for _ in range(max_sequence_length - num_rows):
+        tensors.append(torch.zeros(model_data_size))
+
+    return torch.stack(tensors, dim=0)
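To make the expected shapes concrete, here is a minimal usage sketch for the two helpers above (an illustration, not part of the patch; the maps are hand-built in the format that ``index_map`` below produces, and it assumes the stray ``@staticmethod`` decorators added further down have been dropped, which PATCH 3/8 does — module-level functions decorated that way are not callable before Python 3.10):

```python
import torch
from deepecho.models.utils import context_to_tensor_gan, data_to_tensor_gan

context_map = {0: {'type': 'continuous', 'min': 0.0, 'max': 10.0, 'indices': (0, 1)}}
data_map = {
    0: {'type': 'continuous', 'min': 1.0, 'max': 3.0, 'indices': (0, 1)},
    1: {'type': 'categorical', 'indices': {'a': 2, 'b': 3}},
}

context = context_to_tensor_gan([5.0], 2, context_map)
print(context)  # tensor([0., 0.]) -> 5.0 maps to 0.0 in [-1, 1], not missing

# 4 mapped dimensions plus 1 for the end-of-sequence flag
data = [[1.0, 2.0, 3.0], ['a', 'b', 'a']]
tensor = data_to_tensor_gan(data, 5, data_map, max_sequence_length=5)
print(tensor.shape)  # torch.Size([5, 5])
# row 2 carries the 1.0 end-of-sequence flag in its last dimension;
# rows 3-4 are zero padding
```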
+
+
+@staticmethod
+def denormalize(tensor, row, properties, round_value, std_norm=False):
+    """Denormalize previously normalized values, setting NaN values if necessary.
+
+    If 'std_norm' is True, denormalize from 0 and 1.
+    If 'std_norm' is False, denormalize from -1 and 1.
+    """
+    value_idx, missing_idx = properties['indices']
+    if tensor[row, 0, missing_idx] > 0.5:
+        return None
+
+    normalized = tensor[row, 0, value_idx].item()
+    column_min = properties['min']
+    column_range = properties['max'] - column_min
+
+    if std_norm:
+        denormalized = (normalized) * column_range + column_min
+    else:
+        denormalized = (normalized + 1) * column_range / 2.0 + column_min
+
+    if round_value:
+        denormalized = round(denormalized)
+
+    return denormalized
+
+
+@staticmethod
+def index_map(columns, types):
+    """Decide which dimension will store which column information in the tensor.
+
+    The output of this function has two elements:
+
+    - An idx_map, which is a dict that indicates the indexes at which
+      the list of tensor dimensions associated with each input column starts,
+      and the properties of such columns.
+    - An integer that indicates how many dimensions the tensor will have.
+
+    In order to decide this, the following process is followed for each column:
+
+    - If the column is numerical (continuous or count), 2 dimensions are created
+      for it. These will contain information about the value itself, as well
+      as information about whether the value should be NaN or not.
+    - If the column is categorical or ordinal, 1 dimension is created for
+      each possible value, which will later be used to hold one-hot encoding
+      information about the values.
+    """
+    dimensions = 0
+    mapping = {}
+    for column, column_type in enumerate(types):
+        values = columns[column]
+        if column_type in ('continuous', 'count'):
+            mapping[column] = {
+                'type': column_type,
+                'min': np.min(values),
+                'max': np.max(values),
+                'indices': (dimensions, dimensions + 1)
+            }
+            dimensions += 2
+
+        elif column_type in ('categorical', 'ordinal'):
+            indices = {}
+            for value in set(values):
+                indices[value] = dimensions
+                dimensions += 1
+
+            mapping[column] = {
+                'type': column_type,
+                'indices': indices
+            }
+
+        else:
+            raise ValueError('Unsupported type: {}'.format(column_type))
+
+    return mapping, dimensions
+
+
+@staticmethod
+def normalize(tensor, value, properties, std_norm=False):
+    """Normalize value and flag nans.
+
+    If 'std_norm' is True, normalize between 0 and 1.
+    If 'std_norm' is False, normalize between -1 and 1.
+    """
+    value_idx, missing_idx = properties['indices']
+    if pd.isnull(value):
+        tensor[value_idx] = 0.0
+        tensor[missing_idx] = 1.0
+    else:
+        column_min = properties['min']
+        column_range = properties['max'] - column_min
+        offset = value - column_min
+
+        if std_norm:
+            tensor[value_idx] = offset / column_range
+        else:
+            tensor[value_idx] = 2.0 * offset / column_range - 1.0
+
+        tensor[missing_idx] = 0.0
+
+    return tensor
+
+
+@staticmethod
+def one_hot_decode(tensor, row, properties):
+    """Obtain the category that corresponds to the highest one-hot value."""
+    max_value = float('-inf')
+    for category, idx in properties['indices'].items():
+        value = tensor[row, 0, idx]
+        if value > max_value:
+            max_value = value
+            selected = category
+
+    return selected
+
+
+@staticmethod
+def one_hot_encode(tensor, value, properties):
+    """Update the index that corresponds to the value to 1.0."""
+    value_index = properties['indices'][value]
+    tensor[value_index] = 1.0
+
+    return tensor
+
+
+def transform(data, data_map):
+    """Transform data."""
+    for properties in data_map.values():
+        column_type = properties['type']
+        if column_type in ('continuous', 'count'):
+            value_idx, missing_idx = properties['indices']
+            data[:, :, value_idx] = torch.tanh(data[:, :, value_idx])
+            data[:, :, missing_idx] = torch.sigmoid(data[:, :, missing_idx])
+        elif column_type in ('categorical', 'ordinal'):
+            indices = list(properties['indices'].values())
+            data[:, :, indices] = torch.nn.functional.gumbel_softmax(
+                data[:, :, indices], hard=True)
+
+    return data
+
+
+def truncate(generated, data_size):
+    """Truncate generated samples."""
+    end_flag = (generated[:, :, data_size] > 0.5).float().round()
+    generated[:, :, data_size] = end_flag
+
+    for sequence_idx in range(generated.shape[1]):
+        # Pad with zeroes after end_flag == 1
+        sequence = generated[:, sequence_idx]
+        end_flag = sequence[:, data_size]
+        if (end_flag == 1.0).any():
+            cut_idx = end_flag.detach().cpu().numpy().argmax()
+            sequence[cut_idx + 1:] = 0.0
+
+
+def tensor_to_data_gan(tensor, data_map):
+    """Rebuild a valid sequence from the given tensor."""
+    sequence_length, num_sequences, _ = tensor.shape
+    assert num_sequences == 1
+
+    data = [None] * len(data_map)
+    for column, properties in data_map.items():
+        column_type = properties['type']
+
+        column_data = []
+        data[column] = column_data
+        for row in range(sequence_length):
+            if column_type in ('continuous', 'count'):
+                round_value = column_type == 'count'
+                value = denormalize(tensor, row, properties, round_value=round_value)
+            elif column_type in ('categorical', 'ordinal'):
+                value = one_hot_decode(tensor, row, properties)
+            else:
+                raise ValueError()  # Theoretically unreachable
+
+            column_data.append(value)
+
+    return data
+
+
+def value_to_tensor(tensor, value, properties):
+    """Update the tensor according to the value and properties."""
+    column_type = properties['type']
+    if column_type in ('continuous', 'count'):
+        normalize(tensor, value, properties)
+    elif column_type in ('categorical', 'ordinal'):
+        one_hot_encode(tensor, value, properties)
+
+    else:
+        raise ValueError()  # Theoretically unreachable
From 3f4a7aa622af0f45a82781841489d0d3de069b83 Mon Sep 17 00:00:00 2001
From: CristianCuadrado
Date: Thu, 3 Sep 2020 17:57:15 +0100
Subject: [PATCH 2/8] fix _transform

---
 deepecho/models/basic_gan.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py
index f91fc0e..4378e10 100644
--- a/deepecho/models/basic_gan.py
+++ b/deepecho/models/basic_gan.py
@@ -406,7 +406,8 @@ def _transform(self, data):
             data[:, :, missing_idx] = torch.sigmoid(data[:, :, missing_idx])
         elif column_type in ('categorical', 'ordinal'):
             indices = list(properties['indices'].values())
-            data[:, :, indices] = torch.nn.functional.softmax(data[:, :, indices])
+            data[:, :, indices] = torch.nn.functional.gumbel_softmax(data[:, :, indices],
+                                                                     hard=True)
 
     return data
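The motivation for this fix, sketched as a small illustration (output values are examples; the Gumbel sample is random): ``softmax`` yields a dense probability vector that is never exactly one-hot, while ``gumbel_softmax(..., hard=True)`` emits a proper one-hot sample while remaining differentiable, which is what the one-hot decoding downstream expects:

```python
import torch

logits = torch.tensor([[1.0, 2.0, 0.5]])

soft = torch.nn.functional.softmax(logits, dim=-1)
print(soft)        # tensor([[0.2312, 0.6285, 0.1402]]) - dense, never one-hot

hard = torch.nn.functional.gumbel_softmax(logits, hard=True)
print(hard)        # e.g. tensor([[0., 1., 0.]]) - a sampled one-hot vector
print(hard.sum())  # tensor(1.)
```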
From fcc81c716cd734a50edee82978d495c9f5f00bfc Mon Sep 17 00:00:00 2001
From: CristianCuadrado
Date: Fri, 4 Sep 2020 12:55:14 +0100
Subject: [PATCH 3/8] add functions to utils

---
 deepecho/models/basic_gan.py |  15 +--
 deepecho/models/utils.py     | 192 ++++++++++++++--------------------
 2 files changed, 76 insertions(+), 131 deletions(-)

diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py
index 4378e10..e494836 100644
--- a/deepecho/models/basic_gan.py
+++ b/deepecho/models/basic_gan.py
@@ -24,7 +24,6 @@ class BasicGenerator(torch.nn.Module):
 
     This generator consists of an RNN layer followed by a Linear layer with the
     following schema:
-
     - The Generator takes as input a ``sequence_length`` and a ``context`` vector.
    - The ``context`` vector is expanded over the ``sequence_length`` and padded
      with ``latent_size`` random noise.
    - ``context + noise`` is passed to the RNN, which
      generates an output of shape ``(sequence_length, context_length, hidden_size)``.
    - The RNN output is passed to the Linear layer that outputs a tensor of size
      ``(sequence_length, context_length, output_size)``
-
     Args:
         context_size (int):
             Size of the contextual arrays.
@@ -78,14 +76,12 @@ class BasicDiscriminator(torch.nn.Module):
 
     This discriminator consists of an RNN layer followed by a Linear layer with the
     following schema:
-
     - The Discriminator takes as input a collection of sequences that include
       both the data and the context columns.
    - RNN takes as input a tensor with shape ``(sequence_length,
      number_of_sequences, context_size + data_size)`` and generates an output
      of shape ``(sequence_length, num_sequences, hidden_size)``.
    - The RNN output is passed to the Linear layer that outputs a single value.
-
     Args:
         context_size (int):
             Number of values in the contextual arrays.
@@ -121,7 +117,6 @@ class BasicGANModel(DeepEcho):
        - apply sigmoid to continuous/count/datetime
        - apply softmax to categorical/ordinal
     4. Define a discriminator that takes sequence + context -> score.
-
     Args:
         epochs (int):
             Number of training epochs. Defaults to 1024.
@@ -196,14 +191,11 @@ def _index_map(columns, types):
         """Decide which dimension will store which column information in the tensor.
 
         The output of this function has two elements:
-
        - An idx_map, which is a dict that indicates the indexes at which
          the list of tensor dimensions associated with each input column starts,
          and the properties of such columns.
        - An integer that indicates how many dimensions the tensor will have.
-
        In order to decide this, the following process is followed for each column:
-
        - If the column is numerical (continuous or count), 2 dimensions are created
          for it. These will contain information about the value itself, as well
          as information about whether the value should be NaN or not.
        - If the column is categorical or ordinal, 1 dimension is created for
          each possible value, which will later be used to hold one-hot encoding
          information about the values.
         """
@@ -406,8 +398,7 @@ def _transform(self, data):
             data[:, :, missing_idx] = torch.sigmoid(data[:, :, missing_idx])
         elif column_type in ('categorical', 'ordinal'):
             indices = list(properties['indices'].values())
-            data[:, :, indices] = torch.nn.functional.gumbel_softmax(data[:, :, indices],
-                                                                     hard=True)
+            data[:, :, indices] = torch.nn.functional.softmax(data[:, :, indices])
 
     return data
@@ -492,7 +483,6 @@ def fit_sequences(self, sequences, context_types, data_types):
             List of sequences. Each sequence is a single training example
             (i.e. an example of a multivariate time series with some context).
             For example, a sequence might look something like::
-
                 {
                     "context": [1],
                     "data": [
                         [1, 3, 4, 5, 11, 3, 4],
                         [2, 2, 3, 4, 5, 1, 2],
                         [1, 3, 4, 5, 2, 3, 1]
                     ]
                 }
@@ -501,11 +491,9 @@ def fit_sequences(self, sequences, context_types, data_types):
-
             The "context" attribute maps to a list of variables which
             should be used for conditioning. These are variables which
             do not change over time.
-
             The "data" attribute contains a list of lists corresponding
             to the actual time series data such that `data[i][j]` contains
             the value at the jth time step of the ith channel of the
@@ -559,7 +547,6 @@ def sample_sequence(self, context, sequence_length=None):
             context (list):
                 The list of values to condition on. It must match
                 the types specified in context_types when fit was called.
-
         Returns:
             list[list]:
                 A list of lists (data) corresponding to the types specified
diff --git a/deepecho/models/utils.py b/deepecho/models/utils.py
index fa5fdfd..8da71f9 100644
--- a/deepecho/models/utils.py
+++ b/deepecho/models/utils.py
@@ -6,68 +6,6 @@
 import torch
 
 
-def context_to_tensor_gan(context, context_size, context_map):
-    """Convert the input context to the corresponding tensor."""
-    tensor = torch.zeros(context_size)
-    for column, properties in context_map.items():
-        value = context[column]
-        value_to_tensor(tensor, value, properties)
-
-    return tensor
-
-
-def data_to_tensor_gan(data, model_data_size, data_map, max_sequence_length, fixed_length=None):
-    """Convert the input data to the corresponding tensor.
-
-    If ``self._fixed_length`` is ``False``, add a 1.0 to indicate
-    the sequence end and pad the rest of the sequence with 0.0s.
-    """
-    tensors = []
-    num_rows = len(data[0])
-    for row in range(num_rows):
-        tensor = torch.zeros(model_data_size)
-        for column, properties in data_map.items():
-            value = data[column][row]
-            value_to_tensor(tensor, value, properties)
-
-        tensors.append(tensor)
-
-    if not fixed_length:
-        tensors[-1][-1] = 1.0
-
-    for _ in range(max_sequence_length - num_rows):
-        tensors.append(torch.zeros(model_data_size))
-
-    return torch.stack(tensors, dim=0)
-
-
-@staticmethod
-def denormalize(tensor, row, properties, round_value, std_norm=False):
-    """Denormalize previously normalized values, setting NaN values if necessary.
-
-    If 'std_norm' is True, denormalize from 0 and 1.
-    If 'std_norm' is False, denormalize from -1 and 1.
-    """
-    value_idx, missing_idx = properties['indices']
-    if tensor[row, 0, missing_idx] > 0.5:
-        return None
-
-    normalized = tensor[row, 0, value_idx].item()
-    column_min = properties['min']
-    column_range = properties['max'] - column_min
-
-    if std_norm:
-        denormalized = (normalized) * column_range + column_min
-    else:
-        denormalized = (normalized + 1) * column_range / 2.0 + column_min
-
-    if round_value:
-        denormalized = round(denormalized)
-
-    return denormalized
-
-
-@staticmethod
 def index_map(columns, types):
     """Decide which dimension will store which column information in the tensor.
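For reference, the two conventions that the removed ``std_norm`` flag used to toggle, written out as plain arithmetic (only the [-1, 1] variant survives from here on):

```python
value, vmin, vmax = 7.0, 2.0, 12.0
rng = vmax - vmin
offset = value - vmin

n01 = offset / rng              # std_norm=True:  0.5, in [0, 1]
n11 = 2.0 * offset / rng - 1.0  # std_norm=False: 0.0, in [-1, 1]

# The matching denormalizations recover the original value
assert n01 * rng + vmin == value
assert (n11 + 1) * rng / 2.0 + vmin == value
```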
- """ - value_idx, missing_idx = properties['indices'] - if tensor[row, 0, missing_idx] > 0.5: - return None - - normalized = tensor[row, 0, value_idx].item() - column_min = properties['min'] - column_range = properties['max'] - column_min - - if std_norm: - denormalized = (normalized) * column_range + column_min - else: - denormalized = (normalized + 1) * column_range / 2.0 + column_min - - if round_value: - denormalized = round(denormalized) - - return denormalized - - -@staticmethod def index_map(columns, types): """Decide which dimension will store which column information in the tensor. @@ -117,12 +55,10 @@ def index_map(columns, types): return mapping, dimensions -@staticmethod -def normalize(tensor, value, properties, std_norm=False): +def normalize(tensor, value, properties): """Normalize value and flag nans. - If 'std_norm' is True, normalize between 0 and 1. - If 'std_norm' is False, normalize between -1 and 1. + Normalize between -1 and 1. """ value_idx, missing_idx = properties['indices'] if pd.isnull(value): @@ -133,17 +69,34 @@ def normalize(tensor, value, properties, std_norm=False): column_range = properties['max'] - column_min offset = value - column_min - if std_norm: - tensor[value_idx] = offset / column_range - else: - tensor[value_idx] = 2.0 * offset / column_range - 1.0 - + tensor[value_idx] = 2.0 * offset / column_range - 1.0 tensor[missing_idx] = 0.0 - return tensor + +def denormalize(tensor, row, properties, round_value): + """Denormalize previously normalized values, setting NaN values if necessary.""" + value_idx, missing_idx = properties['indices'] + if tensor[row, 0, missing_idx] > 0.5: + return None + + normalized = tensor[row, 0, value_idx].item() + column_min = properties['min'] + column_range = properties['max'] - column_min + + denormalized = (normalized + 1) * column_range / 2.0 + column_min + + if round_value: + denormalized = round(denormalized) + + return denormalized + + +def one_hot_encode(tensor, value, properties): + """Update the index that corresponds to the value to 1.0.""" + value_index = properties['indices'][value] + tensor[value_index] = 1.0 -@staticmethod def one_hot_decode(tensor, row, properties): """Obtain the category that corresponds to the highest one-hot value.""" max_value = float('-inf') @@ -156,46 +109,63 @@ def one_hot_decode(tensor, row, properties): return selected -@staticmethod -def one_hot_encode(tensor, value, properties): - """Update the index that corresponds to the value to 1.0.""" - value_index = properties['indices'][value] - tensor[value_index] = 1.0 +def build_tensor(transform, sequences, key, dim, device): + """Convert input sequences to tensors.""" + tensors = [] + for sequence in sequences: + tensors.append(transform(sequence[key])) - return tensor + return torch.stack(tensors, dim=dim).to(device) -def transform(data, data_map): - """Transform data.""" - for properties in data_map.values(): - column_type = properties['type'] - if column_type in ('continuous', 'count'): - value_idx, missing_idx = properties['indices'] - data[:, :, value_idx] = torch.tanh(data[:, :, value_idx]) - data[:, :, missing_idx] = torch.sigmoid(data[:, :, missing_idx]) - elif column_type in ('categorical', 'ordinal'): - indices = list(properties['indices'].values()) - data[:, :, indices] = torch.nn.functional.gumbel_softmax( - data[:, :, indices], hard=True) +def value_to_tensor(tensor, value, properties): + """Update the tensor according to the value and properties.""" + column_type = properties['type'] + if column_type in ('continuous', 
+
+
+def data_to_tensor(data, model_data_size, data_map, fixed_length, max_sequence_length):
+    """Convert the input data to the corresponding tensor.
+
+    If ``self._fixed_length`` is ``False``, add a 1.0 to indicate
+    the sequence end and pad the rest of the sequence with 0.0s.
+    """
+    tensors = []
+    num_rows = len(data[0])
+    for row in range(num_rows):
+        tensor = torch.zeros(model_data_size)
+        for column, properties in data_map.items():
+            value = data[column][row]
+            value_to_tensor(tensor, value, properties)
+
+        tensors.append(tensor)
+
+    if not fixed_length:
+        tensors[-1][-1] = 1.0
+
+    for _ in range(max_sequence_length - num_rows):
+        tensors.append(torch.zeros(model_data_size))
+
+    return torch.stack(tensors, dim=0)
+
+
+def context_to_tensor(context, context_size, context_map):
+    """Convert the input context to the corresponding tensor."""
+    tensor = torch.zeros(context_size)
+    for column, properties in context_map.items():
+        value = context[column]
+        value_to_tensor(tensor, value, properties)
+
+    return tensor
+
+
-def tensor_to_data_gan(tensor, data_map):
+def tensor_to_data(tensor, data_map):
     """Rebuild a valid sequence from the given tensor."""
     sequence_length, num_sequences, _ = tensor.shape
     assert num_sequences == 1
@@ -213,20 +183,8 @@ def tensor_to_data(tensor, data_map):
             elif column_type in ('categorical', 'ordinal'):
                 value = one_hot_decode(tensor, row, properties)
             else:
-                raise ValueError() # Theoretically unreachable
+                raise ValueError()  # Theoretically unreachable
 
             column_data.append(value)
 
     return data
-
-
-def value_to_tensor(tensor, value, properties):
-    """Update the tensor according to the value and properties."""
-    column_type = properties['type']
-    if column_type in ('continuous', 'count'):
-        normalize(tensor, value, properties)
-    elif column_type in ('categorical', 'ordinal'):
-        one_hot_encode(tensor, value, properties)
-
-    else:
-        raise ValueError()  # Theoretically unreachable
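With this patch applied, the module-level helpers compose into a full encode/decode round trip. A small sketch of the intended flow (illustrative values):

```python
import torch
from deepecho.models.utils import data_to_tensor, index_map, tensor_to_data

data = [[1.0, 2.0, 3.0], ['a', 'b', 'a']]
data_map, data_size = index_map(data, ['continuous', 'categorical'])

# One extra dimension holds the end-of-sequence flag
tensor = data_to_tensor(data, data_size + 1, data_map,
                        fixed_length=False, max_sequence_length=5)
assert tensor.shape == (5, data_size + 1)

# tensor_to_data expects shape (sequence_length, 1, dimensions)
recovered = tensor_to_data(tensor.unsqueeze(1), data_map)
# recovered[0][:3] is approximately [1.0, 2.0, 3.0]
# recovered[1][:3] == ['a', 'b', 'a']; rows beyond the flag are padding noise
```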
From 1a2991e6265078e9fc2c4da79eb9573c5c4e801a Mon Sep 17 00:00:00 2001
From: CristianCuadrado
Date: Fri, 4 Sep 2020 15:27:15 +0100
Subject: [PATCH 4/8] add docstrings to utils, modify basic_gan

---
 deepecho/models/basic_gan.py | 200 ++++-------------------------------
 deepecho/models/utils.py     | 127 +++++++++++++++++++---
 2 files changed, 137 insertions(+), 190 deletions(-)

diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py
index e494836..8a60102 100644
--- a/deepecho/models/basic_gan.py
+++ b/deepecho/models/basic_gan.py
@@ -3,11 +3,12 @@
 import logging
 
 import numpy as np
-import pandas as pd
 import torch
 from tqdm import tqdm
 
 from deepecho.models.base import DeepEcho
+from deepecho.models.utils import (
+    build_tensor, context_to_tensor, data_to_tensor, index_map, tensor_to_data)
 
 LOGGER = logging.getLogger(__name__)
@@ -24,6 +25,7 @@ class BasicGenerator(torch.nn.Module):
 
     This generator consists of an RNN layer followed by a Linear layer with the
     following schema:
+
     - The Generator takes as input a ``sequence_length`` and a ``context`` vector.
    - The ``context`` vector is expanded over the ``sequence_length`` and padded
      with ``latent_size`` random noise.
    - ``context + noise`` is passed to the RNN, which
      generates an output of shape ``(sequence_length, context_length, hidden_size)``.
    - The RNN output is passed to the Linear layer that outputs a tensor of size
      ``(sequence_length, context_length, output_size)``
+
     Args:
         context_size (int):
             Size of the contextual arrays.
@@ -78,12 +79,14 @@ class BasicDiscriminator(torch.nn.Module):
 
     This discriminator consists of an RNN layer followed by a Linear layer with the
     following schema:
+
     - The Discriminator takes as input a collection of sequences that include
       both the data and the context columns.
    - RNN takes as input a tensor with shape ``(sequence_length,
      number_of_sequences, context_size + data_size)`` and generates an output
      of shape ``(sequence_length, num_sequences, hidden_size)``.
    - The RNN output is passed to the Linear layer that outputs a single value.
+
     Args:
         context_size (int):
             Number of values in the contextual arrays.
@@ -117,6 +122,7 @@ class BasicGANModel(DeepEcho):
        - apply sigmoid to continuous/count/datetime
        - apply softmax to categorical/ordinal
     4. Define a discriminator that takes sequence + context -> score.
+
     Args:
         epochs (int):
             Number of training epochs. Defaults to 1024.
@@ -186,52 +192,6 @@ def __repr__(self):
     # Preprocessing and preparing #
     # ########################### #
 
-    @staticmethod
-    def _index_map(columns, types):
-        """Decide which dimension will store which column information in the tensor.
-
-        The output of this function has two elements:
-        - An idx_map, which is a dict that indicates the indexes at which
-          the list of tensor dimensions associated with each input column starts,
-          and the properties of such columns.
-        - An integer that indicates how many dimensions the tensor will have.
-        In order to decide this, the following process is followed for each column:
-        - If the column is numerical (continuous or count), 2 dimensions are created
-          for it. These will contain information about the value itself, as well
-          as information about whether the value should be NaN or not.
-        - If the column is categorical or ordinal, 1 dimension is created for
-          each possible value, which will later be used to hold one-hot encoding
-          information about the values.
-        """
-        dimensions = 0
-        mapping = {}
-        for column, column_type in enumerate(types):
-            values = columns[column]
-            if column_type in ('continuous', 'count'):
-                mapping[column] = {
-                    'type': column_type,
-                    'min': np.min(values),
-                    'max': np.max(values),
-                    'indices': (dimensions, dimensions + 1)
-                }
-                dimensions += 2
-
-            elif column_type in ('categorical', 'ordinal'):
-                indices = {}
-                for value in set(values):
-                    indices[value] = dimensions
-                    dimensions += 1
-
-                mapping[column] = {
-                    'type': column_type,
-                    'indices': indices
-                }
-
-            else:
-                raise ValueError('Unsupported type: {}'.format(column_type))
-
-        return mapping, dimensions
-
     def _analyze_data(self, sequences, context_types, data_types):
         """Extract information about the context and data that will be used later.
 
@@ -249,142 +209,17 @@ def _analyze_data(self, sequences, context_types, data_types):
         for column in range(len(context_types)):
             context.append([sequence['context'][column] for sequence in sequences])
 
-        self._context_map, self._context_size = self._index_map(context, context_types)
+        self._context_map, self._context_size = index_map(context, context_types)
 
         # Concatenate all the data sequences together
         data = []
         for column in range(len(data_types)):
             data.append(sum([sequence['data'][column] for sequence in sequences], []))
 
-        self._data_map, self._data_size = self._index_map(data, data_types)
+        self._data_map, self._data_size = index_map(data, data_types)
 
         self._model_data_size = self._data_size + int(not self._fixed_length)
 
-    @staticmethod
-    def _normalize(tensor, value, properties):
-        """Normalize the value between 0 and 1 and flag nans."""
-        value_idx, missing_idx = properties['indices']
-        if pd.isnull(value):
-            tensor[value_idx] = 0.0
-            tensor[missing_idx] = 1.0
-        else:
-            column_min = properties['min']
-            column_range = properties['max'] - column_min
-            offset = value - column_min
-            tensor[value_idx] = 2.0 * offset / column_range - 1.0
-            tensor[missing_idx] = 0.0
-
-    @staticmethod
-    def _denormalize(tensor, row, properties, round_value):
-        """Denormalize previously normalized values, setting NaN values if necessary."""
-        value_idx, missing_idx = properties['indices']
-        if tensor[row, 0, missing_idx] > 0.5:
-            return None
-
-        normalized = tensor[row, 0, value_idx].item()
-        column_min = properties['min']
-        column_range = properties['max'] - column_min
-
-        denormalized = (normalized + 1) * column_range / 2.0 + column_min
-        if round_value:
-            denormalized = round(denormalized)
-
-        return denormalized
-
-    @staticmethod
-    def _one_hot_encode(tensor, value, properties):
-        """Update the index that corresponds to the value to 1.0."""
-        value_index = properties['indices'][value]
-        tensor[value_index] = 1.0
-
-    @staticmethod
-    def _one_hot_decode(tensor, row, properties):
-        """Obtain the category that corresponds to the highest one-hot value."""
-        max_value = float('-inf')
-        for category, idx in properties['indices'].items():
-            value = tensor[row, 0, idx]
-            if value > max_value:
-                max_value = value
-                selected = category
-
-        return selected
-
-    def _value_to_tensor(self, tensor, value, properties):
-        """Update the tensor according to the value and properties."""
-        column_type = properties['type']
-        if column_type in ('continuous', 'count'):
-            self._normalize(tensor, value, properties)
-        elif column_type in ('categorical', 'ordinal'):
-            self._one_hot_encode(tensor, value, properties)
-
-        else:
-            raise ValueError()  # Theoretically unreachable
-
-    def _data_to_tensor(self, data):
-        """Convert the input data to the corresponding tensor.
-
-        If ``self._fixed_length`` is ``False``, add a 1.0 to indicate
-        the sequence end and pad the rest of the sequence with 0.0s.
-        """
-        tensors = []
-        num_rows = len(data[0])
-        for row in range(num_rows):
-            tensor = torch.zeros(self._model_data_size)
-            for column, properties in self._data_map.items():
-                value = data[column][row]
-                self._value_to_tensor(tensor, value, properties)
-
-            tensors.append(tensor)
-
-        if not self._fixed_length:
-            tensors[-1][-1] = 1.0
-
-        for _ in range(self._max_sequence_length - num_rows):
-            tensors.append(torch.zeros(self._model_data_size))
-
-        return torch.stack(tensors, dim=0)
-
-    def _context_to_tensor(self, context):
-        """Convert the input context to the corresponding tensor."""
-        tensor = torch.zeros(self._context_size)
-        for column, properties in self._context_map.items():
-            value = context[column]
-            self._value_to_tensor(tensor, value, properties)
-
-        return tensor
-
-    def _tensor_to_data(self, tensor):
-        """Rebuild a valid sequence from the given tensor."""
-        sequence_length, num_sequences, _ = tensor.shape
-        assert num_sequences == 1
-
-        data = [None] * len(self._data_map)
-        for column, properties in self._data_map.items():
-            column_type = properties['type']
-
-            column_data = []
-            data[column] = column_data
-            for row in range(sequence_length):
-                if column_type in ('continuous', 'count'):
-                    round_value = column_type == 'count'
-                    value = self._denormalize(tensor, row, properties, round_value=round_value)
-                elif column_type in ('categorical', 'ordinal'):
-                    value = self._one_hot_decode(tensor, row, properties)
-                else:
-                    raise ValueError()  # Theoretically unreachable
-
-                column_data.append(value)
-
-        return data
-
-    def _build_tensor(self, transform, sequences, key, dim):
-        """Convert input sequences to tensors."""
-        tensors = []
-        for sequence in sequences:
-            tensors.append(transform(sequence[key]))
-
-        return torch.stack(tensors, dim=dim).to(self._device)
-
     # ################## #
     # GAN Training steps #
     # ################## #
@@ -483,7 +318,9 @@ def fit_sequences(self, sequences, context_types, data_types):
             List of sequences. Each sequence is a single training example
             (i.e. an example of a multivariate time series with some context).
             For example, a sequence might look something like::
+
                 {
                     "context": [1],
                     "data": [
                         [1, 3, 4, 5, 11, 3, 4],
                         [2, 2, 3, 4, 5, 1, 2],
                         [1, 3, 4, 5, 2, 3, 1]
                     ]
                 }
+
             The "context" attribute maps to a list of variables which
             should be used for conditioning. These are variables which
             do not change over time.
+
             The "data" attribute contains a list of lists corresponding
             to the actual time series data such that `data[i][j]` contains
             the value at the jth time step of the ith channel of the
@@ -510,8 +348,12 @@ def fit_sequences(self, sequences, context_types, data_types):
         """
         self._analyze_data(sequences, context_types, data_types)
 
-        data = self._build_tensor(self._data_to_tensor, sequences, 'data', dim=1)
-        context = self._build_tensor(self._context_to_tensor, sequences, 'context', dim=0)
+        data = build_tensor(data_to_tensor, sequences, 'data', dim=1, device=self._device,
+                            model_data_size=self._model_data_size, data_map=self._data_map,
+                            fixed_length=self._fixed_length,
+                            max_sequence_length=self._max_sequence_length)
+        context = build_tensor(context_to_tensor, sequences, 'context', dim=0, device=self._device,
+                               context_size=self._context_size, context_map=self._context_map)
         data_context = _expand_context(data, context)
 
         discriminator, generator_opt, discriminator_opt = self._build_fit_artifacts()
@@ -547,12 +389,14 @@ def sample_sequence(self, context, sequence_length=None):
             context (list):
                 The list of values to condition on. It must match
                 the types specified in context_types when fit was called.
+
         Returns:
             list[list]:
                 A list of lists (data) corresponding to the types specified
                 in data_types when fit was called.
         """
-        context = self._context_to_tensor(context).unsqueeze(0).to(self._device)
+        context = context_to_tensor(context, self._context_size, self._context_map)\
+            .unsqueeze(0).to(self._device)
 
         with torch.no_grad():
             generated = self._generate(context, sequence_length)
@@ -562,4 +406,4 @@ def sample_sequence(self, context, sequence_length=None):
             cut_index = end_flag.cpu().numpy().argmax()
             generated = generated[:cut_index, :, :]
 
-        return self._tensor_to_data(generated)
+        return tensor_to_data(generated, self._data_map)
diff --git a/deepecho/models/utils.py b/deepecho/models/utils.py
index 8da71f9..3bbc773 100644
--- a/deepecho/models/utils.py
+++ b/deepecho/models/utils.py
@@ -56,9 +56,15 @@ def index_map(columns, types):
 
 
 def normalize(tensor, value, properties):
-    """Normalize value and flag nans.
-
-    Normalize between -1 and 1.
+    """Normalize value and flag nans. Normalized values are between -1 and 1.
+
+    Args:
+        tensor (array):
+            Vector to store normalize values and recording null values position.
+        value (float):
+            Value to normalize.
+        properties (dict):
+            Contains information related to the value category.
     """
     value_idx, missing_idx = properties['indices']
     if pd.isnull(value):
@@ -74,7 +80,22 @@ def normalize(tensor, value, properties):
 
 
 def denormalize(tensor, row, properties, round_value):
-    """Denormalize previously normalized values, setting NaN values if necessary."""
+    """Denormalize previously normalized values, setting NaN values if necessary.
+
+    Args:
+        tensor (array):
+            3D Vector that contains different samples with normalized values
+            and record of null values.
+        row (int):
+            Sample to denormalize
+        properties (dict):
+            Contains information related to the value category.
+        round_value(boolean):
+            Apply round to the denormalized value or not.
+    Return:
+        denormalized(float)
+            Return the denormalized value.
+    """
     value_idx, missing_idx = properties['indices']
     if tensor[row, 0, missing_idx] > 0.5:
         return None
@@ -117,12 +138,34 @@ def denormalize(tensor, row, properties, round_value):
 
 
 def one_hot_encode(tensor, value, properties):
-    """Update the index that corresponds to the value to 1.0."""
+    """Update the index that corresponds to the value to 1.0.
+
+    Args:
+        tensor (array):
+            Vector to store one hot encoding.
+        value (int):
+            Categorical variable key
+        properties (dict):
+            Contains information related to the value category.
+    """
     value_index = properties['indices'][value]
     tensor[value_index] = 1.0
 
 
 def one_hot_decode(tensor, row, properties):
-    """Obtain the category that corresponds to the highest one-hot value."""
+    """Obtain the category that corresponds to the highest one-hot value.
+
+    Args:
+        tensor (array):
+            Vector that store one hot encoding for different samples.
+        row (int):
+            Indicates the sample.
+        properties (dict):
+            Contains information related to the value category.
+    Returns:
+        selected(int):
+            Category selected.
+    """
     max_value = float('-inf')
     for category, idx in properties['indices'].items():
         value = tensor[row, 0, idx]
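A quick round trip through the two helpers documented above (sketch; note that ``one_hot_decode`` indexes the tensor as ``[row, 0, idx]``, so it expects a 3D tensor with a singleton middle dimension):

```python
import torch
from deepecho.models.utils import one_hot_decode, one_hot_encode

properties = {'indices': {'low': 0, 'mid': 1, 'high': 2}}

tensor = torch.zeros(3)
one_hot_encode(tensor, 'mid', properties)
print(tensor)  # tensor([0., 1., 0.])

stacked = tensor.reshape(1, 1, 3)
print(one_hot_decode(stacked, 0, properties))  # 'mid'
```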
@@ -151,17 +194,43 @@ def one_hot_decode(tensor, row, properties):
 
 
 def build_tensor(transform, sequences, key, dim, device, **transform_kwargs):
-    """Convert input sequences to tensors."""
+    """Convert input sequences to tensors.
+
+    Args:
+        transform (function):
+            Function to apply.
+        sequences (dict):
+            Contains data samples.
+        key (str):
+            Indicates with information pass to the function from variable 'sequence'.
+        dim(int)
+            Dimension to insert.
+        device(torch.device)
+            Indicate available device.
+        **transform_kwargs(dict)
+            Contains input variables for the function passed by 'transform'.
+    Returns:
+        3D torch vector, with all samples concatenated.
+    """
     tensors = []
     for sequence in sequences:
         tensors.append(transform(sequence[key], **transform_kwargs))
@@ -163,7 +223,16 @@ def build_tensor(transform, sequences, key, dim, device, **transform_kwargs):
 
 
 def value_to_tensor(tensor, value, properties):
-    """Update the tensor according to the value and properties."""
+    """Update the tensor according to the value and properties.
+
+    Args:
+        tensor (array):
+            Vector to store the values and recording null values position.
+        value (float):
+            Value to normalize.
+        properties (dict):
+            Contains information related to the value category.
+    """
     column_type = properties['type']
     if column_type in ('continuous', 'count'):
         normalize(tensor, value, properties)
@@ -175,6 +244,20 @@ def data_to_tensor(data, model_data_size, data_map, fixed_length, max_sequence_length):
     """Convert the input data to the corresponding tensor.
 
     If ``self._fixed_length`` is ``False``, add a 1.0 to indicate
     the sequence end and pad the rest of the sequence with 0.0s.
+
+    Args:
+        data (list):
+            List of arrays of input data.
+        model_data_size(int):
+            Dimension of tensors.
+        data_map (dict):
+            Contains information related to the value category.
+        fixed_length(Boolean):
+            Define samples length.
+        max_sequence_length():
+            Define the length of the biggest sequence.
+    Return:
+        2D torch vector, with all samples concatenated.
     """
     tensors = []
     num_rows = len(data[0])
@@ -203,7 +286,19 @@ def context_to_tensor(context, context_size, context_map):
 
 
 def context_to_tensor(context, context_size, context_map):
-    """Convert the input context to the corresponding tensor."""
+    """Convert the input context to the corresponding tensor.
+
+    Args:
+        context (array):
+            Context context information.
+        context_size(int):
+            Define 'tensor' size.
+        context_map (dict):
+            Contains information related to the value category.
+    Return:
+        tensor(torch tensor):
+            3D array, contains the concatenated samples
+    """
     tensor = torch.zeros(context_size)
     for column, properties in context_map.items():
         value = context[column]
@@ -215,7 +311,16 @@ def tensor_to_data(tensor, data_map):
 
 
 def tensor_to_data(tensor, data_map):
-    """Rebuild a valid sequence from the given tensor."""
+    """Rebuild a valid sequence from the given tensor.
+
+    Args:
+        tensor (list):
+            List of arrays of input data.
+        data_map(int):
+            Dimension of tensors.
+    Return:
+        data
+    """
     sequence_length, num_sequences, _ = tensor.shape
     assert num_sequences == 1
From 3dd7fa583d12ff0760f3d49a6d4ed3a7cfb64d03 Mon Sep 17 00:00:00 2001
From: CristianCuadrado
Date: Fri, 4 Sep 2020 15:32:33 +0100
Subject: [PATCH 5/8] modify utils

---
 deepecho/models/utils.py | 52 ++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/deepecho/models/utils.py b/deepecho/models/utils.py
index 3bbc773..b43e2f2 100644
--- a/deepecho/models/utils.py
+++ b/deepecho/models/utils.py
@@ -151,32 +151,6 @@ def one_hot_decode(tensor, row, properties):
     return selected
 
 
-def build_tensor(transform, sequences, key, dim, device, **transform_kwargs):
-    """Convert input sequences to tensors.
-
-    Args:
-        transform (function):
-            Function to apply.
-        sequences (dict):
-            Contains data samples.
-        key (str):
-            Indicates with information pass to the function from variable 'sequence'.
-        dim(int)
-            Dimension to insert.
-        device(torch.device)
-            Indicate available device.
-        **transform_kwargs(dict)
-            Contains input variables for the function passed by 'transform'.
-    Returns:
-        3D torch vector, with all samples concatenated.
-    """
-    tensors = []
-    for sequence in sequences:
-        tensors.append(transform(sequence[key], **transform_kwargs))
-
-    return torch.stack(tensors, dim=dim).to(device)
-
-
 def value_to_tensor(tensor, value, properties):
     """Update the tensor according to the value and properties.
 
@@ -291,3 +265,29 @@ def tensor_to_data(tensor, data_map):
         column_data.append(value)
 
     return data
+
+
+def build_tensor(transform, sequences, key, dim, device, **transform_kwargs):
+    """Convert input sequences to tensors.
+
+    Args:
+        transform (function):
+            Function to apply.
+        sequences (dict):
+            Contains data samples.
+        key (str):
+            Indicates with information pass to the function from variable 'sequence'.
+        dim(int)
+            Dimension to insert.
+        device(torch.device)
+            Indicate available device.
+        **transform_kwargs(dict)
+            Contains input variables for the function passed by 'transform'.
+    Returns:
+        3D torch vector, with all samples concatenated.
+    """
+    tensors = []
+    for sequence in sequences:
+        tensors.append(transform(sequence[key], **transform_kwargs))
+
+    return torch.stack(tensors, dim=dim).to(device)

From 1c350e38f569500c10ddc047b012f37e62a7b150 Mon Sep 17 00:00:00 2001
From: CristianCuadrado
Date: Mon, 7 Sep 2020 11:24:34 +0100
Subject: [PATCH 6/8] utils and basic_gan last improvements

---
 deepecho/models/basic_gan.py | 32 +++++++++++++++------
 deepecho/models/utils.py     | 54 ++++++++++++++++++++++++------------
 2 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py
index 8a60102..b37b0b3 100644
--- a/deepecho/models/basic_gan.py
+++ b/deepecho/models/basic_gan.py
@@ -348,12 +348,27 @@ def fit_sequences(self, sequences, context_types, data_types):
         """
         self._analyze_data(sequences, context_types, data_types)
 
-        data = build_tensor(data_to_tensor, sequences, 'data', dim=1, device=self._device,
-                            model_data_size=self._model_data_size, data_map=self._data_map,
-                            fixed_length=self._fixed_length,
-                            max_sequence_length=self._max_sequence_length)
-        context = build_tensor(context_to_tensor, sequences, 'context', dim=0, device=self._device,
-                               context_size=self._context_size, context_map=self._context_map)
+        data = build_tensor(
+            transform=data_to_tensor,
+            sequences=sequences,
+            key='data',
+            dim=1,
+            device=self._device,
+            model_data_size=self._model_data_size,
+            data_map=self._data_map,
+            fixed_length=self._fixed_length,
+            max_sequence_length=self._max_sequence_length
+        )
+
+        context = build_tensor(
+            transform=context_to_tensor,
+            sequences=sequences,
+            key='context',
+            dim=0,
+            context_size=self._context_size,
+            context_map=self._context_map
+        ).to(self._device)
+
         data_context = _expand_context(data, context)
 
         discriminator, generator_opt, discriminator_opt = self._build_fit_artifacts()
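For orientation, the public flow that these internals serve, as a hedged end-to-end sketch (a tiny ``epochs`` value is used only to keep the illustration cheap; the argument names follow the docstrings shown in this series):

```python
from deepecho.models.basic_gan import BasicGANModel

model = BasicGANModel(epochs=16)
sequences = [
    {'context': [1], 'data': [[1, 3, 4, 5, 1], [2, 2, 3, 4, 2]]},
    {'context': [2], 'data': [[2, 4, 5, 6, 2], [3, 3, 4, 5, 3]]},
]
model.fit_sequences(sequences, context_types=['count'], data_types=['count', 'count'])

sampled = model.sample_sequence(context=[1])
# `sampled` is a list with one list of values per data channel
```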
""" - context = context_to_tensor(context, self._context_size, self._context_map)\ - .unsqueeze(0).to(self._device) - + context_tensor = context_to_tensor(context, self._context_size, self._context_map) + context = context_tensor.unsqueeze(0).to(self._device) with torch.no_grad(): generated = self._generate(context, sequence_length) if sequence_length is None: diff --git a/deepecho/models/utils.py b/deepecho/models/utils.py index b43e2f2..a5efccd 100644 --- a/deepecho/models/utils.py +++ b/deepecho/models/utils.py @@ -1,5 +1,4 @@ """Utils for models.""" -# pylint: disable-all import numpy as np import pandas as pd @@ -11,7 +10,7 @@ def index_map(columns, types): The output of this function has two elements: - - An idx_map, which is a dict that indicates the indexes at which + - An 'mapping', which is a dict that indicates the indexes at which the list of tensor dimensions associated with each input column starts, and the properties of such columns. - An integer that indicates how many dimensions the tensor will have. @@ -24,6 +23,18 @@ def index_map(columns, types): - If the column is categorical or ordinal, 1 dimentions is created for each possible value, which will be later on used to hold one-hot encoding information about the values. + + Args: + columns(list): + Data contained in the associate column. + types(list): + Contains information about 'columns' type. + + Returns: + dict: + Contains information related to the properties of the columns data. + int: + Indicates how many dimensions the tensor will have """ dimensions = 0 mapping = {} @@ -92,9 +103,10 @@ def denormalize(tensor, row, properties, round_value): Contains information related to the value category. round_value(boolean): Apply round to the denormalized value or not. - Return: - denormalized(float) - Return the denormalized value. + + Returns: + float: + Denormalized value. """ value_idx, missing_idx = properties['indices'] if tensor[row, 0, missing_idx] > 0.5: @@ -137,9 +149,10 @@ def one_hot_decode(tensor, row, properties): Indicates the sample. properties (dict): Contains information related to the value category. + Returns: - selected(int): - Category selected. + int: + Category selected. """ max_value = float('-inf') for category, idx in properties['indices'].items(): @@ -189,8 +202,10 @@ def data_to_tensor(data, model_data_size, data_map, fixed_length, max_sequence_l Define samples length. max_sequence_length(): Define the length of the biggest sequence. - Return: - 2D torch vector, with all samples concatenated. + + Returns: + torch tensor: + All samples concatenated. """ tensors = [] num_rows = len(data[0]) @@ -221,8 +236,9 @@ def context_to_tensor(context, context_size, context_map): Define 'tensor' size. context_map (dict): Contains information related to the value category. - Return: - tensor(torch tensor): + + Returns: + torch tensor: 3D array, contains the concatenated samples """ tensor = torch.zeros(context_size) @@ -241,8 +257,10 @@ def tensor_to_data(tensor, data_map): List of arrays of input data. data_map(int): Dimension of tensors. - Return: - data + + Returns: + list: + data sequence """ sequence_length, num_sequences, _ = tensor.shape assert num_sequences == 1 @@ -267,7 +285,7 @@ def tensor_to_data(tensor, data_map): return data -def build_tensor(transform, sequences, key, dim, device, **transform_kwargs): +def build_tensor(transform, sequences, key, dim, **transform_kwargs): """Convert input sequences to tensors. 
Args: @@ -279,15 +297,15 @@ def build_tensor(transform, sequences, key, dim, device, **transform_kwargs): Indicates with information pass to the function from variable 'sequence'. dim(int) Dimension to insert. - device(torch.device) - Indicate available device. **transform_kwargs(dict) Contains input variables for the function passed by 'transform'. + Returns: - 3D torch vector, with all samples concatenated. + torch tensor: + All samples concatenated. """ tensors = [] for sequence in sequences: tensors.append(transform(sequence[key], **transform_kwargs)) - return torch.stack(tensors, dim=dim).to(device) + return torch.stack(tensors, dim=dim) From a4b53a22eb18280a6dd949acb0b0f1482857316f Mon Sep 17 00:00:00 2001 From: CristianCuadrado Date: Mon, 7 Sep 2020 11:55:34 +0100 Subject: [PATCH 7/8] improve docstrings in utils, modifications in basic_gan --- deepecho/models/basic_gan.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deepecho/models/basic_gan.py b/deepecho/models/basic_gan.py index b37b0b3..a022f9b 100644 --- a/deepecho/models/basic_gan.py +++ b/deepecho/models/basic_gan.py @@ -353,12 +353,11 @@ def fit_sequences(self, sequences, context_types, data_types): sequences=sequences, key='data', dim=1, - device=self._device, model_data_size=self._model_data_size, data_map=self._data_map, fixed_length=self._fixed_length, max_sequence_length=self._max_sequence_length - ) + ).to(self._device) context = build_tensor( transform=context_to_tensor, From 5b091e15dbce6c9766c4102e498492f7baece3fd Mon Sep 17 00:00:00 2001 From: Carles Sala Date: Wed, 9 Sep 2020 14:31:50 +0200 Subject: [PATCH 8/8] Update docstrings --- deepecho/models/utils.py | 119 ++++++++++++++++++++------------------- 1 file changed, 62 insertions(+), 57 deletions(-) diff --git a/deepecho/models/utils.py b/deepecho/models/utils.py index a5efccd..36a6fc6 100644 --- a/deepecho/models/utils.py +++ b/deepecho/models/utils.py @@ -25,16 +25,15 @@ def index_map(columns, types): information about the values. Args: - columns(list): - Data contained in the associate column. - types(list): - Contains information about 'columns' type. + columns (list): + List of lists containing the values of each column. + types (list): + List of strings containing the type of each column. Returns: - dict: - Contains information related to the properties of the columns data. - int: - Indicates how many dimensions the tensor will have + tuple: + * ``dict``: Information related to the properties of the columns data. + * ``int``: Number of dimensions the that tensor will have. """ dimensions = 0 mapping = {} @@ -67,15 +66,18 @@ def index_map(columns, types): def normalize(tensor, value, properties): - """Normalize value and flag nans. Normalized values are between -1 and 1. + """Normalize value and flag nans. + + Normalized values are between -1 and 1. Args: tensor (array): - Vector to store normalize values and recording null values position. + Tensor in which the normalized values will be stored. value (float): Value to normalize. properties (dict): - Contains information related to the value category. + Dictionary with information related to the given value, + which must contain the indices and the min/max values. """ value_idx, missing_idx = properties['indices'] if pd.isnull(value): @@ -96,13 +98,14 @@ def denormalize(tensor, row, properties, round_value): Args: tensor (array): 3D Vector that contains different samples with normalized values - and record of null values. + and records of null values. 
row (int): - Sample to denormalize + Index of the row that needs to be decoded. properties (dict): - Contains information related to the value category. - round_value(boolean): - Apply round to the denormalized value or not. + Dictionary with information related to the given value, + which must contain the indices and the min/max values. + round_value (boolean): + Whether to round the denormalized value or not. Returns: float: @@ -125,15 +128,16 @@ def denormalize(tensor, row, properties, round_value): def one_hot_encode(tensor, value, properties): - """Update the index that corresponds to the value to 1.0. + """Set 1.0 at the tensor index that corresponds to the value. Args: tensor (array): - Vector to store one hot encoding. + Tensor that will be updated. value (int): - Categorical variable key + Value that needs to be one-hot encoded. properties (dict): - Contains information related to the value category. + Dictionary with information related to the given value, + which must contain the indices of the values. """ value_index = properties['indices'][value] tensor[value_index] = 1.0 @@ -144,15 +148,16 @@ def one_hot_decode(tensor, row, properties): Args: tensor (array): - Vector that store one hot encoding for different samples. + Tensor which contains the one-hot encoded rows. row (int): - Indicates the sample. + Index of the row that needs to be decoded. properties (dict): - Contains information related to the value category. + Dictionary with information related to the given value, + which must contain the indices of the values. Returns: int: - Category selected. + Decoded category value. """ max_value = float('-inf') for category, idx in properties['indices'].items(): @@ -169,11 +174,12 @@ def value_to_tensor(tensor, value, properties): Args: tensor (array): - Vector to store the values and recording null values position. + Tensor in which the encoded or normalized values will be stored. value (float): - Value to normalize. + Value to encode or normalize. properties (dict): - Contains information related to the value category. + Dictionary with information related to the given value, + which must contain the indices and min/max of the values. """ column_type = properties['type'] if column_type in ('continuous', 'count'): @@ -188,24 +194,24 @@ def value_to_tensor(tensor, value, properties): def data_to_tensor(data, model_data_size, data_map, fixed_length, max_sequence_length): """Convert the input data to the corresponding tensor. - If ``self._fixed_length`` is ``False``, add a 1.0 to indicate + If ``fixed_length`` is ``False``, add a 1.0 to indicate the sequence end and pad the rest of the sequence with 0.0s. Args: data (list): - List of arrays of input data. - model_data_size(int): - Dimension of tensors. + List of lists containing the input sequences. + model_data_size (int): + Number of columns to create in the tensor. data_map (dict): - Contains information related to the value category. - fixed_length(Boolean): - Define samples length. - max_sequence_length(): - Define the length of the biggest sequence. + Dictionary with information related to the data variables, + which must contain the indices and min/max of the values. + fixed_length (boolean): + Whether to add an end flag column or not. + max_sequence_length (int): + Maximum sequence length. Returns: - torch tensor: - All samples concatenated. 
+ torch.tensor """ tensors = [] num_rows = len(data[0]) @@ -230,16 +236,16 @@ def context_to_tensor(context, context_size, context_map): """Convert the input context to the corresponding tensor. Args: - context (array): - Context context information. - context_size(int): - Define 'tensor' size. + context (list): + List containing the context values. + context_size (int): + Size of the output tensor. context_map (dict): - Contains information related to the value category. + Dictionary with information related to the context variables, + which must contain the indices and min/max of the values. Returns: - torch tensor: - 3D array, contains the concatenated samples + torch.tensor """ tensor = torch.zeros(context_size) for column, properties in context_map.items(): @@ -254,13 +260,13 @@ def tensor_to_data(tensor, data_map): Args: tensor (list): - List of arrays of input data. - data_map(int): - Dimension of tensors. + Tensor containing the generated data. + data_map (int): + Dictionary with information related to the data variables, + which must contain the indices and min/max of the values. Returns: - list: - data sequence + list """ sequence_length, num_sequences, _ = tensor.shape assert num_sequences == 1 @@ -292,17 +298,16 @@ def build_tensor(transform, sequences, key, dim, **transform_kwargs): transform (function): Function to apply. sequences (dict): - Contains data samples. + Dict containing the sequences and the context vectors. key (str): - Indicates with information pass to the function from variable 'sequence'. - dim(int) - Dimension to insert. + Key to use when obtaining the data from the sequences dict. + dim (int) + Dimension to use when the tensors are stacked. **transform_kwargs(dict) - Contains input variables for the function passed by 'transform'. + Additional arguments for the ``transform`` function. Returns: - torch tensor: - All samples concatenated. + torch tensor """ tensors = [] for sequence in sequences:
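After PATCH 8/8 the helper signatures are final. One last sketch tying them together the way ``fit_sequences`` now uses them (device handling follows the pattern from PATCH 7/8; all names come from the patches above):

```python
import torch
from deepecho.models.utils import build_tensor, context_to_tensor, index_map

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

sequences = [{'context': [1]}, {'context': [2]}]
context_map, context_size = index_map([[1, 2]], ['count'])

context = build_tensor(
    transform=context_to_tensor,
    sequences=sequences,
    key='context',
    dim=0,
    context_size=context_size,
    context_map=context_map,
).to(device)
assert context.shape == (2, context_size)
```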