modAL/batch.py (5 additions, 3 deletions)

@@ -8,7 +8,7 @@
 import scipy.sparse as sp
 from sklearn.metrics.pairwise import pairwise_distances, pairwise_distances_argmin_min

-from modAL.utils.data import data_vstack, modALinput
+from modAL.utils.data import data_vstack, modALinput, data_shape
 from modAL.models.base import BaseCommittee, BaseLearner
 from modAL.uncertainty import classifier_uncertainty

@@ -150,8 +150,10 @@ def ranked_batch(classifier: Union[BaseLearner, BaseCommittee],
     if classifier.X_training is None:
         best_coldstart_instance_index, labeled = select_cold_start_instance(X=unlabeled, metric=metric, n_jobs=n_jobs)
         instance_index_ranking = [best_coldstart_instance_index]
-    elif classifier.X_training.shape[0] > 0:
-        labeled = classifier.Xt_training[:] if classifier.on_transformed else classifier.X_training[:]
+    elif data_shape(classifier.X_training)[0] > 0:
+        labeled = classifier.transform_without_estimating(
+            classifier.X_training
+        ) if classifier.on_transformed else classifier.X_training[:]
         instance_index_ranking = []

     # The maximum number of records to sample.
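Why the hunk above replaces classifier.X_training.shape[0] with data_shape(classifier.X_training)[0]: X_training may be a plain Python list (for instance raw text documents, as in the new test further down), and lists have no .shape attribute. A minimal sketch of the idea, not part of this diff; it assumes data_shape returns a shape-like tuple for list, ndarray and sparse inputs alike:

import numpy as np
from scipy import sparse as sp
from modAL.utils.data import data_shape

X_list = ['Dog', 'Cat', 'Tree']        # raw text: no .shape attribute
X_array = np.zeros((3, 2))             # ndarray: has .shape
X_sparse = sp.csr_matrix(X_array)      # sparse matrix: has .shape

# Each call is expected to report 3 rows, so ranked_batch can ask
# "is there any labeled data yet?" without caring about the container type.
for X in (X_list, X_array, X_sparse):
    print(data_shape(X)[0])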
modAL/models/base.py (0 additions, 8 deletions)

@@ -66,11 +66,9 @@ def __init__(self,
         self.on_transformed = on_transformed

         self.X_training = X_training
-        self.Xt_training = None
         self.y_training = y_training
         if X_training is not None:
             self._fit_to_known(bootstrap=bootstrap_init, **fit_kwargs)
-            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None

         assert isinstance(force_all_finite, bool), 'force_all_finite must be a bool'
         self.force_all_finite = force_all_finite

@@ -92,15 +90,10 @@ def _add_training_data(self, X: modALinput, y: modALinput) -> None:

         if self.X_training is None:
             self.X_training = X
-            self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
             self.y_training = y
         else:
             try:
                 self.X_training = data_vstack((self.X_training, X))
-                self.Xt_training = data_vstack((
-                    self.Xt_training,
-                    self.transform_without_estimating(X)
-                )) if self.on_transformed else None
                 self.y_training = data_vstack((self.y_training, y))
             except ValueError:
                 raise ValueError('the dimensions of the new training data and label must'

@@ -213,7 +206,6 @@ def fit(self, X: modALinput, y: modALinput, bootstrap: bool = False, **fit_kwargs):
         check_X_y(X, y, accept_sparse=True, ensure_2d=False, allow_nd=True, multi_output=True, dtype=None,
                   force_all_finite=self.force_all_finite)
         self.X_training, self.y_training = X, y
-        self.Xt_training = self.transform_without_estimating(self.X_training) if self.on_transformed else None
         return self._fit_to_known(bootstrap=bootstrap, **fit_kwargs)

     def predict(self, X: modALinput, **predict_kwargs) -> Any:
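For context on why Xt_training is removed above instead of being kept in sync: a learnable transformer inside the wrapped pipeline is refit every time the learner is retrained, so transformed rows cached earlier may no longer live in the same feature space as rows transformed later. A minimal sketch in plain scikit-learn (illustration only, not part of this diff):

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()

# Representation cached before retraining: the vocabulary has 3 words.
cached = vec.fit_transform(['Dog', 'Cat', 'Tree'])
print(cached.shape)   # (3, 3)

# Refitting on an extended corpus changes the vocabulary and the feature space.
vec.fit(['Dog', 'Cat', 'Tree', 'Airplane', 'House'])
fresh = vec.transform(['Dog', 'Cat', 'Tree'])
print(fresh.shape)    # (3, 5): stacking cached rows with fresh rows would fail

Recomputing the transformation on demand, as the new ranked_batch code does via transform_without_estimating, avoids this staleness at the cost of an extra transform per query.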
tests/core_tests.py (40 additions, 0 deletions)

@@ -29,6 +29,7 @@
 from sklearn.multiclass import OneVsRestClassifier
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import FunctionTransformer
+from sklearn.feature_extraction.text import CountVectorizer
 from scipy.stats import entropy, norm
 from scipy.special import ndtr
 from scipy import sparse as sp

@@ -824,6 +825,45 @@ def test_on_transformed(self):
         query_idx, query_inst = learner.query(X_pool)
         learner.teach(X_pool.iloc[query_idx], y_pool[query_idx])

+    def test_on_transformed_with_variable_transformation(self):
+        """
+        Learnable transformations naturally change after a model is retrained. Make sure this is
+        handled properly for on_transformed=True query strategies.
+        """
+        query_strategies = [
+            modAL.batch.uncertainty_batch_sampling
+            # add further strategies which work with instance representations
+            # no further ones as of 09.12.2020
+        ]
+
+        X_labeled = ['Dog', 'Cat', 'Tree']
+
+        # contains words unseen in the labeled set; training the model on them
+        # will alter the CountVectorizer transformation
+        X_pool = ['Airplane', 'House']
+
+        y = [0, 1, 1, 0, 1]  # irrelevant for this test
+
+        for query_strategy in query_strategies:
+            learner = modAL.models.learners.ActiveLearner(
+                estimator=make_pipeline(
+                    CountVectorizer(),
+                    RandomForestClassifier(n_estimators=10)
+                ),
+                query_strategy=query_strategy,
+                X_training=X_labeled, y_training=y[:len(X_labeled)],
+                on_transformed=True,
+            )
+
+            for _ in range(len(X_pool)):
+                query_idx, query_instance = learner.query(X_pool, n_instances=1)
+                i = query_idx[0]
+
+                learner.teach(
+                    X=[X_pool[i]],
+                    y=[y[i]]
+                )
+
     def test_old_query_strategy_interface(self):
         n_samples = 10
         n_features = 5