 from warnings import warn
 
 from ..tree import ExtraTreeRegressor
-from ..utils import check_random_state, check_array
+from ..utils import (
+    check_random_state,
+    check_array,
+    gen_batches,
+    get_chunk_n_rows,
+)
 from ..utils.fixes import _joblib_parallel_args
-from ..utils.validation import check_is_fitted
+from ..utils.validation import check_is_fitted, _num_samples
 from ..base import OutlierMixin
 
 from .bagging import BaseBagging
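
Aside on the two helpers pulled into the import block: gen_batches yields slice objects covering range(n) in fixed-size steps, and get_chunk_n_rows picks the largest row count whose temporary buffer fits the configured working_memory budget. A minimal sketch follows; the sklearn.utils import path matches this 0.20-era layout (get_chunk_n_rows later moved to a private module), and all numeric values are illustrative.

    from sklearn import config_context
    from sklearn.utils import gen_batches, get_chunk_n_rows

    # With a 1 MiB working-memory budget and rows of 16 * 256 bytes each,
    # 2**20 // 4096 == 256 rows fit in one chunk.
    with config_context(working_memory=1):
        n_rows = get_chunk_n_rows(row_bytes=16 * 256, max_n_rows=10000)

    print(n_rows)  # 256
    print(list(gen_batches(10, 4)))
    # [slice(0, 4, None), slice(4, 8, None), slice(8, 10, None)]
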
@@ -388,21 +393,69 @@ def score_samples(self, X):
388393 "match the input. Model n_features is {0} and "
389394 "input n_features is {1}."
390395 "" .format (self .n_features_ , X .shape [1 ]))
391- n_samples = X .shape [0 ]
392396
393- n_samples_leaf = np .zeros (n_samples , order = "f" )
394- depths = np .zeros (n_samples , order = "f" )
397+ # Take the opposite of the scores as bigger is better (here less
398+ # abnormal)
399+ return - self ._compute_chunked_score_samples (X )
400+
401+ @property
402+ def threshold_ (self ):
403+ if self .behaviour != 'old' :
404+ raise AttributeError ("threshold_ attribute does not exist when "
405+ "behaviour != 'old'" )
406+ warn ("threshold_ attribute is deprecated in 0.20 and will"
407+ " be removed in 0.22." , DeprecationWarning )
408+ return self ._threshold_
409+
410+ def _compute_chunked_score_samples (self , X ):
411+
412+ n_samples = _num_samples (X )
395413
396414 if self ._max_features == X .shape [1 ]:
397415 subsample_features = False
398416 else :
399417 subsample_features = True
 
+        # We get as many rows as possible within our working_memory budget
+        # (defined by sklearn.get_config()['working_memory']) to store
+        # self._max_features in each row during computation.
+        #
+        # Note:
+        #  - this will get at least 1 row, even if a single row of scores
+        #    exceeds working_memory.
+        #  - this only accounts for temporary memory usage while loading
+        #    the data needed to compute the scores -- the returned scores
+        #    themselves are 1D.
+
+        chunk_n_rows = get_chunk_n_rows(row_bytes=16 * self._max_features,
+                                        max_n_rows=n_samples)
+        slices = gen_batches(n_samples, chunk_n_rows)
+
+        scores = np.zeros(n_samples, order="f")
+
+        for sl in slices:
+            # compute scores on the slices of test samples:
+            scores[sl] = self._compute_score_samples(X[sl], subsample_features)
+
+        return scores
+
+    def _compute_score_samples(self, X, subsample_features):
+        """Compute the score of each sample in X going through the extra trees.
+
+        Parameters
+        ----------
+        X : array-like or sparse matrix
+
+        subsample_features : bool
+            Whether features should be subsampled.
+        """
+        n_samples = X.shape[0]
+
+        depths = np.zeros(n_samples, order="f")
+
         for tree, features in zip(self.estimators_, self.estimators_features_):
-            if subsample_features:
-                X_subset = X[:, features]
-            else:
-                X_subset = X
+            X_subset = X[:, features] if subsample_features else X
+
             leaves_index = tree.apply(X_subset)
             node_indicator = tree.decision_path(X_subset)
             n_samples_leaf = tree.tree_.n_node_samples[leaves_index]
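
The loop above gathers, for every tree, the leaf each sample lands in and the path it took to get there. A hedged standalone sketch of that per-tree depth computation, using the same tree APIs (apply, decision_path, tree_.n_node_samples); X_train, X_test, and the fitting target are illustrative placeholders, not part of the diff:

    import numpy as np
    from sklearn.tree import ExtraTreeRegressor

    rng = np.random.RandomState(0)
    X_train = rng.randn(100, 2)   # placeholder data
    X_test = rng.randn(5, 2)

    # IsolationForest fits ExtraTreeRegressor base estimators on random
    # uniform targets, so we mimic that here.
    tree = ExtraTreeRegressor(max_features=1, random_state=rng)
    tree.fit(X_train, rng.uniform(size=100))

    leaves_index = tree.apply(X_test)            # leaf node id per sample
    node_indicator = tree.decision_path(X_test)  # sparse CSR of visited nodes
    n_samples_leaf = tree.tree_.n_node_samples[leaves_index]

    # Depth of a sample is the number of edges from root to leaf, i.e. the
    # count of visited nodes minus one; unsplit leaves get the c(n)
    # correction shown in the sketch after the next hunk.
    depths = np.ravel(node_indicator.sum(axis=1)) - 1.0
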
@@ -418,19 +471,7 @@ def score_samples(self, X):
             / (len(self.estimators_)
                * _average_path_length([self.max_samples_]))
         )
-
-        # Take the opposite of the scores as bigger is better (here less
-        # abnormal)
-        return -scores
-
-    @property
-    def threshold_(self):
-        if self.behaviour != 'old':
-            raise AttributeError("threshold_ attribute does not exist when "
-                                 "behaviour != 'old'")
-        warn("threshold_ attribute is deprecated in 0.20 and will"
-             " be removed in 0.22.", DeprecationWarning)
-        return self._threshold_
+        return scores
 
 
 def _average_path_length(n_samples_leaf):
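
For reference, _average_path_length computes c(n) from Liu et al. (2008), the normalizer used in the 2 ** (-depths / ...) expression in the hunk above. A minimal standalone sketch under that assumption, not the library's exact implementation; mean_depth and psi are made-up values:

    import numpy as np

    def average_path_length(n):
        # c(n): average path length of an unsuccessful binary search tree
        # lookup among n samples; n <= 1 gives 0, n == 2 gives 1.
        n = np.atleast_1d(np.asarray(n, dtype=float))
        apl = np.zeros_like(n)
        apl[n == 2] = 1.0
        big = n > 2
        apl[big] = (2.0 * (np.log(n[big] - 1.0) + np.euler_gamma)
                    - 2.0 * (n[big] - 1.0) / n[big])
        return apl

    # Anomaly score of one sample: 2 ** (-E[h(x)] / c(psi)), where E[h(x)]
    # is the mean depth across trees and psi is max_samples_.
    mean_depth, psi = 7.3, 256
    print(2.0 ** (-mean_depth / average_path_length([psi])[0]))  # ~0.61
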