@@ -33,42 +33,32 @@ def _get_mask(X, value_to_mask):
3333 return X == value_to_mask
3434
3535
36- def _get_median (negative_elements , n_zeros , positive_elements ):
37- """Compute the median of the array formed by negative_elements,
38- n_zeros zeros and positive_elements. This function is used
39- to support sparse matrices."""
40- negative_elements = np .sort (negative_elements , kind = 'heapsort' )
41- positive_elements = np .sort (positive_elements , kind = 'heapsort' )
42-
43- n_elems = len (negative_elements ) + n_zeros + len (positive_elements )
36+ def _get_median (data , n_zeros ):
37+ """Compute the median of data with n_zeros additional zeros.
38+
39+ This function is used to support sparse matrices; it modifies data in-place
40+ """
41+ n_elems = len (data ) + n_zeros
4442 if not n_elems :
4543 return np .nan
44+ n_negative = np .count_nonzero (data < 0 )
45+ middle , is_odd = divmod (n_elems , 2 )
46+ data .sort ()
4647
47- median_position = (n_elems - 1 ) / 2.0
48+ if is_odd :
49+ return _get_elem_at_rank (middle , data , n_negative , n_zeros )
4850
49- if round (median_position ) == median_position :
50- median = _get_elem_at_rank (negative_elements , n_zeros ,
51- positive_elements , median_position )
52- else :
53- a = _get_elem_at_rank (negative_elements , n_zeros ,
54- positive_elements , math .floor (median_position ))
55- b = _get_elem_at_rank (negative_elements , n_zeros ,
56- positive_elements , math .ceil (median_position ))
57- median = (a + b ) / 2.0
58-
59- return median
60-
61-
62- def _get_elem_at_rank (negative_elements , n_zeros , positive_elements , k ):
63- """Compute the kth largest element of the array formed by
64- negative_elements, n_zeros zeros and positive_elements."""
65- len_neg = len (negative_elements )
66- if k < len_neg :
67- return negative_elements [k ]
68- elif k >= len_neg + n_zeros :
69- return positive_elements [k - len_neg - n_zeros ]
70- else :
51+ return (_get_elem_at_rank (middle - 1 , data , n_negative , n_zeros ) +
52+ _get_elem_at_rank (middle , data , n_negative , n_zeros )) / 2.
53+
54+
55+ def _get_elem_at_rank (rank , data , n_negative , n_zeros ):
56+ """Find the value in data augmented with n_zeros for the given rank"""
57+ if rank < n_negative :
58+ return data [rank ]
59+ if rank - n_negative < n_zeros :
7160 return 0
61+ return data [rank - n_zeros ]
7262
7363
7464def _most_frequent (array , extra_value , n_repeat ):
@@ -137,8 +127,8 @@ class Imputer(BaseEstimator, TransformerMixin):
137127
138128 Attributes
139129 ----------
140- `statistics_` : array of shape (n_features,) or (n_samples,)
141- The statistics along the imputation axis.
130+ `statistics_` : array of shape (n_features,)
131+ The imputation fill value for each feature if axis == 0 .
142132
143133 Notes
144134 -----
@@ -211,7 +201,7 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
211201
212202 # Count the zeros
213203 if missing_values == 0 :
214- n_zeros_axis = np .zeros (X .shape [not axis ])
204+ n_zeros_axis = np .zeros (X .shape [not axis ], dtype = int )
215205 else :
216206 n_zeros_axis = X .shape [axis ] - np .diff (X .indptr )
217207
@@ -257,19 +247,15 @@ def _sparse_fit(self, X, strategy, missing_values, axis):
257247 mask_valids = np .hsplit (np .logical_not (mask_missing_values ),
258248 X .indptr [1 :- 1 ])
259249
260- columns = [col [mask .astype (np .bool )]
250+ # astype necessary for bug in numpy.hsplit before v1.9
251+ columns = [col [mask .astype (bool , copy = False )]
261252 for col , mask in zip (columns_all , mask_valids )]
262253
263254 # Median
264255 if strategy == "median" :
265256 median = np .empty (len (columns ))
266257 for i , column in enumerate (columns ):
267-
268- negatives = column [column < 0 ]
269- positives = column [column > 0 ]
270- median [i ] = _get_median (negatives ,
271- n_zeros_axis [i ],
272- positives )
258+ median [i ] = _get_median (column , n_zeros_axis [i ])
273259
274260 return median
275261
0 commit comments