ENH fix astype usage to prevent copying

larsmans · larsmans · commit cdd2279c09dd · 2014-06-18T11:01:28.000+02:00
Removed redundant astype calls where possible; using astype with copy=False (from utils.fixes) where a conversion is still needed.

Also includes some C integer type fixes in gradient boosting (npy_intp for sample counts and loop indices, and a uint8-typed boolean sample mask).
diff --git a/.gitattributes b/.gitattributes
@@ -10,6 +10,7 @@
 /sklearn/linear_model/sgd_fast.c -diff
 /sklearn/metrics/pairwise_fast.c -diff
 /sklearn/neighbors/ball_tree.c -diff
+/sklearn/neighbors/kd_tree.c -diff
 /sklearn/svm/liblinear.c -diff
 /sklearn/svm/libsvm.c -diff
 /sklearn/svm/libsvm_sparse.c -diff
diff --git a/sklearn/covariance/robust_covariance.py b/sklearn/covariance/robust_covariance.py
@@ -681,7 +681,7 @@ def reweight_covariance(self, data):
             location_reweighted = data[mask].mean(0)
         covariance_reweighted = self._nonrobust_covariance(
             data[mask], assume_centered=self.assume_centered)
-        support_reweighted = np.zeros(n_samples).astype(bool)
+        support_reweighted = np.zeros(n_samples, dtype=bool)
         support_reweighted[mask] = True
         self._set_covariance(covariance_reweighted)
         self.location_ = location_reweighted
diff --git a/sklearn/datasets/samples_generator.py b/sklearn/datasets/samples_generator.py
@@ -14,6 +14,7 @@
 from ..preprocessing import MultiLabelBinarizer
 from ..utils import array2d, check_random_state
 from ..utils import shuffle as util_shuffle
+from ..utils.fixes import astype
 from ..utils.random import sample_without_replacement
 from ..externals import six
 map = six.moves.map
@@ -26,8 +27,9 @@ def _generate_hypercube(samples, dimensions, rng):
     if dimensions > 30:
         return np.hstack([_generate_hypercube(samples, dimensions - 30, rng),
                           _generate_hypercube(samples, 30, rng)])
-    out = sample_without_replacement(2 ** dimensions, samples,
-                                     random_state=rng).astype('>u4')
+    out = astype(sample_without_replacement(2 ** dimensions, samples,
+                                            random_state=rng),
+                 dtype='>u4', copy=False)
     out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:]
     return out
 
diff --git a/sklearn/ensemble/_gradient_boosting.c b/sklearn/ensemble/_gradient_boosting.c
diff --git a/sklearn/ensemble/_gradient_boosting.pyx b/sklearn/ensemble/_gradient_boosting.pyx
@@ -16,16 +16,12 @@ from sklearn.tree._tree cimport Tree, Node
 
 ctypedef np.int32_t int32
 ctypedef np.float64_t float64
-ctypedef np.int8_t int8
-
-from numpy import bool as np_bool
+ctypedef np.uint8_t uint8
 
 # no namespace lookup for numpy dtype and array creation
 from numpy import zeros as np_zeros
 from numpy import ones as np_ones
 from numpy import bool as np_bool
-from numpy import int8 as np_int8
-from numpy import intp as np_intp
 from numpy import float32 as np_float32
 from numpy import float64 as np_float64
 
@@ -267,7 +263,8 @@ cpdef _partial_dependence_tree(Tree tree, DTYPE_t[:, ::1] X,
                              total_weight)
 
 
-def _random_sample_mask(int n_total_samples, int n_total_in_bag, random_state):
+def _random_sample_mask(np.npy_intp n_total_samples,
+                        np.npy_intp n_total_in_bag, random_state):
      """Create a random sample mask where ``n_total_in_bag`` elements are set.
 
      Parameters
@@ -289,15 +286,15 @@ def _random_sample_mask(int n_total_samples, int n_total_in_bag, random_state):
      """
      cdef np.ndarray[float64, ndim=1, mode="c"] rand = \
           random_state.rand(n_total_samples)
-     cdef np.ndarray[int8, ndim=1, mode="c"] sample_mask = \
-          np_zeros((n_total_samples,), dtype=np_int8)
+     cdef np.ndarray[uint8, ndim=1, mode="c", cast=True] sample_mask = \
+          np_zeros((n_total_samples,), dtype=np_bool)
 
-     cdef int n_bagged = 0
-     cdef int i = 0
+     cdef np.npy_intp n_bagged = 0
+     cdef np.npy_intp i = 0
 
-     for i from 0 <= i < n_total_samples:
+     for i in range(n_total_samples):
          if rand[i] * (n_total_samples - i) < (n_total_in_bag - n_bagged):
              sample_mask[i] = 1
              n_bagged += 1
 
-     return sample_mask.astype(np_bool)
+     return sample_mask
diff --git a/sklearn/feature_extraction/image.py b/sklearn/feature_extraction/image.py
@@ -16,6 +16,7 @@
 from numpy.lib.stride_tricks import as_strided
 
 from ..utils import array2d, check_random_state
+from ..utils.fixes import astype
 from ..base import BaseEstimator
 
 __all__ = ['PatchExtractor',
@@ -107,7 +108,8 @@ def _to_graph(n_x, n_y, n_z, mask=None, img=None,
         n_voxels = diag.size
     else:
         if mask is not None:
-            mask = mask.astype(np.bool)
+            mask = astype(mask, dtype=np.bool, copy=False)
+            mask = np.asarray(mask, dtype=np.bool)
             edges = _mask_edges_weights(mask, edges)
             n_voxels = np.sum(mask)
         else:
@@ -147,7 +149,7 @@ def img_to_graph(img, mask=None, return_as=sparse.coo_matrix, dtype=None):
         dtype of img
 
     Notes
-    ===========
+    =====
     For sklearn versions 0.14.1 and prior, return_as=np.ndarray was handled
     by returning a dense np.matrix instance.  Going forward, np.ndarray
     returns an np.ndarray, as expected.
@@ -183,7 +185,7 @@ def grid_to_graph(n_x, n_y, n_z=1, mask=None, return_as=sparse.coo_matrix,
         The data of the returned sparse matrix. By default it is int
 
     Notes
-    ===========
+    =====
     For sklearn versions 0.14.1 and prior, return_as=np.ndarray was handled
     by returning a dense np.matrix instance.  Going forward, np.ndarray
     returns an np.ndarray, as expected.
diff --git a/sklearn/gaussian_process/gaussian_process.py b/sklearn/gaussian_process/gaussian_process.py
@@ -52,7 +52,7 @@ def l1_cross_distances(X):
         ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples)
         D[ll_0:ll_1] = np.abs(X[k] - X[(k + 1):n_samples])
 
-    return D, ij.astype(np.int)
+    return D, ij
 
 
 class GaussianProcess(BaseEstimator, RegressorMixin):
diff --git a/sklearn/learning_curve.py b/sklearn/learning_curve.py
@@ -4,14 +4,17 @@
 #
 # License: BSD 3 clause
 
-import numpy as np
 import warnings
+
+import numpy as np
+
 from .base import is_classifier, clone
 from .cross_validation import _check_cv
-from .utils import check_arrays
 from .externals.joblib import Parallel, delayed
 from .cross_validation import _safe_split, _score, _fit_and_score
 from .metrics.scorer import check_scoring
+from .utils import check_arrays
+from .utils.fixes import astype
 
 
 def learning_curve(estimator, X, y, train_sizes=np.linspace(0.1, 1.0, 5),
@@ -175,8 +178,8 @@ def _translate_train_sizes(train_sizes, n_max_training_samples):
                              "must be within (0, 1], but is within [%f, %f]."
                              % (n_min_required_samples,
                                 n_max_required_samples))
-        train_sizes_abs = (train_sizes_abs
-                           * n_max_training_samples).astype(np.int)
+        train_sizes_abs = astype(train_sizes_abs * n_max_training_samples,
+                                 dtype=np.int, copy=False)
         train_sizes_abs = np.clip(train_sizes_abs, 1,
                                   n_max_training_samples)
     else:
diff --git a/sklearn/linear_model/randomized_l1.py b/sklearn/linear_model/randomized_l1.py
@@ -49,7 +49,7 @@ def _resample_model(estimator_func, X, y, scaling=.5, n_resampling=200,
                 verbose=max(0, verbose - 1),
                 **params)
             for _ in range(n_resampling)):
-        scores_ += active_set.astype(np.float)
+        scores_ += active_set
 
     scores_ /= n_resampling
     return scores_
diff --git a/sklearn/naive_bayes.py b/sklearn/naive_bayes.py
@@ -298,7 +298,6 @@ def fit(self, X, y, sample_weight=None):
             Returns self.
         """
         X, y = check_arrays(X, y, sparse_format='csr')
-        X = X.astype(np.float)
         y = column_or_1d(y, warn=True)
         _, n_features = X.shape
 
@@ -308,7 +307,8 @@ def fit(self, X, y, sample_weight=None):
         if Y.shape[1] == 1:
             Y = np.concatenate((1 - Y, Y), axis=1)
 
-        # convert to float to support sample weight consistently
+        # convert to float to support sample weight consistently;
+        # this means we also don't have to cast X to floating point
         Y = Y.astype(np.float64)
         if sample_weight is not None:
             Y *= array2d(sample_weight).T
diff --git a/sklearn/neighbors/binary_tree.pxi b/sklearn/neighbors/binary_tree.pxi
@@ -1703,7 +1703,7 @@ cdef class BinaryTree:
 
         # prepare r for query
         r = np.asarray(r, dtype=DTYPE, order='C')
-        r = np.atleast_1d(r).astype(DTYPE)
+        r = np.atleast_1d(r)
         if r.ndim != 1:
             raise ValueError("r must be a 1-dimensional array")
         i_rsort = np.argsort(r)
diff --git a/sklearn/neighbors/classification.py b/sklearn/neighbors/classification.py
@@ -378,7 +378,7 @@ def predict(self, X):
                                  in zip(pred_labels[inliers], weights)],
                                 dtype=np.int)
 
-            mode = mode.ravel().astype(np.int)
+            mode = mode.ravel()
 
             y_pred[inliers, k] = classes_k.take(mode)
 
diff --git a/sklearn/neighbors/kd_tree.c b/sklearn/neighbors/kd_tree.c
diff --git a/sklearn/utils/linear_assignment_.py b/sklearn/utils/linear_assignment_.py