fieraloca
diff --git a/‎benchmarks/bench_tsne_mnist.py‎
Lines changed: 3 additions & 2 deletions b/‎benchmarks/bench_tsne_mnist.py‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎doc/whats_new/v0.22.rst‎
Lines changed: 4 additions & 0 deletions b/‎doc/whats_new/v0.22.rst‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎sklearn/manifold/_barnes_hut_tsne.pyx‎
Lines changed: 118 additions & 90 deletions b/‎sklearn/manifold/_barnes_hut_tsne.pyx‎
Lines changed: 118 additions & 90 deletions
@@ -21,7 +21,7 @@
 from sklearn.decomposition import PCA
 from sklearn.utils import check_array
 from sklearn.utils import shuffle as _shuffle
-
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 
 LOG_DIR = "mnist_tsne_output"
 if not os.path.exists(LOG_DIR):
@@ -86,6 +86,7 @@ def sanitize(filename):
                              "preprocessing.")
     args = parser.parse_args()
 
+    print("Used number of threads: {}".format(_openmp_effective_n_threads()))
     X, y = load_data(order=args.order)
 
     if args.pca_components > 0:
@@ -141,7 +142,7 @@ def bhtsne(X):
         data_size.append(70000)
 
     results = []
-    basename, _ = os.path.splitext(__file__)
+    basename = os.path.basename(os.path.splitext(__file__)[0])
     log_filename = os.path.join(LOG_DIR, basename + '.json')
     for n in data_size:
         X_train = X[:n]
 
@@ -430,6 +430,10 @@ Changelog
   impact when ``metric="precomputed"`` or (``metric="euclidean"`` and
   ``method="exact"``). :issue:`15082` by `Roman Yurchak`_.
 
+- |Efficiency| Improved efficiency of :class:`manifold.TSNE` when
+  ``method="barnes-hut"`` by computing the gradient in parallel.
+  :pr:`13213` by :user:`Thomas Moreau <tommoral>`.
+
 - |API| Deprecate ``training_data_`` unused attribute in
   :class:`manifold.Isomap`. :issue:`10482` by `Tom Dupre la Tour`_.
 
 
@@ -9,14 +9,16 @@
 # implementations and papers describing the technique
 
 
-from libc.stdlib cimport malloc, free
-from libc.stdio cimport printf
-from libc.math cimport sqrt, log
 import numpy as np
 cimport numpy as np
+from libc.stdio cimport printf
+from libc.math cimport sqrt, log
+from libc.stdlib cimport malloc, free
+from cython.parallel cimport prange, parallel
 
 from ..neighbors._quad_tree cimport _QuadTree
 
+
 cdef char* EMPTY_STRING = ""
 
 cdef extern from "math.h":
@@ -53,17 +55,18 @@ cdef float compute_gradient(float[:] val_P,
                             int dof,
                             long start,
                             long stop,
-                            bint compute_error) nogil:
+                            bint compute_error,
+                            int num_threads) nogil:
     # Having created the tree, calculate the gradient
     # in two components, the positive and negative forces
     cdef:
         long i, coord
         int ax
         long n_samples = pos_reference.shape[0]
         int n_dimensions = qt.n_dimensions
-        double[1] sum_Q
         clock_t t1 = 0, t2 = 0
-        float sQ, error
+        double sQ
+        float error
         int take_timing = 1 if qt.verbose > 15 else 0
 
     if qt.verbose > 11:
@@ -72,25 +75,25 @@ cdef float compute_gradient(float[:] val_P,
     cdef float* neg_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
     cdef float* pos_f = <float*> malloc(sizeof(float) * n_samples * n_dimensions)
 
-    sum_Q[0] = 0.0
     if take_timing:
         t1 = clock()
-    compute_gradient_negative(pos_reference, neg_f, qt, sum_Q,
-                              dof, theta, start, stop)
+    sQ = compute_gradient_negative(pos_reference, neg_f, qt, dof, theta, start,
+                                   stop, num_threads)
     if take_timing:
         t2 = clock()
         printf("[t-SNE] Computing negative gradient: %e ticks\n", ((float) (t2 - t1)))
-    sQ = sum_Q[0]
 
     if take_timing:
         t1 = clock()
     error = compute_gradient_positive(val_P, pos_reference, neighbors, indptr,
                                       pos_f, n_dimensions, dof, sQ, start,
-                                      qt.verbose, compute_error)
+                                      qt.verbose, compute_error, num_threads)
     if take_timing:
         t2 = clock()
-        printf("[t-SNE] Computing positive gradient: %e ticks\n", ((float) (t2 - t1)))
-    for i in range(start, n_samples):
+        printf("[t-SNE] Computing positive gradient: %e ticks\n",
+               ((float) (t2 - t1)))
+    for i in prange(start, n_samples, nogil=True, num_threads=num_threads,
+                    schedule='static'):
         for ax in range(n_dimensions):
             coord = i * n_dimensions + ax
             tot_force[i, ax] = pos_f[coord] - (neg_f[coord] / sQ)
@@ -110,7 +113,8 @@ cdef float compute_gradient_positive(float[:] val_P,
                                      double sum_Q,
                                      np.int64_t start,
                                      int verbose,
-                                     bint compute_error) nogil:
+                                     bint compute_error,
+                                     int num_threads) nogil:
     # Sum over the following expression for i not equal to j
     # grad_i = p_ij (1 + ||y_i - y_j||^2)^-1 (y_i - y_j)
     # This is equivalent to compute_edge_forces in the authors' code
@@ -120,118 +124,138 @@ cdef float compute_gradient_positive(float[:] val_P,
         int ax
         long i, j, k
         long n_samples = indptr.shape[0] - 1
-        float dij, qij, pij
         float C = 0.0
+        float dij, qij, pij
         float exponent = (dof + 1.0) / 2.0
         float float_dof = (float) (dof)
-        float[3] buff
+        float* buff
         clock_t t1 = 0, t2 = 0
         float dt
 
     if verbose > 10:
         t1 = clock()
-    for i in range(start, n_samples):
-        # Init the gradient vector
-        for ax in range(n_dimensions):
-            pos_f[i * n_dimensions + ax] = 0.0
-        # Compute the positive interaction for the nearest neighbors
-        for k in range(indptr[i], indptr[i+1]):
-            j = neighbors[k]
-            dij = 0.0
-            pij = val_P[k]
-            for ax in range(n_dimensions):
-                buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
-                dij += buff[ax] * buff[ax]
-            qij = float_dof / (float_dof + dij)
-            if dof != 1:  # i.e. exponent != 1
-                qij **= exponent
-            dij = pij * qij
-
-            # only compute the error when needed
-            if compute_error:
-                qij /= sum_Q
-                C += pij * log(max(pij, FLOAT32_TINY) / max(qij, FLOAT32_TINY))
+
+    with nogil, parallel(num_threads=num_threads):
+        # Define private buffer variables
+        buff = <float *> malloc(sizeof(float) * n_dimensions)
+
+        for i in prange(start, n_samples, schedule='static'):
+            # Init the gradient vector
             for ax in range(n_dimensions):
-                pos_f[i * n_dimensions + ax] += dij * buff[ax]
+                pos_f[i * n_dimensions + ax] = 0.0
+            # Compute the positive interaction for the nearest neighbors
+            for k in range(indptr[i], indptr[i+1]):
+                j = neighbors[k]
+                dij = 0.0
+                pij = val_P[k]
+                for ax in range(n_dimensions):
+                    buff[ax] = pos_reference[i, ax] - pos_reference[j, ax]
+                    dij += buff[ax] * buff[ax]
+                qij = float_dof / (float_dof + dij)
+                if dof != 1:  # i.e. exponent != 1
+                    qij = qij ** exponent
+                dij = pij * qij
+
+                # only compute the error when needed
+                if compute_error:
+                    qij = qij / sum_Q
+                    C += pij * log(max(pij, FLOAT32_TINY) \
+                        / max(qij, FLOAT32_TINY))
+                for ax in range(n_dimensions):
+                    pos_f[i * n_dimensions + ax] += dij * buff[ax]
+
+        free(buff)
     if verbose > 10:
         t2 = clock()
         dt = ((float) (t2 - t1))
         printf("[t-SNE] Computed error=%1.4f in %1.1e ticks\n", C, dt)
     return C
 
 
-cdef void compute_gradient_negative(float[:, :] pos_reference,
-                                    float* neg_f,
-                                    _QuadTree qt,
-                                    double* sum_Q,
-                                    int dof,
-                                    float theta,
-                                    long start,
-                                    long stop) nogil:
+cdef double compute_gradient_negative(float[:, :] pos_reference,
+                                      float* neg_f,
+                                      _QuadTree qt,
+                                      int dof,
+                                      float theta,
+                                      long start,
+                                      long stop,
+                                      int num_threads) nogil:
     if stop == -1:
         stop = pos_reference.shape[0]
     cdef:
         int ax
         int n_dimensions = qt.n_dimensions
+        int offset = n_dimensions + 2
         long i, j, idx
         long n = stop - start
         long dta = 0
         long dtb = 0
-        long offset = n_dimensions + 2
         float size, dist2s, mult
         float exponent = (dof + 1.0) / 2.0
         float float_dof = (float) (dof)
-        double qijZ
-        float[1] iQ
-        float[3] force, neg_force, pos
+        double qijZ, sum_Q = 0.0
+        float* force
+        float* neg_force
+        float* pos
         clock_t t1 = 0, t2 = 0, t3 = 0
         int take_timing = 1 if qt.verbose > 20 else 0
 
-    summary = <float*> malloc(sizeof(float) * n * offset)
 
-    for i in range(start, stop):
-        # Clear the arrays
-        for ax in range(n_dimensions):
-            force[ax] = 0.0
-            neg_force[ax] = 0.0
-            pos[ax] = pos_reference[i, ax]
-        iQ[0] = 0.0
-        # Find which nodes are summarizing and collect their centers of mass
-        # deltas, and sizes, into vectorized arrays
-        if take_timing:
-            t1 = clock()
-        idx = qt.summarize(pos, summary, theta*theta)
-        if take_timing:
-            t2 = clock()
-        # Compute the t-SNE negative force
-        # for the digits dataset, walking the tree
-        # is about 10-15x more expensive than the
-        # following for loop
-        for j in range(idx // offset):
-
-            dist2s = summary[j * offset + n_dimensions]
-            size = summary[j * offset + n_dimensions + 1]
-            qijZ = float_dof / (float_dof + dist2s)  # 1/(1+dist)
-            if dof != 1:  # i.e. exponent != 1
-                qijZ **= exponent
-            sum_Q[0] += size * qijZ   # size of the node * q
-            mult = size * qijZ * qijZ
+    with nogil, parallel(num_threads=num_threads):
+        # Define thread-local buffers
+        summary = <float*> malloc(sizeof(float) * n * offset)
+        pos = <float *> malloc(sizeof(float) * n_dimensions)
+        force = <float *> malloc(sizeof(float) * n_dimensions)
+        neg_force = <float *> malloc(sizeof(float) * n_dimensions)
+
+        for i in prange(start, stop, schedule='static'):
+            # Clear the arrays
             for ax in range(n_dimensions):
-                neg_force[ax] += mult * summary[j * offset + ax]
-        if take_timing:
-            t3 = clock()
-        for ax in range(n_dimensions):
-            neg_f[i * n_dimensions + ax] = neg_force[ax]
-        if take_timing:
-            dta += t2 - t1
-            dtb += t3 - t2
+                force[ax] = 0.0
+                neg_force[ax] = 0.0
+                pos[ax] = pos_reference[i, ax]
+
+            # Find which nodes are summarizing and collect their centers of mass
+            # deltas, and sizes, into vectorized arrays
+            if take_timing:
+                t1 = clock()
+            idx = qt.summarize(pos, summary, theta*theta)
+            if take_timing:
+                t2 = clock()
+            # Compute the t-SNE negative force
+            # for the digits dataset, walking the tree
+            # is about 10-15x more expensive than the
+            # following for loop
+            for j in range(idx // offset):
+
+                dist2s = summary[j * offset + n_dimensions]
+                size = summary[j * offset + n_dimensions + 1]
+                qijZ = float_dof / (float_dof + dist2s)  # 1/(1+dist)
+                if dof != 1:  # i.e. exponent != 1
+                    qijZ = qijZ ** exponent
+
+                sum_Q += size * qijZ   # size of the node * q
+                mult = size * qijZ * qijZ
+                for ax in range(n_dimensions):
+                    neg_force[ax] += mult * summary[j * offset + ax]
+            if take_timing:
+                t3 = clock()
+            for ax in range(n_dimensions):
+                neg_f[i * n_dimensions + ax] = neg_force[ax]
+            if take_timing:
+                dta += t2 - t1
+                dtb += t3 - t2
+        free(pos)
+        free(force)
+        free(neg_force)
+        free(summary)
     if take_timing:
         printf("[t-SNE] Tree: %li clock ticks | ", dta)
         printf("Force computation: %li clock ticks\n", dtb)
 
     # Put sum_Q to machine EPSILON to avoid divisions by 0
-    sum_Q[0] = max(sum_Q[0], FLOAT64_EPS)
-    free(summary)
+    sum_Q = max(sum_Q, FLOAT64_EPS)
+    return sum_Q
 
 
 def gradient(float[:] val_P,
@@ -244,7 +268,8 @@ def gradient(float[:] val_P,
              int verbose,
              int dof=1,
              long skip_num_points=0,
-             bint compute_error=1):
+             bint compute_error=1,
+             int num_threads=1):
     # This function is designed to be called from external Python
     # it passes the 'forces' array by reference and fills that array
     # up in-place
@@ -269,8 +294,11 @@ def gradient(float[:] val_P,
         # in the generated C code that triggers error with gcc 4.9
         # and -Werror=format-security
         printf("[t-SNE] Computing gradient\n%s", EMPTY_STRING)
+
     C = compute_gradient(val_P, pos_output, neighbors, indptr, forces,
-                         qt, theta, dof, skip_num_points, -1, compute_error)
+                         qt, theta, dof, skip_num_points, -1, compute_error,
+                         num_threads)
+
     if verbose > 10:
         # XXX: format hack to workaround lack of `const char *` type
         # in the generated C code