Skip to content

Commit dcf827e

Browse files
committed
Merge pull request scikit-learn#2368 from emsrc/cosine_distance
Cosine distance metric for sparse matrices
2 parents 41d0fb8 + fc91c33 commit dcf827e

File tree

2 files changed

+55
-14
lines changed

2 files changed

+55
-14
lines changed

sklearn/metrics/pairwise.py

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,37 @@ def manhattan_distances(X, Y=None, sum_over_features=True,
271271
return D
272272

273273

274+
def cosine_distances(X, Y=None):
    """Compute cosine distance between samples in X and Y.

    Cosine distance is defined as 1.0 minus the cosine similarity.

    Parameters
    ----------
    X : array_like, sparse matrix
        with shape (n_samples_X, n_features).

    Y : array_like, sparse matrix (optional)
        with shape (n_samples_Y, n_features).

    Returns
    -------
    distance matrix : array_like
        An array with shape (n_samples_X, n_samples_Y). Values are clipped
        to the interval [0, 2], the mathematical range of cosine distance.

    See also
    --------
    sklearn.metrics.pairwise.cosine_similarity
    scipy.spatial.distance.cosine (dense matrices only)
    """
    # 1.0 - cosine_similarity(X, Y) without copy
    S = cosine_similarity(X, Y)
    S *= -1
    S += 1
    # Floating-point rounding in the similarity can push a distance slightly
    # below 0 (e.g. -2e-16 for identical samples) or above 2. Negative
    # "distances" break downstream consumers, so clamp in place. S is
    # already a dense array here: the in-place ``+= 1`` above requires it.
    S.clip(0, 2, out=S)
    return S
303+
304+
274305
# Kernels
275306
def linear_kernel(X, Y=None):
276307
"""
@@ -525,11 +556,12 @@ def chi2_kernel(X, Y=None, gamma=1.):
525556
PAIRWISE_DISTANCE_FUNCTIONS = {
526557
# If updating this dictionary, update the doc in both distance_metrics()
527558
# and also in pairwise_distances()!
559+
'cityblock': manhattan_distances,
560+
'cosine': cosine_distances,
528561
'euclidean': euclidean_distances,
529562
'l2': euclidean_distances,
530563
'l1': manhattan_distances,
531-
'manhattan': manhattan_distances,
532-
'cityblock': manhattan_distances, }
564+
'manhattan': manhattan_distances, }
533565

534566

535567
def distance_metrics():
@@ -545,6 +577,7 @@ def distance_metrics():
545577
metric Function
546578
============ ====================================
547579
'cityblock' metrics.pairwise.manhattan_distances
580+
'cosine' metrics.pairwise.cosine_distances
548581
'euclidean' metrics.pairwise.euclidean_distances
549582
'l1' metrics.pairwise.manhattan_distances
550583
'l2' metrics.pairwise.euclidean_distances
@@ -585,25 +618,27 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
585618
If Y is given (default is None), then the returned matrix is the pairwise
586619
distance between the arrays from both X and Y.
587620
588-
Please note that support for sparse matrices is currently limited to those
589-
metrics listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS.
621+
Please note that support for sparse matrices is currently limited to
622+
'euclidean', 'l2' and 'cosine'.
590623
591624
Valid values for metric are:
592625
593-
- from scikit-learn: ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock']
626+
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
627+
'manhattan']
594628
595629
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
596-
'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski',
597-
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao',
598-
'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
630+
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis',
631+
'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
632+
'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
599633
See the documentation for scipy.spatial.distance for details on these
600634
metrics.
601635
602-
Note in the case of 'euclidean' and 'cityblock' (which are valid
603-
scipy.spatial.distance metrics), the values will use the scikit-learn
604-
implementation, which is faster and has support for sparse matrices.
605-
For a verbose description of the metrics from scikit-learn, see the
606-
__doc__ of the sklearn.pairwise.distance_metrics function.
636+
Note that in the case of 'cityblock', 'cosine' and 'euclidean' (which are
637+
valid scipy.spatial.distance metrics), the scikit-learn implementation
638+
will be used, which is faster and has support for sparse matrices (except
639+
for 'cityblock'). For a verbose description of the metrics from
640+
scikit-learn, see the __doc__ of the
641+
sklearn.metrics.pairwise.distance_metrics function.
607642
608643
Parameters
609644
----------

sklearn/metrics/tests/test_pairwise.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from sklearn.metrics.pairwise import rbf_kernel
2121
from sklearn.metrics.pairwise import sigmoid_kernel
2222
from sklearn.metrics.pairwise import cosine_similarity
23+
from sklearn.metrics.pairwise import cosine_distances
2324
from sklearn.metrics.pairwise import pairwise_distances
2425
from sklearn.metrics.pairwise import pairwise_kernels
2526
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
@@ -66,6 +67,7 @@ def test_pairwise_distances():
6667
S3 = manhattan_distances(X, Y, size_threshold=10)
6768
assert_array_almost_equal(S, S3)
6869
# Test cosine as a string metric versus cosine callable
70+
# "cosine" uses sklearn metric, cosine (function) is scipy.spatial
6971
S = pairwise_distances(X, Y, metric="cosine")
7072
S2 = pairwise_distances(X, Y, metric=cosine)
7173
assert_equal(S.shape[0], X.shape[0])
@@ -75,12 +77,16 @@ def test_pairwise_distances():
7577
S = np.dot(X, X.T)
7678
S2 = pairwise_distances(S, metric="precomputed")
7779
assert_true(S is S2)
78-
# Test with sparse X and Y
80+
# Test with sparse X and Y,
81+
# currently only supported for euclidean and cosine
7982
X_sparse = csr_matrix(X)
8083
Y_sparse = csr_matrix(Y)
8184
S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
8285
S2 = euclidean_distances(X_sparse, Y_sparse)
8386
assert_array_almost_equal(S, S2)
87+
S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
88+
S2 = cosine_distances(X_sparse, Y_sparse)
89+
assert_array_almost_equal(S, S2)
8490
# Test with scipy.spatial.distance metric, with a kwd
8591
kwds = {"p": 2.0}
8692
S = pairwise_distances(X, Y, metric="minkowski", **kwds)

0 commit comments

Comments
 (0)