Skip to content

Commit 3f37cb9

Browse files
TomDLTlesteve
authored and committed
[MRG+2] convert to boolean arrays for boolean distances (scikit-learn#6932)
for example jaccard.
1 parent fe03879 commit 3f37cb9

File tree

4 files changed

+82
-15
lines changed

4 files changed

+82
-15
lines changed

doc/whats_new.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,6 @@ Enhancements
130130
(`#4294 <https://github.com/scikit-learn/scikit-learn/pull/4294>`_) by
131131
`Raghav R V`_.
132132

133-
- The random forest, extra trees and decision tree estimators now has a
134-
method ``decision_path`` which returns the decision path of samples in
135-
the tree. By `Arnaud Joly`_.
136-
137133
- The random forest, extra tree and decision tree estimators now have a
138134
method ``decision_path`` which returns the decision path of samples in
139135
the tree. By `Arnaud Joly`_.
@@ -283,6 +279,10 @@ Bug fixes
283279
with them as parameters, could not be passed to :func:`base.clone`.
284280
By `Loic Esteve`_.
285281

282+
- :func:`pairwise_distances` now converts arrays to boolean arrays when
283+
required in scipy.spatial.distance.
284+
(`#5460 <https://github.com/scikit-learn/scikit-learn/pull/5460>`_)
285+
By `Tom Dupre la Tour`_.
286286

287287
API changes summary
288288
-------------------

sklearn/metrics/pairwise.py

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ def _return_float_dtype(X, Y):
5454
return X, Y, dtype
5555

5656

57-
def check_pairwise_arrays(X, Y, precomputed=False):
57+
def check_pairwise_arrays(X, Y, precomputed=False, dtype=None):
5858
""" Set X and Y appropriately and checks inputs
5959
6060
If Y is None, it is set as a pointer to X (i.e. not a copy).
@@ -64,9 +64,9 @@ def check_pairwise_arrays(X, Y, precomputed=False):
6464
6565
Specifically, this function first ensures that both X and Y are arrays,
6666
then checks that they are at least two dimensional while ensuring that
67-
their elements are floats. Finally, the function checks that the size
68-
of the second dimension of the two arrays is equal, or the equivalent
69-
check for a precomputed distance matrix.
67+
their elements are floats (or dtype if provided). Finally, the function
68+
checks that the size of the second dimension of the two arrays is equal, or
69+
the equivalent check for a precomputed distance matrix.
7070
7171
Parameters
7272
----------
@@ -78,6 +78,12 @@ def check_pairwise_arrays(X, Y, precomputed=False):
7878
True if X is to be treated as precomputed distances to the samples in
7979
Y.
8080
81+
dtype : string, type, list of types or None (default=None)
82+
Data type required for X and Y. If None, the dtype will be an
83+
appropriate float type selected by _return_float_dtype.
84+
85+
.. versionadded:: 0.18
86+
8187
Returns
8288
-------
8389
safe_X : {array-like, sparse matrix}, shape (n_samples_a, n_features)
@@ -88,13 +94,21 @@ def check_pairwise_arrays(X, Y, precomputed=False):
8894
If Y was None, safe_Y will be a pointer to X.
8995
9096
"""
91-
X, Y, dtype = _return_float_dtype(X, Y)
97+
X, Y, dtype_float = _return_float_dtype(X, Y)
98+
99+
warn_on_dtype = dtype is not None
100+
estimator = 'check_pairwise_arrays'
101+
if dtype is None:
102+
dtype = dtype_float
92103

93104
if Y is X or Y is None:
94-
X = Y = check_array(X, accept_sparse='csr', dtype=dtype)
105+
X = Y = check_array(X, accept_sparse='csr', dtype=dtype,
106+
warn_on_dtype=warn_on_dtype, estimator=estimator)
95107
else:
96-
X = check_array(X, accept_sparse='csr', dtype=dtype)
97-
Y = check_array(Y, accept_sparse='csr', dtype=dtype)
108+
X = check_array(X, accept_sparse='csr', dtype=dtype,
109+
warn_on_dtype=warn_on_dtype, estimator=estimator)
110+
Y = check_array(Y, accept_sparse='csr', dtype=dtype,
111+
warn_on_dtype=warn_on_dtype, estimator=estimator)
98112

99113
if precomputed:
100114
if X.shape[1] != Y.shape[0]:
@@ -1208,7 +1222,11 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
12081222
if issparse(X) or issparse(Y):
12091223
raise TypeError("scipy distance metrics do not"
12101224
" support sparse matrices.")
1211-
X, Y = check_pairwise_arrays(X, Y)
1225+
1226+
dtype = bool if metric in PAIRWISE_BOOLEAN_FUNCTIONS else None
1227+
1228+
X, Y = check_pairwise_arrays(X, Y, dtype=dtype)
1229+
12121230
if n_jobs == 1 and X is Y:
12131231
return distance.squareform(distance.pdist(X, metric=metric,
12141232
**kwds))
@@ -1217,6 +1235,20 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=1, **kwds):
12171235
return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
12181236

12191237

1238+
# These distances require boolean arrays when using scipy.spatial.distance
1239+
PAIRWISE_BOOLEAN_FUNCTIONS = [
1240+
'dice',
1241+
'jaccard',
1242+
'kulsinski',
1243+
'matching',
1244+
'rogerstanimoto',
1245+
'russellrao',
1246+
'sokalmichener',
1247+
'sokalsneath',
1248+
'yule',
1249+
]
1250+
1251+
12201252
# Helper functions - distance
12211253
PAIRWISE_KERNEL_FUNCTIONS = {
12221254
# If updating this dictionary, update the doc in both distance_metrics()

sklearn/metrics/tests/test_pairwise.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from sklearn.utils.testing import assert_raises
1313
from sklearn.utils.testing import assert_raises_regexp
1414
from sklearn.utils.testing import assert_true
15+
from sklearn.utils.testing import ignore_warnings
1516

1617
from sklearn.externals.six import iteritems
1718

@@ -31,13 +32,15 @@
3132
from sklearn.metrics.pairwise import pairwise_kernels
3233
from sklearn.metrics.pairwise import PAIRWISE_KERNEL_FUNCTIONS
3334
from sklearn.metrics.pairwise import PAIRWISE_DISTANCE_FUNCTIONS
35+
from sklearn.metrics.pairwise import PAIRWISE_BOOLEAN_FUNCTIONS
3436
from sklearn.metrics.pairwise import PAIRED_DISTANCES
3537
from sklearn.metrics.pairwise import check_pairwise_arrays
3638
from sklearn.metrics.pairwise import check_paired_arrays
3739
from sklearn.metrics.pairwise import paired_distances
3840
from sklearn.metrics.pairwise import paired_euclidean_distances
3941
from sklearn.metrics.pairwise import paired_manhattan_distances
4042
from sklearn.preprocessing import normalize
43+
from sklearn.exceptions import DataConversionWarning
4144

4245

4346
def test_pairwise_distances():
@@ -115,6 +118,22 @@ def test_pairwise_distances():
115118
assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
116119

117120

121+
# ignore conversion to boolean in pairwise_distances
122+
@ignore_warnings(category=DataConversionWarning)
123+
def test_pairwise_boolean_distance():
124+
# test that we convert to boolean arrays for boolean distances
125+
rng = np.random.RandomState(0)
126+
X = rng.randn(5, 4)
127+
Y = X.copy()
128+
Y[0, 0] = 1 - Y[0, 0]
129+
130+
for metric in PAIRWISE_BOOLEAN_FUNCTIONS:
131+
for Z in [Y, None]:
132+
res = pairwise_distances(X, Z, metric=metric)
133+
res[np.isnan(res)] = 0
134+
assert_true(np.sum(res != 0) == 0)
135+
136+
118137
def test_pairwise_precomputed():
119138
for func in [pairwise_distances, pairwise_kernels]:
120139
# Test correct shape
@@ -143,7 +162,7 @@ def test_pairwise_precomputed():
143162
assert_equal('f', S.dtype.kind)
144163

145164
# Test converts list to array-like
146-
S = func([[1]], metric='precomputed')
165+
S = func([[1.]], metric='precomputed')
147166
assert_true(isinstance(S, np.ndarray))
148167

149168

@@ -188,7 +207,7 @@ def test_pairwise_callable_nonstrict_metric():
188207
# paired_distances should allow callable metric where metric(x, x) != 0
189208
# Knowing that the callable is a strict metric would allow the diagonal to
190209
# be left uncalculated and set to 0.
191-
assert_equal(pairwise_distances([[1]], metric=lambda x, y: 5)[0, 0], 5)
210+
assert_equal(pairwise_distances([[1.]], metric=lambda x, y: 5)[0, 0], 5)
192211

193212

194213
def callable_rbf_kernel(x, y, **kwds):

sklearn/neighbors/tests/test_neighbors.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from sklearn.utils.validation import check_random_state
1818
from sklearn.metrics.pairwise import pairwise_distances
1919
from sklearn import neighbors, datasets
20+
from sklearn.exceptions import DataConversionWarning
2021

2122
rng = np.random.RandomState(0)
2223
# load and shuffle iris dataset
@@ -1200,3 +1201,18 @@ def test_dtype_convert():
12001201

12011202
result = classifier.fit(X, y).predict(X)
12021203
assert_array_equal(result, y)
1204+
1205+
1206+
# ignore conversion to boolean in pairwise_distances
1207+
@ignore_warnings(category=DataConversionWarning)
1208+
def test_pairwise_boolean_distance():
1209+
# Non-regression test for #4523
1210+
# 'brute': uses scipy.spatial.distance through pairwise_distances
1211+
# 'ball_tree': uses sklearn.neighbors.dist_metrics
1212+
rng = np.random.RandomState(0)
1213+
X = rng.uniform(size=(6, 5))
1214+
NN = neighbors.NearestNeighbors
1215+
1216+
nn1 = NN(metric="jaccard", algorithm='brute').fit(X)
1217+
nn2 = NN(metric="jaccard", algorithm='ball_tree').fit(X)
1218+
assert_array_equal(nn1.kneighbors(X)[0], nn2.kneighbors(X)[0])

0 commit comments

Comments
 (0)