Skip to content

Commit e86ca90

Browse files
committed
Merge pull request scikit-learn#3377 from ldirer/hashing_fix3356
Hashing fix3356
2 parents ca7b6df + f208316 commit e86ca90

File tree

1 file changed

+14
-1
lines changed

1 file changed

+14
-1
lines changed

sklearn/feature_extraction/tests/test_text.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from numpy.testing import assert_array_equal
2929
from numpy.testing import assert_raises
3030
from sklearn.utils.testing import (assert_in, assert_less, assert_greater,
31-
assert_warns_message)
31+
assert_warns_message, assert_raise_message)
3232

3333
from collections import defaultdict, Mapping
3434
from functools import partial
@@ -868,6 +868,19 @@ def test_non_unique_vocab():
868868
assert_raises(ValueError, CountVectorizer, vocabulary=vocab)
869869

870870

871+
def test_hashingvectorizer_nan_in_docs():
    """Check that HashingVectorizer rejects np.nan documents explicitly.

    Missing values in a text column loaded from a csv file with pandas
    show up as np.nan, so a nan entry among the documents must raise a
    ValueError carrying a descriptive message.
    """
    docs_with_nan = ['hello world', np.nan, 'hello hello']
    expected_message = "np.nan is an invalid document, expected byte or unicode string."

    def vectorize_docs():
        # Build the vectorizer inside the callable so that both construction
        # and fit_transform happen under the assertion helper.
        vectorizer = HashingVectorizer()
        vectorizer.fit_transform(docs_with_nan)

    assert_raise_message(ValueError, expected_message, vectorize_docs)
882+
883+
871884
def test_tfidfvectorizer_binary():
872885
# Non-regression test: TfidfVectorizer used to ignore its "binary" param.
873886
v = TfidfVectorizer(binary=True, use_idf=False, norm=None)

0 commit comments

Comments
 (0)