segmond
diff --git a/sklearn/feature_extraction/tests/test_text.py b/sklearn/feature_extraction/tests/test_text.py
Lines changed: 14 additions & 1 deletion
@@ -28,7 +28,7 @@
 from numpy.testing import assert_array_equal
 from numpy.testing import assert_raises
 from sklearn.utils.testing import (assert_in, assert_less, assert_greater,
-                                   assert_warns_message)
+                                   assert_warns_message, assert_raise_message)
 
 from collections import defaultdict, Mapping
 from functools import partial
@@ -868,6 +868,19 @@ def test_non_unique_vocab():
     assert_raises(ValueError, CountVectorizer, vocabulary=vocab)
 
 
+def test_hashingvectorizer_nan_in_docs():
+    # A np.nan document (e.g. a missing text field in a CSV loaded by pandas)
+    # is expected to raise a ValueError carrying an explicit error message.
+    message = "np.nan is an invalid document, expected byte or unicode string."
+    exception = ValueError
+
+    def func():
+        hv = HashingVectorizer()
+        hv.fit_transform(['hello world', np.nan, 'hello hello'])
+
+    assert_raise_message(exception, message, func)
+
+
 def test_tfidfvectorizer_binary():
     # Non-regression test: TfidfVectorizer used to ignore its "binary" param.
     v = TfidfVectorizer(binary=True, use_idf=False, norm=None)