From ce37867475131c275661ecf67dc05aa1aa33d3ad Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 18:46:45 +0100 Subject: [PATCH 1/8] Replace Vectors.most_similar with annoy --- requirements.txt | 1 + sense2vec/sense2vec.py | 42 ++++++++++++++++++++++++++--------------- sense2vec/util.py | 17 +++++++++++++++++ setup.cfg | 1 + tests/test_sense2vec.py | 1 + 5 files changed, 47 insertions(+), 15 deletions(-) diff --git a/requirements.txt b/requirements.txt index a7827ff..2d640ca 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ spacy>=2.2.2,<3.0.0 srsly>=0.2.0 catalogue>=0.0.4 # Third-party dependencies +annoy>=1.16.0,<2.0.0 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" # Development requirements diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 93127ec..cd4d77c 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -2,10 +2,11 @@ from pathlib import Path from spacy.vectors import Vectors from spacy.strings import StringStore +from annoy import AnnoyIndex import numpy import srsly -from .util import registry, SimpleFrozenDict +from .util import registry, SimpleFrozenDict, get_similarity class Sense2Vec(object): @@ -29,8 +30,10 @@ def __init__( registered via the registry, e.g. {"make_key": "custom_make_key"}. RETURNS (Sense2Vec): The newly constructed object. """ + self._index_metric = "euclidean" self.vectors = Vectors(shape=shape, name=vectors_name) self.strings = StringStore() if strings is None else strings + self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric) self.freqs: Dict[int, int] = {} self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"} self.cfg.update(overrides) @@ -171,13 +174,7 @@ def similarity( keys_b = [keys_b] average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0) average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0) - if average_a.all() == 0 or average_b.all() == 0: - return 0.0 - norm_a = numpy.linalg.norm(average_a) - norm_b = numpy.linalg.norm(average_b) - if norm_a == norm_b: - return 1.0 - return numpy.dot(average_a, average_b) / (norm_a * norm_b) + return get_similarity(average_a, average_b) def most_similar( self, @@ -203,13 +200,14 @@ def most_similar( if len(self.vectors) < n_similar: n_similar = len(self.vectors) vecs = numpy.vstack([self[key] for key in keys]) - average = vecs.mean(axis=0, keepdims=True) - result_keys, _, scores = self.vectors.most_similar( - average, n=n_similar, batch_size=batch_size - ) - result = list(zip(result_keys.flatten(), scores.flatten())) - result = [(self.strings[key], score) for key, score in result if key] - result = [(key, score) for key, score in result if key not in keys] + average = vecs.mean(axis=0, keepdims=False) + nns = self.index.get_nns_by_vector(average, n_similar, include_distances=True) + result = [] + for row, distance in zip(*nns): + key = self.strings[self.vectors.find(row=row)[0]] + if key not in keys: + score = 1.0 if distance == 0.0 else get_similarity(average, self[key]) + result.append((key, score)) return result def get_other_senses( @@ -258,6 +256,18 @@ def get_best_sense( freqs.append((freq, key)) return max(freqs)[1] if freqs else None + def build_index(self): + """Build an AnnoyIndex from the vectors. Used for fast calculation of + the approximate nearest neighbors in Sense2Vec.most_similar. This + method should be called after modifying the vectors. + """ + self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric) + for key, vector in self.vectors.items(): + # The key ints are too big so use the row for annoy + row = self.vectors.find(key=key) + self.index.add_item(row, vector) + self.index.build(100) + def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: """Serialize a Sense2Vec object to a bytestring. @@ -284,6 +294,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: self.strings = StringStore().from_bytes(data["strings"]) + self.build_index() return self def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): @@ -315,4 +326,5 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) + self.build_index() return self diff --git a/sense2vec/util.py b/sense2vec/util.py index 496a7b4..4a653ed 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -3,6 +3,7 @@ from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans import catalogue +import numpy try: import importlib.metadata as importlib_metadata # Python 3.8 @@ -167,6 +168,22 @@ def merge_phrases(doc: Doc) -> Doc: return doc +def get_similarity(vec_a: numpy.ndarray, vec_b: numpy.ndarray) -> float: + """Calculate the similarity of two vectors. + + vec_a (numpy.ndarray): The vector. + vec_b (numpy.ndarray): The other vector. + RETURNS (float): The similarity score. + """ + if vec_a.all() == 0 or vec_b.all() == 0: + return 0.0 + norm_a = numpy.linalg.norm(vec_a) + norm_b = numpy.linalg.norm(vec_b) + if norm_a == norm_b: + return 1.0 + return numpy.dot(vec_a, vec_b) / (norm_a * norm_b) + + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default function or method argument (for arguments that should default to empty diff --git a/setup.cfg b/setup.cfg index 2a11bf0..06f2da1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,6 +31,7 @@ install_requires = srsly>=0.2.0 catalogue>=0.0.4 wasabi>=0.4.0,<1.1.0 + annoy>=1.16.0,<2.0.0 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index fbf4657..340bf25 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -89,6 +89,7 @@ def test_sense2vec_most_similar(): s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32)) s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32)) + s2v.build_index() result1 = s2v.most_similar(["x"], n=2) assert len(result1) == 2 assert result1[0][0] == "a" From 190beacda266ebb0f1668cea1533ef354fbbc384 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 19:49:28 +0100 Subject: [PATCH 2/8] Make index building optional --- sense2vec/sense2vec.py | 50 ++++++++++++++++++++++++----------------- tests/test_sense2vec.py | 6 +++-- 2 files changed, 34 insertions(+), 22 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index cd4d77c..1be993c 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -30,10 +30,9 @@ def __init__( registered via the registry, e.g. {"make_key": "custom_make_key"}. RETURNS (Sense2Vec): The newly constructed object. """ - self._index_metric = "euclidean" self.vectors = Vectors(shape=shape, name=vectors_name) self.strings = StringStore() if strings is None else strings - self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric) + self.index = None self.freqs: Dict[int, int] = {} self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"} self.cfg.update(overrides) @@ -183,7 +182,8 @@ def most_similar( batch_size: int = 16, ) -> List[Tuple[str, float]]: """Get the most similar entries in the table. If more than one key is - provided, the average of the vectors is used. + provided, the average of the vectors is used. To make this faster, + you can run Sense2Vec.build_index, which uses the annoy library. keys (unicode / int / iterable): The string or integer key(s) to compare to. n (int): The number of similar keys to return. @@ -200,15 +200,24 @@ def most_similar( if len(self.vectors) < n_similar: n_similar = len(self.vectors) vecs = numpy.vstack([self[key] for key in keys]) - average = vecs.mean(axis=0, keepdims=False) - nns = self.index.get_nns_by_vector(average, n_similar, include_distances=True) - result = [] - for row, distance in zip(*nns): - key = self.strings[self.vectors.find(row=row)[0]] - if key not in keys: - score = 1.0 if distance == 0.0 else get_similarity(average, self[key]) - result.append((key, score)) - return result + if self.index is None: # use the less efficient default way + avg = vecs.mean(axis=0, keepdims=True) + result_keys, _, scores = self.vectors.most_similar( + avg, n=n_similar, batch_size=batch_size + ) + result = list(zip(result_keys.flatten(), scores.flatten())) + result = [(self.strings[key], score) for key, score in result if key] + return [(key, score) for key, score in result if key not in keys] + else: # index is built, use annoy + avg = vecs.mean(axis=0, keepdims=False) + nns = self.index.get_nns_by_vector(avg, n_similar, include_distances=True) + result = [] + for row, dist in zip(*nns): + key = self.strings[self.vectors.find(row=row)[0]] + if key not in keys: + score = 1.0 if dist == 0.0 else get_similarity(avg, self[key]) + result.append((key, score)) + return result def get_other_senses( self, key: Union[str, int], ignore_case: bool = True @@ -256,17 +265,20 @@ def get_best_sense( freqs.append((freq, key)) return max(freqs)[1] if freqs else None - def build_index(self): - """Build an AnnoyIndex from the vectors. Used for fast calculation of - the approximate nearest neighbors in Sense2Vec.most_similar. This - method should be called after modifying the vectors. + def build_index(self, metric: str = "euclidean", n_trees: int = 100): + """Build an AnnoyIndex from the vectors. Used for faster calculation of + the approximate nearest neighbors in Sense2Vec.most_similar. See the + annoy docs for more details: https://github.com/spotify/annoy + + metric (unicode): The metric to use. + n_trees (int): The number of trees to build. """ - self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric) + self.index = AnnoyIndex(self.vectors.shape[1], metric) for key, vector in self.vectors.items(): # The key ints are too big so use the row for annoy row = self.vectors.find(key=key) self.index.add_item(row, vector) - self.index.build(100) + self.index.build(n_trees) def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: """Serialize a Sense2Vec object to a bytestring. @@ -294,7 +306,6 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: self.strings = StringStore().from_bytes(data["strings"]) - self.build_index() return self def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): @@ -326,5 +337,4 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) - self.build_index() return self diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 340bf25..969a62e 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -81,7 +81,8 @@ def test_sense2vec_similarity(): assert s2v.similarity("a", "e") == 0.0 -def test_sense2vec_most_similar(): +@pytest.mark.parametrize("build_index", [True, False]) +def test_sense2vec_most_similar(build_index): s2v = Sense2Vec(shape=(6, 4)) s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32)) @@ -89,7 +90,8 @@ def test_sense2vec_most_similar(): s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32)) s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32)) - s2v.build_index() + if build_index: + s2v.build_index() result1 = s2v.most_similar(["x"], n=2) assert len(result1) == 2 assert result1[0][0] == "a" From 3d8f5a701764016ce13214903bd81f8a875d3c83 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 19:49:31 +0100 Subject: [PATCH 3/8] Update README.md --- README.md | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a9c76bf..2aad471 100644 --- a/README.md +++ b/README.md @@ -417,7 +417,11 @@ assert s2v.similarity("machine_learning|NOUN", "machine_learning|NOUN") == 1.0 #### method `Sense2Vec.most_similar` Get the most similar entries in the table. If more than one key is provided, the -average of the vectors is used. +average of the vectors is used. To make this faster, you can run +`Sense2Vec.build_index`, which uses the +[annoy](https://github.com/spotify/annoy) library to build an index of the +vectors. This will make the initial load time slower, but will speed up the most +similar calculations significantly. | Argument | Type | Description | | ------------ | ------------------------- | ------------------------------------------------------- | @@ -466,6 +470,17 @@ assert s2v.get_best_sense("duck") == "duck|NOUN" assert s2v.get_best_sense("duck", ["VERB", "ADJ"]) == "duck|VERB" ``` +#### method `Sense2Vec.build_index` + +Build an `AnnoyIndex` from the vectors. Used for faster calculation of the +approximate nearest neighbors in `Sense2Vec.most_similar`. See the +[`annoy` docs](https://github.com/spotify/annoy) for more details. + +| Argument | Type | Description | +| --------- | ------- | ------------------------------------------------------------------------------------------------- | +| `metric` | unicode | The [metric](https://github.com/spotify/annoy#full-python-api) to use. Defaults to `"euclidean"`. | +| `n_trees` | int | The number of trees to build. Defaults to `100`. | + #### method `Sense2Vec.to_bytes` Serialize a `Sense2Vec` object to a bytestring. From fb8a9930eaf05ac1cd14a9c96336320f6b14c3bd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 19:51:46 +0100 Subject: [PATCH 4/8] Make test more robust --- tests/test_sense2vec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 969a62e..68ae168 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -98,9 +98,9 @@ def test_sense2vec_most_similar(build_index): assert result1[0][1] == 1.0 assert result1[0][1] == pytest.approx(1.0) assert result1[1][0] == "b" - result2 = s2v.most_similar(["a", "x"], n=2) - assert len(result2) == 2 - assert sorted([key for key, _ in result2]) == ["b", "d"] + result2 = s2v.most_similar(["a", "x"], n=3) + assert len(result2) == 3 + assert sorted([key for key, _ in result2]) == ["b", "c", "d"] result3 = s2v.most_similar(["a", "b"], n=3) assert len(result3) == 3 assert "y" not in [key for key, _ in result3] From b036e0e0f2265d4fe376b7518cda62d69d124900 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 19:54:00 +0100 Subject: [PATCH 5/8] Update README.md [ci skip] --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 2aad471..c6a7b29 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ models. - spaCy **pipeline component** and **extension attributes**. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. +- Use [`annoy`](https://github.com/spotify/annoy) to build an index for super + fast approximate calculations of most similar vectors. - **Train your own vectors** using a pretrained spaCy model, raw text and [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via [fastText](https://github.com/facebookresearch/fastText) From c3583e6903d26a73a0e756bc3c74b6b505f713f5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 19:54:37 +0100 Subject: [PATCH 6/8] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c6a7b29..0db952f 100644 --- a/README.md +++ b/README.md @@ -27,8 +27,8 @@ models. - spaCy **pipeline component** and **extension attributes**. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. -- Use [`annoy`](https://github.com/spotify/annoy) to build an index for super - fast approximate calculations of most similar vectors. +- Use [`annoy`](https://github.com/spotify/annoy) to build an index for **super + fast approximate calculations** of most similar vectors. - **Train your own vectors** using a pretrained spaCy model, raw text and [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via [fastText](https://github.com/facebookresearch/fastText) From f99154dfcaae83302827583d389e0a2db2225b00 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 20:10:44 +0100 Subject: [PATCH 7/8] Try downgrading annoy --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 2d640ca..5a7ccbf 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy>=2.2.2,<3.0.0 srsly>=0.2.0 catalogue>=0.0.4 # Third-party dependencies -annoy>=1.16.0,<2.0.0 +annoy==1.15.2 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" # Development requirements diff --git a/setup.cfg b/setup.cfg index 06f2da1..4fc73cb 100644 --- a/setup.cfg +++ b/setup.cfg @@ -31,7 +31,7 @@ install_requires = srsly>=0.2.0 catalogue>=0.0.4 wasabi>=0.4.0,<1.1.0 - annoy>=1.16.0,<2.0.0 + annoy==1.15.2 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" From 836a6235a6b92013651fbab4c8ee98f0744764bf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Nov 2019 00:31:54 +0100 Subject: [PATCH 8/8] Add serialization [ci skip] --- sense2vec/sense2vec.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 1be993c..d9d7c93 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -34,7 +34,12 @@ def __init__( self.strings = StringStore() if strings is None else strings self.index = None self.freqs: Dict[int, int] = {} - self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"} + self.cfg = { + "senses": senses, + "annoy_metric": "euclidean", + "make_key": "default", + "split_key": "default", + } self.cfg.update(overrides) @property @@ -273,6 +278,7 @@ def build_index(self, metric: str = "euclidean", n_trees: int = 100): metric (unicode): The metric to use. n_trees (int): The number of trees to build. """ + self.cfg["annoy_metric"] = metric self.index = AnnoyIndex(self.vectors.shape[1], metric) for key, vector in self.vectors.items(): # The key ints are too big so use the row for annoy @@ -320,6 +326,8 @@ def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): srsly.write_json(path / "freqs.json", list(self.freqs.items())) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") + if "index" not in exclude and self.index is not None: + self.index.save(str(path / "index.ann")) def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a directory. @@ -330,6 +338,7 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """ path = Path(path) strings_path = path / "strings.json" + index_path = path / "index.ann" freqs_path = path / "freqs.json" self.vectors = Vectors().from_disk(path) self.cfg.update(srsly.read_json(path / "cfg")) @@ -337,4 +346,7 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) + if "index" not in exclude and index_path.exists(): + self.index = AnnoyIndex(self.vectors.shape[1], self.cfg["annoy_metric"]) + self.index.load(str(index_path)) return self