From ce37867475131c275661ecf67dc05aa1aa33d3ad Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 18:46:45 +0100
Subject: [PATCH 1/8] Replace Vectors.most_similar with annoy

---
 requirements.txt        |  1 +
 sense2vec/sense2vec.py  | 42 ++++++++++++++++++++++++++---------------
 sense2vec/util.py       | 17 +++++++++++++++++
 setup.cfg               |  1 +
 tests/test_sense2vec.py |  1 +
 5 files changed, 47 insertions(+), 15 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index a7827ff..2d640ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,7 @@ spacy>=2.2.2,<3.0.0
 srsly>=0.2.0
 catalogue>=0.0.4
 # Third-party dependencies
+annoy>=1.16.0,<2.0.0
 numpy>=1.15.0
 importlib_metadata>=0.20; python_version < "3.8"
 # Development requirements
diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py
index 93127ec..cd4d77c 100644
--- a/sense2vec/sense2vec.py
+++ b/sense2vec/sense2vec.py
@@ -2,10 +2,11 @@
 from pathlib import Path
 from spacy.vectors import Vectors
 from spacy.strings import StringStore
+from annoy import AnnoyIndex
 import numpy
 import srsly
 
-from .util import registry, SimpleFrozenDict
+from .util import registry, SimpleFrozenDict, get_similarity
 
 
 class Sense2Vec(object):
@@ -29,8 +30,10 @@ def __init__(
             registered via the registry, e.g. {"make_key": "custom_make_key"}.
         RETURNS (Sense2Vec): The newly constructed object.
         """
+        self._index_metric = "euclidean"
         self.vectors = Vectors(shape=shape, name=vectors_name)
         self.strings = StringStore() if strings is None else strings
+        self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric)
         self.freqs: Dict[int, int] = {}
         self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"}
         self.cfg.update(overrides)
@@ -171,13 +174,7 @@ def similarity(
             keys_b = [keys_b]
         average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0)
         average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0)
-        if average_a.all() == 0 or average_b.all() == 0:
-            return 0.0
-        norm_a = numpy.linalg.norm(average_a)
-        norm_b = numpy.linalg.norm(average_b)
-        if norm_a == norm_b:
-            return 1.0
-        return numpy.dot(average_a, average_b) / (norm_a * norm_b)
+        return get_similarity(average_a, average_b)
 
     def most_similar(
         self,
@@ -203,13 +200,14 @@ def most_similar(
         if len(self.vectors) < n_similar:
             n_similar = len(self.vectors)
         vecs = numpy.vstack([self[key] for key in keys])
-        average = vecs.mean(axis=0, keepdims=True)
-        result_keys, _, scores = self.vectors.most_similar(
-            average, n=n_similar, batch_size=batch_size
-        )
-        result = list(zip(result_keys.flatten(), scores.flatten()))
-        result = [(self.strings[key], score) for key, score in result if key]
-        result = [(key, score) for key, score in result if key not in keys]
+        average = vecs.mean(axis=0, keepdims=False)
+        nns = self.index.get_nns_by_vector(average, n_similar, include_distances=True)
+        result = []
+        for row, distance in zip(*nns):
+            key = self.strings[self.vectors.find(row=row)[0]]
+            if key not in keys:
+                score = 1.0 if distance == 0.0 else get_similarity(average, self[key])
+                result.append((key, score))
         return result
 
     def get_other_senses(
@@ -258,6 +256,18 @@ def get_best_sense(
                     freqs.append((freq, key))
         return max(freqs)[1] if freqs else None
 
+    def build_index(self):
+        """Build an AnnoyIndex from the vectors. Used for fast calculation of
+        the approximate nearest neighbors in Sense2Vec.most_similar. This
+        method should be called after modifying the vectors.
+        """
+        self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric)
+        for key, vector in self.vectors.items():
+            # The key ints are too big so use the row for annoy
+            row = self.vectors.find(key=key)
+            self.index.add_item(row, vector)
+        self.index.build(100)
+
     def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes:
         """Serialize a Sense2Vec object to a bytestring.
 
@@ -284,6 +294,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
         self.cfg.update(data.get("cfg", {}))
         if "strings" not in exclude and "strings" in data:
             self.strings = StringStore().from_bytes(data["strings"])
+        self.build_index()
         return self
 
     def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
@@ -315,4 +326,5 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
             self.freqs = dict(srsly.read_json(freqs_path))
         if "strings" not in exclude and strings_path.exists():
             self.strings = StringStore().from_disk(strings_path)
+        self.build_index()
         return self
diff --git a/sense2vec/util.py b/sense2vec/util.py
index 496a7b4..4a653ed 100644
--- a/sense2vec/util.py
+++ b/sense2vec/util.py
@@ -3,6 +3,7 @@
 from spacy.tokens import Doc, Token, Span
 from spacy.util import filter_spans
 import catalogue
+import numpy
 
 try:
     import importlib.metadata as importlib_metadata  # Python 3.8
@@ -167,6 +168,22 @@ def merge_phrases(doc: Doc) -> Doc:
     return doc
 
 
+def get_similarity(vec_a: numpy.ndarray, vec_b: numpy.ndarray) -> float:
+    """Calculate the cosine similarity of two vectors.
+
+    vec_a (numpy.ndarray): The vector.
+    vec_b (numpy.ndarray): The other vector.
+    RETURNS (float): The similarity, 0.0 if either vector is all zeros.
+    """
+    if not vec_a.any() or not vec_b.any():
+        return 0.0
+    if numpy.array_equal(vec_a, vec_b):  # exact 1.0, no float rounding
+        return 1.0
+    norm_a = numpy.linalg.norm(vec_a)
+    norm_b = numpy.linalg.norm(vec_b)
+    return numpy.dot(vec_a, vec_b) / (norm_a * norm_b)
+
+
 class SimpleFrozenDict(dict):
     """Simplified implementation of a frozen dict, mainly used as default
     function or method argument (for arguments that should default to empty
diff --git a/setup.cfg b/setup.cfg
index 2a11bf0..06f2da1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,6 +31,7 @@ install_requires =
     srsly>=0.2.0
     catalogue>=0.0.4
     wasabi>=0.4.0,<1.1.0
+    annoy>=1.16.0,<2.0.0
     numpy>=1.15.0
     importlib_metadata>=0.20; python_version < "3.8"
 
diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py
index fbf4657..340bf25 100644
--- a/tests/test_sense2vec.py
+++ b/tests/test_sense2vec.py
@@ -89,6 +89,7 @@ def test_sense2vec_most_similar():
     s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
     s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
     s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
+    s2v.build_index()
     result1 = s2v.most_similar(["x"], n=2)
     assert len(result1) == 2
     assert result1[0][0] == "a"

From 190beacda266ebb0f1668cea1533ef354fbbc384 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 19:49:28 +0100
Subject: [PATCH 2/8] Make index building optional

---
 sense2vec/sense2vec.py  | 50 ++++++++++++++++++++++++-----------------
 tests/test_sense2vec.py |  6 +++--
 2 files changed, 34 insertions(+), 22 deletions(-)

diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py
index cd4d77c..1be993c 100644
--- a/sense2vec/sense2vec.py
+++ b/sense2vec/sense2vec.py
@@ -30,10 +30,9 @@ def __init__(
             registered via the registry, e.g. {"make_key": "custom_make_key"}.
         RETURNS (Sense2Vec): The newly constructed object.
         """
-        self._index_metric = "euclidean"
         self.vectors = Vectors(shape=shape, name=vectors_name)
         self.strings = StringStore() if strings is None else strings
-        self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric)
+        self.index = None
         self.freqs: Dict[int, int] = {}
         self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"}
         self.cfg.update(overrides)
@@ -183,7 +182,8 @@ def most_similar(
         batch_size: int = 16,
     ) -> List[Tuple[str, float]]:
         """Get the most similar entries in the table. If more than one key is
-        provided, the average of the vectors is used.
+        provided, the average of the vectors is used. To make this faster,
+        you can run Sense2Vec.build_index, which uses the annoy library.
 
         keys (unicode / int / iterable): The string or integer key(s) to compare to.
         n (int): The number of similar keys to return.
@@ -200,15 +200,24 @@ def most_similar(
         if len(self.vectors) < n_similar:
             n_similar = len(self.vectors)
         vecs = numpy.vstack([self[key] for key in keys])
-        average = vecs.mean(axis=0, keepdims=False)
-        nns = self.index.get_nns_by_vector(average, n_similar, include_distances=True)
-        result = []
-        for row, distance in zip(*nns):
-            key = self.strings[self.vectors.find(row=row)[0]]
-            if key not in keys:
-                score = 1.0 if distance == 0.0 else get_similarity(average, self[key])
-                result.append((key, score))
-        return result
+        if self.index is None:  # use the less efficient default way
+            avg = vecs.mean(axis=0, keepdims=True)
+            result_keys, _, scores = self.vectors.most_similar(
+                avg, n=n_similar, batch_size=batch_size
+            )
+            result = list(zip(result_keys.flatten(), scores.flatten()))
+            result = [(self.strings[key], score) for key, score in result if key]
+            return [(key, score) for key, score in result if key not in keys]
+        else:  # index is built, use annoy
+            avg = vecs.mean(axis=0, keepdims=False)
+            nns = self.index.get_nns_by_vector(avg, n_similar, include_distances=True)
+            result = []
+            for row, dist in zip(*nns):
+                key = self.strings[self.vectors.find(row=row)[0]]
+                if key not in keys:
+                    score = 1.0 if dist == 0.0 else get_similarity(avg, self[key])
+                    result.append((key, score))
+            return result
 
     def get_other_senses(
         self, key: Union[str, int], ignore_case: bool = True
@@ -256,17 +265,20 @@ def get_best_sense(
                     freqs.append((freq, key))
         return max(freqs)[1] if freqs else None
 
-    def build_index(self):
-        """Build an AnnoyIndex from the vectors. Used for fast calculation of
-        the approximate nearest neighbors in Sense2Vec.most_similar. This
-        method should be called after modifying the vectors.
+    def build_index(self, metric: str = "euclidean", n_trees: int = 100):
+        """Build an AnnoyIndex from the vectors. Used for faster calculation of
+        the approximate nearest neighbors in Sense2Vec.most_similar. See the
+        annoy docs for more details: https://github.com/spotify/annoy
+
+        metric (unicode): The metric to use.
+        n_trees (int): The number of trees to build.
         """
-        self.index = AnnoyIndex(self.vectors.shape[1], self._index_metric)
+        self.index = AnnoyIndex(self.vectors.shape[1], metric)
         for key, vector in self.vectors.items():
             # The key ints are too big so use the row for annoy
             row = self.vectors.find(key=key)
             self.index.add_item(row, vector)
-        self.index.build(100)
+        self.index.build(n_trees)
 
     def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes:
         """Serialize a Sense2Vec object to a bytestring.
@@ -294,7 +306,6 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()):
         self.cfg.update(data.get("cfg", {}))
         if "strings" not in exclude and "strings" in data:
             self.strings = StringStore().from_bytes(data["strings"])
-        self.build_index()
         return self
 
     def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
@@ -326,5 +337,4 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
             self.freqs = dict(srsly.read_json(freqs_path))
         if "strings" not in exclude and strings_path.exists():
             self.strings = StringStore().from_disk(strings_path)
-        self.build_index()
         return self
diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py
index 340bf25..969a62e 100644
--- a/tests/test_sense2vec.py
+++ b/tests/test_sense2vec.py
@@ -81,7 +81,8 @@ def test_sense2vec_similarity():
     assert s2v.similarity("a", "e") == 0.0
 
 
-def test_sense2vec_most_similar():
+@pytest.mark.parametrize("build_index", [True, False])
+def test_sense2vec_most_similar(build_index):
     s2v = Sense2Vec(shape=(6, 4))
     s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
     s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
@@ -89,7 +90,8 @@ def test_sense2vec_most_similar():
     s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
     s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
     s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
-    s2v.build_index()
+    if build_index:
+        s2v.build_index()
     result1 = s2v.most_similar(["x"], n=2)
     assert len(result1) == 2
     assert result1[0][0] == "a"

From 3d8f5a701764016ce13214903bd81f8a875d3c83 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 19:49:31 +0100
Subject: [PATCH 3/8] Update README.md

---
 README.md | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a9c76bf..2aad471 100644
--- a/README.md
+++ b/README.md
@@ -417,7 +417,11 @@ assert s2v.similarity("machine_learning|NOUN", "machine_learning|NOUN") == 1.0
 #### <kbd>method</kbd> `Sense2Vec.most_similar`
 
 Get the most similar entries in the table. If more than one key is provided, the
-average of the vectors is used.
+average of the vectors is used. To make this faster, you can run
+`Sense2Vec.build_index`, which uses the
+[annoy](https://github.com/spotify/annoy) library to build an index of the
+vectors. This will make the initial load time slower, but will speed up the most
+similar calculations significantly.
 
 | Argument     | Type                      | Description                                             |
 | ------------ | ------------------------- | ------------------------------------------------------- |
@@ -466,6 +470,17 @@ assert s2v.get_best_sense("duck") == "duck|NOUN"
 assert s2v.get_best_sense("duck", ["VERB", "ADJ"]) == "duck|VERB"
 ```
 
+#### <kbd>method</kbd> `Sense2Vec.build_index`
+
+Build an `AnnoyIndex` from the vectors. Used for faster calculation of the
+approximate nearest neighbors in `Sense2Vec.most_similar`. See the
+[`annoy` docs](https://github.com/spotify/annoy) for more details.
+
+| Argument  | Type    | Description                                                                                       |
+| --------- | ------- | ------------------------------------------------------------------------------------------------- |
+| `metric`  | unicode | The [metric](https://github.com/spotify/annoy#full-python-api) to use. Defaults to `"euclidean"`. |
+| `n_trees` | int     | The number of trees to build. Defaults to `100`.                                                  |
+
 #### <kbd>method</kbd> `Sense2Vec.to_bytes`
 
 Serialize a `Sense2Vec` object to a bytestring.

From fb8a9930eaf05ac1cd14a9c96336320f6b14c3bd Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 19:51:46 +0100
Subject: [PATCH 4/8] Make test more robust

---
 tests/test_sense2vec.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py
index 969a62e..68ae168 100644
--- a/tests/test_sense2vec.py
+++ b/tests/test_sense2vec.py
@@ -98,9 +98,9 @@ def test_sense2vec_most_similar(build_index):
     assert result1[0][1] == 1.0
     assert result1[0][1] == pytest.approx(1.0)
     assert result1[1][0] == "b"
-    result2 = s2v.most_similar(["a", "x"], n=2)
-    assert len(result2) == 2
-    assert sorted([key for key, _ in result2]) == ["b", "d"]
+    result2 = s2v.most_similar(["a", "x"], n=3)
+    assert len(result2) == 3
+    assert sorted([key for key, _ in result2]) == ["b", "c", "d"]
     result3 = s2v.most_similar(["a", "b"], n=3)
     assert len(result3) == 3
     assert "y" not in [key for key, _ in result3]

From b036e0e0f2265d4fe376b7518cda62d69d124900 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 19:54:00 +0100
Subject: [PATCH 5/8] Update README.md [ci skip]

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 2aad471..c6a7b29 100644
--- a/README.md
+++ b/README.md
@@ -27,6 +27,8 @@ models.
 - spaCy **pipeline component** and **extension attributes**.
 - Fully **serializable** so you can easily ship your sense2vec vectors with your
   spaCy model packages.
+- Use [`annoy`](https://github.com/spotify/annoy) to build an index for super
+  fast approximate calculations of most similar vectors.
 - **Train your own vectors** using a pretrained spaCy model, raw text and
   [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via
   [fastText](https://github.com/facebookresearch/fastText)

From c3583e6903d26a73a0e756bc3c74b6b505f713f5 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 19:54:37 +0100
Subject: [PATCH 6/8] Update README.md [ci skip]

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c6a7b29..0db952f 100644
--- a/README.md
+++ b/README.md
@@ -27,8 +27,8 @@ models.
 - spaCy **pipeline component** and **extension attributes**.
 - Fully **serializable** so you can easily ship your sense2vec vectors with your
   spaCy model packages.
-- Use [`annoy`](https://github.com/spotify/annoy) to build an index for super
-  fast approximate calculations of most similar vectors.
+- Use [`annoy`](https://github.com/spotify/annoy) to build an index for **super
+  fast approximate calculations** of most similar vectors.
 - **Train your own vectors** using a pretrained spaCy model, raw text and
   [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via
   [fastText](https://github.com/facebookresearch/fastText)

From f99154dfcaae83302827583d389e0a2db2225b00 Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Tue, 19 Nov 2019 20:10:44 +0100
Subject: [PATCH 7/8] Try downgrading annoy

---
 requirements.txt | 2 +-
 setup.cfg        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 2d640ca..5a7ccbf 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,7 +3,7 @@ spacy>=2.2.2,<3.0.0
 srsly>=0.2.0
 catalogue>=0.0.4
 # Third-party dependencies
-annoy>=1.16.0,<2.0.0
+annoy==1.15.2
 numpy>=1.15.0
 importlib_metadata>=0.20; python_version < "3.8"
 # Development requirements
diff --git a/setup.cfg b/setup.cfg
index 06f2da1..4fc73cb 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -31,7 +31,7 @@ install_requires =
     srsly>=0.2.0
     catalogue>=0.0.4
     wasabi>=0.4.0,<1.1.0
-    annoy>=1.16.0,<2.0.0
+    annoy==1.15.2
     numpy>=1.15.0
     importlib_metadata>=0.20; python_version < "3.8"
 

From 836a6235a6b92013651fbab4c8ee98f0744764bf Mon Sep 17 00:00:00 2001
From: Ines Montani <ines@ines.io>
Date: Wed, 20 Nov 2019 00:31:54 +0100
Subject: [PATCH 8/8] Add serialization [ci skip]

---
 sense2vec/sense2vec.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py
index 1be993c..d9d7c93 100644
--- a/sense2vec/sense2vec.py
+++ b/sense2vec/sense2vec.py
@@ -34,7 +34,12 @@ def __init__(
         self.strings = StringStore() if strings is None else strings
         self.index = None
         self.freqs: Dict[int, int] = {}
-        self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"}
+        self.cfg = {
+            "senses": senses,
+            "annoy_metric": "euclidean",
+            "make_key": "default",
+            "split_key": "default",
+        }
         self.cfg.update(overrides)
 
     @property
@@ -273,6 +278,7 @@ def build_index(self, metric: str = "euclidean", n_trees: int = 100):
         metric (unicode): The metric to use.
         n_trees (int): The number of trees to build.
         """
+        self.cfg["annoy_metric"] = metric
         self.index = AnnoyIndex(self.vectors.shape[1], metric)
         for key, vector in self.vectors.items():
             # The key ints are too big so use the row for annoy
@@ -320,6 +326,8 @@ def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
         srsly.write_json(path / "freqs.json", list(self.freqs.items()))
         if "strings" not in exclude:
             self.strings.to_disk(path / "strings.json")
+        if "index" not in exclude and self.index is not None:
+            self.index.save(str(path / "index.ann"))
 
     def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
         """Load a Sense2Vec object from a directory.
@@ -330,6 +338,7 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
         """
         path = Path(path)
         strings_path = path / "strings.json"
+        index_path = path / "index.ann"
         freqs_path = path / "freqs.json"
         self.vectors = Vectors().from_disk(path)
         self.cfg.update(srsly.read_json(path / "cfg"))
@@ -337,4 +346,7 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()):
             self.freqs = dict(srsly.read_json(freqs_path))
         if "strings" not in exclude and strings_path.exists():
             self.strings = StringStore().from_disk(strings_path)
+        if "index" not in exclude and index_path.exists():
+            self.index = AnnoyIndex(self.vectors.shape[1], self.cfg["annoy_metric"])
+            self.index.load(str(index_path))
         return self