From 9ece176ccf11a0476624dbe816ee8cb332103dc3 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:26:11 +0200 Subject: [PATCH 001/297] Update .gitignore --- .gitignore | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/.gitignore b/.gitignore index 1dbc687..55d3e33 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,9 @@ +data/ +cythonize.dat +*.cpp +.pytest_cache +.vscode + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] @@ -8,6 +14,7 @@ __pycache__/ # Distribution / packaging .Python +.env/ env/ build/ develop-eggs/ From 4c64c11e58bbbd96ea3ae139a568295734aaa087 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:26:20 +0200 Subject: [PATCH 002/297] Drop Python 3.4 and add Python 3.6 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index de2a2d6..d4ca784 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,8 +2,8 @@ language: python python: - "2.7" - - "3.4" - "3.5" + - "3.6" install: - pip install -U numpy From 88da4160e40f3cbd2514e32b8a8788a992e74b46 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:26:49 +0200 Subject: [PATCH 003/297] Remove old scripts and for now --- bin/gensim2sense.py | 30 --------- bin/merge_text.py | 140 ------------------------------------------ bin/train_word2vec.py | 110 --------------------------------- buildbot.json | 26 -------- 4 files changed, 306 deletions(-) delete mode 100644 bin/gensim2sense.py delete mode 100644 bin/merge_text.py delete mode 100644 bin/train_word2vec.py delete mode 100644 buildbot.json diff --git a/bin/gensim2sense.py b/bin/gensim2sense.py deleted file mode 100644 index c6541c2..0000000 --- a/bin/gensim2sense.py +++ /dev/null @@ -1,30 +0,0 @@ -from sense2vec.vectors import VectorMap -from gensim.models import Word2Vec -import plac - -@plac.annotations( - gensim_model_path=("Location of gensim's .bin file"), - out_dir=("Location of output directory"), - min_count=("Min count", "option", "m", int), -) -def main(gensim_model_path, out_dir, min_count=None): - """Convert a gensim.models.Word2Vec file to VectorMap format""" - - gensim_model = Word2Vec.load(gensim_model_path) - vector_map = VectorMap(128) - - if min_count is None: - min_count = gensim_model.min_count - - for string in gensim_model.vocab: - vocab = gensim_model.vocab[string] - freq, idx = vocab.count, vocab.index - if freq < min_count: - continue - vector = gensim_model.syn0[idx] - vector_map.borrow(string, freq, vector) - - vector_map.save(out_dir) - -if __name__ == '__main__': - plac.call(main) \ No newline at end of file diff --git a/bin/merge_text.py b/bin/merge_text.py deleted file mode 100644 index 8390442..0000000 --- a/bin/merge_text.py +++ /dev/null @@ -1,140 +0,0 @@ -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from toolz import partition -from os import path -import os -import re - -import spacy.en -from preshed.counter import PreshCounter -from spacy.tokens.doc import Doc - -from joblib import Parallel, delayed -import plac -try: - import ujson as json -except ImportError: - import json - - -LABELS = { - 'ENT': 'ENT', - 'PERSON': 'ENT', - 'NORP': 'ENT', - 'FAC': 'ENT', - 'ORG': 'ENT', - 'GPE': 'ENT', - 'LOC': 'ENT', - 'LAW': 'ENT', - 'PRODUCT': 'ENT', - 'EVENT': 'ENT', - 'WORK_OF_ART': 'ENT', - 'LANGUAGE': 'ENT', - 'DATE': 'DATE', - 'TIME': 'TIME', - 'PERCENT': 'PERCENT', - 'MONEY': 'MONEY', - 'QUANTITY': 'QUANTITY', - 'ORDINAL': 'ORDINAL', - 'CARDINAL': 'CARDINAL' -} - - -def parallelize(func, 
iterator, n_jobs, extra): - extra = tuple(extra) - return Parallel(n_jobs=n_jobs)(delayed(func)(*(item + extra)) for item in iterator) - - -def iter_comments(loc): - with bz2.BZ2File(loc) as file_: - for i, line in enumerate(file_): - yield ujson.loads(line)['body'] - - -pre_format_re = re.compile(r'^[\`\*\~]') -post_format_re = re.compile(r'[\`\*\~]$') -url_re = re.compile(r'\[([^]]+)\]\(%%URL\)') -link_re = re.compile(r'\[([^]]+)\]\(https?://[^\)]+\)') -def strip_meta(text): - text = link_re.sub(r'\1', text) - text = text.replace('>', '>').replace('<', '<') - text = pre_format_re.sub('', text) - text = post_format_re.sub('', text) - return text - - -def load_and_transform(batch_id, in_loc, out_dir): - out_loc = path.join(out_dir, '%d.txt' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English(parser=False, tagger=False, matcher=False, entity=False) - with io.open(out_loc, 'w', encoding='utf8') as out_file: - with io.open(in_loc, 'rb') as in_file: - for byte_string in Doc.read_bytes(in_file): - doc = Doc(nlp.vocab).from_bytes(byte_string) - doc.is_parsed = True - out_file.write(transform_doc(doc)) - - -def parse_and_transform(batch_id, input_, out_dir): - out_loc = path.join(out_dir, '%d.txt' % batch_id) - if path.exists(out_loc): - return None - print('Batch', batch_id) - nlp = spacy.en.English() - nlp.matcher = None - with io.open(out_loc, 'w', encoding='utf8') as file_: - for text in input_: - file_.write(transform_doc(nlp(strip_meta(text)))) - - -def transform_doc(doc): - for ent in doc.ents: - ent.merge(ent.root.tag_, ent.text, LABELS[ent.label_]) - for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): - np = np[1:] - np.merge(np.root.tag_, np.text, np.root.ent_type_) - strings = [] - for sent in doc.sents: - if sent.text.strip(): - strings.append(' '.join(represent_word(w) for w in sent if not w.is_space)) - if strings: - return '\n'.join(strings) + '\n' - else: - return '' - - -def represent_word(word): - if word.like_url: - return '%%URL|X' - text = re.sub(r'\s', '_', word.text) - tag = LABELS.get(word.ent_type_, word.pos_) - if not tag: - tag = '?' 
- return text + '|' + tag - - -@plac.annotations( - in_loc=("Location of input file"), - out_dir=("Location of input file"), - n_workers=("Number of workers", "option", "n", int), - load_parses=("Load parses from binary", "flag", "b"), -) -def main(in_loc, out_dir, n_workers=4, load_parses=False): - if not path.exists(out_dir): - path.join(out_dir) - if load_parses: - jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)] - do_work = load_and_transform - else: - jobs = partition(200000, iter_comments(in_loc)) - do_work = parse_and_transform - parallelize(do_work, enumerate(jobs), n_workers, [out_dir]) - - -if __name__ == '__main__': - plac.call(main) diff --git a/bin/train_word2vec.py b/bin/train_word2vec.py deleted file mode 100644 index 78245dd..0000000 --- a/bin/train_word2vec.py +++ /dev/null @@ -1,110 +0,0 @@ -from __future__ import print_function, unicode_literals, division -import io -import bz2 -import logging -from os import path -import os -import random -from collections import defaultdict - -import plac -try: - import ujson as json -except ImportError: - import json -from gensim.models import Word2Vec -from preshed.counter import PreshCounter -from spacy.strings import hash_string - -logger = logging.getLogger(__name__) - - -class Corpus(object): - def __init__(self, directory, min_freq=10): - self.directory = directory - self.counts = PreshCounter() - self.strings = {} - self.min_freq = min_freq - - def count_doc(self, words): - # Get counts for this document - doc_counts = PreshCounter() - doc_strings = {} - for word in words: - key = hash_string(word) - doc_counts.inc(key, 1) - doc_strings[key] = word - - n = 0 - for key, count in doc_counts: - self.counts.inc(key, count) - # TODO: Why doesn't inc return this? =/ - corpus_count = self.counts[key] - # Remember the string when we exceed min count - if corpus_count >= self.min_freq and (corpus_count - count) < self.min_freq: - self.strings[key] = doc_strings[key] - n += count - return n - - def __iter__(self): - for text_loc in iter_dir(self.directory): - with io.open(text_loc, 'r', encoding='utf8') as file_: - sent_strs = list(file_) - random.shuffle(sent_strs) - for sent_str in sent_strs: - yield sent_str.split() - - -def iter_dir(loc): - for fn in os.listdir(loc): - if path.isdir(path.join(loc, fn)): - for sub in os.listdir(path.join(loc, fn)): - yield path.join(loc, fn, sub) - else: - yield path.join(loc, fn) - -@plac.annotations( - in_dir=("Location of input directory"), - out_loc=("Location of output file"), - n_workers=("Number of workers", "option", "n", int), - size=("Dimension of the word vectors", "option", "d", int), - window=("Context window size", "option", "w", int), - min_count=("Min count", "option", "m", int), - negative=("Number of negative samples", "option", "g", int), - nr_iter=("Number of iterations", "option", "i", int), -) -def main(in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2): - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) - model = Word2Vec( - size=size, - window=window, - min_count=min_count, - workers=n_workers, - sample=1e-5, - negative=negative - ) - corpus = Corpus(in_dir) - total_words = 0 - total_sents = 0 - for text_no, text_loc in enumerate(iter_dir(corpus.directory)): - with io.open(text_loc, 'r', encoding='utf8') as file_: - text = file_.read() - total_sents += text.count('\n') - total_words += corpus.count_doc(text.split()) - logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word 
types", - text_no, total_words, len(corpus.strings)) - model.corpus_count = total_sents - model.raw_vocab = defaultdict(int) - for key, string in corpus.strings.items(): - model.raw_vocab[string] = corpus.counts[key] - model.scale_vocab() - model.finalize_vocab() - model.iter = nr_iter - model.train(corpus) - - model.save(out_loc) - - -if __name__ == '__main__': - plac.call(main) - diff --git a/buildbot.json b/buildbot.json deleted file mode 100644 index d67fa4a..0000000 --- a/buildbot.json +++ /dev/null @@ -1,26 +0,0 @@ -{ - "build": { - "sdist": [ - "pip install -r requirements.txt", - "pip install \"numpy<1.8\"", - "python setup.py sdist" - ], - "install": [ - "pip install -v source.tar.gz" - ], - "wheel": [ - "python untar.py source.tar.gz .", - "pip install \"numpy<1.8\"", - "python setup.py bdist_wheel", - "python cpdist.py dist" - ] - }, - "test": { - "after": ["install", "wheel"], - "run": [ - "python -m sense2vec.download --force" - ], - "package": "sense2vec", - "args": "--tb=native -x --models" - } -} From 94880d5d259ff35f2e74ab33eb9b169a6e5ea482 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:28:38 +0200 Subject: [PATCH 004/297] Fix formatting, meta and whitespace --- LICENSE | 3 +-- sense2vec/__init__.py | 3 +++ sense2vec/tests/conftest.py | 3 +++ sense2vec/tests/test_sense2vec.py | 3 +++ sense2vec/tests/test_vectors.py | 5 ++++- sense2vec/vectors.pxd | 5 ++--- sense2vec/vectors.pyx | 14 +++++++------- setup.py | 8 +++++--- 8 files changed, 28 insertions(+), 16 deletions(-) diff --git a/LICENSE b/LICENSE index ddd22e2..7d042a7 100644 --- a/LICENSE +++ b/LICENSE @@ -1,7 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016 spaCy GmbH - 2016 ExplosionAI UG (haftungsbeschränkt) +Copyright (C) 2016 spaCy GmbH, 2016 ExplosionAI UG (haftungsbeschränkt) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index d1b747d..a389f49 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,4 +1,7 @@ from . 
import util +# coding: utf8 +from __future__ import unicode_literals + from .vectors import VectorMap diff --git a/sense2vec/tests/conftest.py b/sense2vec/tests/conftest.py index 0670340..acbb8fa 100644 --- a/sense2vec/tests/conftest.py +++ b/sense2vec/tests/conftest.py @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + import pytest diff --git a/sense2vec/tests/test_sense2vec.py b/sense2vec/tests/test_sense2vec.py index 0268e1a..537fb66 100644 --- a/sense2vec/tests/test_sense2vec.py +++ b/sense2vec/tests/test_sense2vec.py @@ -1,3 +1,6 @@ +# coding: utf8 +from __future__ import unicode_literals + import pytest import sense2vec diff --git a/sense2vec/tests/test_vectors.py b/sense2vec/tests/test_vectors.py index 9d5d755..c7fb97a 100644 --- a/sense2vec/tests/test_vectors.py +++ b/sense2vec/tests/test_vectors.py @@ -1,7 +1,10 @@ +# coding: utf8 +from __future__ import unicode_literals + import pytest import numpy -from sense2vec.vectors import VectorStore +from ..vectors import VectorStore def test_init(): diff --git a/sense2vec/vectors.pxd b/sense2vec/vectors.pxd index ea7dfab..c3104a7 100644 --- a/sense2vec/vectors.pxd +++ b/sense2vec/vectors.pxd @@ -2,7 +2,6 @@ from libcpp.vector cimport vector from preshed.maps cimport PreshMap from spacy.strings cimport StringStore, hash_string from murmurhash.mrmr cimport hash64 - from cymem.cymem cimport Pool @@ -11,7 +10,7 @@ cdef class VectorMap: cdef readonly VectorStore data cdef readonly StringStore strings cdef readonly PreshMap freqs - + cdef class VectorStore: cdef readonly Pool mem @@ -20,7 +19,7 @@ cdef class VectorStore: cdef vector[float] norms cdef vector[float] _similarities cdef readonly int nr_dim - + cdef float get_l2_norm(const float* vec, int n) nogil diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx index 50ce668..8bb5b05 100644 --- a/sense2vec/vectors.pyx +++ b/sense2vec/vectors.pyx @@ -57,7 +57,7 @@ cdef class VectorMap: Returns: length int >= 0 ''' return self.data.vectors.size() - + def __contains__(self, unicode string): '''Check whether the VectorMap has a given key. @@ -140,7 +140,7 @@ cdef class VectorMap: freq = self.freqs[hashed] yield string, (freq, self.data[i]) - + def most_similar(self, float[:] vector, int n=10): '''Find the keys of the N most similar entries, given a vector. @@ -218,7 +218,7 @@ cdef class VectorMap: cdef class VectorStore: '''Maintain an array of float* pointers for word vectors, which the - table may or may not own. Keys and frequencies sold separately --- + table may or may not own. Keys and frequencies sold separately --- we're just a dumb vector of data, that knows how to run linear-scan similarity queries.''' def __init__(self, int nr_dim): @@ -241,7 +241,7 @@ cdef class VectorStore: &vec[0], sizeof(ptr[0]) * self.nr_dim) self.norms.push_back(get_l2_norm(&ptr[0], self.nr_dim)) self.vectors.push_back(ptr) - + def borrow(self, float[:] vec): self.norms.push_back(get_l2_norm(&vec[0], self.nr_dim)) # Danger! User must ensure this is memory contiguous! @@ -249,7 +249,7 @@ cdef class VectorStore: def similarity(self, float[:] v1, float[:] v2): '''Measure the similarity between two vectors, using cosine. 
- + Arguments: v1 float[:] v2 float[:] @@ -280,7 +280,7 @@ cdef class VectorStore: self._similarities.resize(self.vectors.size()) linear_similarity(&indices[0], &scores[0], &self._similarities[0], n, &query[0], self.nr_dim, - &self.vectors[0], &self.norms[0], self.vectors.size(), + &self.vectors[0], &self.norms[0], self.vectors.size(), cosine_similarity) cached_result = <_CachedResult*>self.mem.alloc(sizeof(_CachedResult), 1) cached_result.n = n @@ -341,7 +341,7 @@ cdef void linear_similarity(int* indices, float* scores, float* tmp, queue.pop() # Fill the outputs i = 0 - while i < nr_out and not queue.empty(): + while i < nr_out and not queue.empty(): entry = queue.top() scores[nr_out-(i+1)] = -entry.first indices[nr_out-(i+1)] = entry.second diff --git a/setup.py b/setup.py index 91e4f34..bd15973 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,6 @@ #!/usr/bin/env python from __future__ import print_function + import os import shutil import subprocess @@ -40,7 +41,7 @@ '-fopenmp']) #else: # link_options['other'].extend([ -# '-fopenmp']) +# '-fopenmp']) # class build_ext_subclass(build_ext): @@ -150,11 +151,12 @@ def setup_package(): 'Operating System :: MacOS :: MacOS X', 'Operating System :: Microsoft :: Windows', 'Programming Language :: Cython', - 'Programming Language :: Python :: 2.6', + 'Programming Language :: Python :: 2', 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3.3', + 'Programming Language :: Python :: 3', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', + 'Programming Language :: Python :: 3.6', 'Topic :: Scientific/Engineering'], cmdclass = { 'build_ext': build_ext_subclass}, From 1b4271435b2bb1127421ee46fec7c01cf117808f Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:29:30 +0200 Subject: [PATCH 005/297] Update requirements and read in txt in setup.py --- requirements-all.txt | 12 ------------ requirements-dev.txt | 2 ++ requirements.txt | 10 +++------- setup.py | 12 +++++------- 4 files changed, 10 insertions(+), 26 deletions(-) delete mode 100644 requirements-all.txt create mode 100644 requirements-dev.txt diff --git a/requirements-all.txt b/requirements-all.txt deleted file mode 100644 index e497b38..0000000 --- a/requirements-all.txt +++ /dev/null @@ -1,12 +0,0 @@ -cython<0.24 -numpy>=1.7 -ujson>=1.34 -spacy>=0.100,<0.101 -preshed>=0.46,<0.47 -murmurhash>=0.26,<0.27 -cymem>=1.30,<1.32 -sputnik>=0.9.0,<0.10.0 -pytest -joblib -toolz -gensim diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..d44b81f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +cython>=0.24,<0.28.0 +pytest>=3.0.6,<4.0.0 diff --git a/requirements.txt b/requirements.txt index 5daf2c0..3c9b44d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,5 @@ -cython<0.24 numpy>=1.7 -ujson>=1.34 -spacy>=0.100.0,<0.102.0 -preshed>=0.46,<0.47 -murmurhash>=0.26,<0.27 +ujson>=1.35 +preshed>=1.0.0,<2.0.0 +murmurhash>=0.28,<0.29 cymem>=1.30,<1.32 -sputnik>=0.9.0,<0.10.0 -pytest diff --git a/setup.py b/setup.py index bd15973..6f478fe 100644 --- a/setup.py +++ b/setup.py @@ -108,6 +108,10 @@ def setup_package(): get_python_inc(plat_specific=True), os.path.join(root, 'include')] + # Read in requirements and split into packages and URLs + with open(os.path.join(root, 'requirements.txt')) as f: + requirements = [line.strip() for line in f] + if (ccompiler.new_compiler().compiler_type == 'msvc' and msvccompiler.get_build_version() == 9): include_dirs.append(os.path.join(root, 'include', 
'msvc9')) @@ -134,13 +138,7 @@ def setup_package(): url=about['uri'], license=about['license'], ext_modules=ext_modules, - install_requires=[ - 'numpy>=1.7', - 'spacy>=0.100,<0.102', - 'preshed>=0.46,<0.47', - 'murmurhash>=0.26,<0.27', - 'cymem>=1.30,<1.32', - 'sputnik>=0.9.0,<0.10.0'], + install_requires=requirements, classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Console', From 5b39eab89007ec5e02f76144f635a8dd1ee32f8b Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:30:08 +0200 Subject: [PATCH 006/297] Update about.py and meta --- sense2vec/__init__.py | 1 + sense2vec/about.py | 22 ++++++++-------------- setup.py | 14 +++++++------- 3 files changed, 16 insertions(+), 21 deletions(-) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index a389f49..2a999ab 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .vectors import VectorMap +from .about import __version__ def load(name=None, via=None): diff --git a/sense2vec/about.py b/sense2vec/about.py index f214aec..47a994d 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,14 +1,8 @@ -# inspired from: - -# https://python-packaging-user-guide.readthedocs.org/en/latest/single_source_version/ -# https://github.com/pypa/warehouse/blob/master/warehouse/__about__.py - -title = 'sense2vec' -version = '0.6.0' -summary = 'word2vec with NLP-specific tokens' -uri = 'https://github.com/explosion/sense2vec' -author = 'Matthew Honnibal' -email = 'matt@explosion.ai' -license = 'MIT' -release = True -default_model = 'reddit_vectors>=1.1.0,<1.2.0' +__title__ = 'sense2vec' +__version__ = '1.0.0a0' +__summary__ = 'Use NLP to go beyond vanilla word2vec' +__uri__ = 'https://github.com/explosion/sense2vec' +__author__ = 'Explosion AI' +__email__ = 'contact@explosion.ai' +__license__ = 'MIT' +__release__ = True diff --git a/setup.py b/setup.py index 6f478fe..5ad02b7 100644 --- a/setup.py +++ b/setup.py @@ -127,16 +127,16 @@ def setup_package(): generate_cython(root, src_path) setup( - name=about['title'], + name=about['__title__'], zip_safe=False, packages=PACKAGES, package_data={'': ['*.pyx', '*.pxd', '*.h']}, - description=about['summary'], - author=about['author'], - author_email=about['email'], - version=about['version'], - url=about['uri'], - license=about['license'], + description=about['__summary__'], + author=about['__author__'], + author_email=about['__email__'], + version=about['__version__'], + url=about['__uri__'], + license=about['__license__'], ext_modules=ext_modules, install_requires=requirements, classifiers=[ From 509b1942c2b27847fc8eadd8f33520987753e004 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:30:31 +0200 Subject: [PATCH 007/297] Fix whitespace --- sense2vec/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx index 8bb5b05..3296f69 100644 --- a/sense2vec/vectors.pyx +++ b/sense2vec/vectors.pyx @@ -223,7 +223,7 @@ cdef class VectorStore: similarity queries.''' def __init__(self, int nr_dim): self.mem = Pool() - self.nr_dim = nr_dim + self.nr_dim = nr_dim zeros = self.mem.alloc(self.nr_dim, sizeof(float)) self.vectors.push_back(zeros) self.norms.push_back(0) From 3778b226d9edd33ad13da21c3409eb16d1a96772 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:32:06 +0200 Subject: [PATCH 008/297] Update vectors and StringStore Use updated port of spaCy v1.x's StringStore to map to consecutive IDs (instead of hash 
values). Replace load/dump methods with to_disk and from_disk for consistency and port over CFile --- sense2vec/_strings.pxd | 31 +++++ sense2vec/_strings.pyx | 293 +++++++++++++++++++++++++++++++++++++++++ sense2vec/cfile.pxd | 12 ++ sense2vec/cfile.pyx | 42 ++++++ sense2vec/vectors.pxd | 2 +- sense2vec/vectors.pyx | 13 +- 6 files changed, 385 insertions(+), 8 deletions(-) create mode 100644 sense2vec/_strings.pxd create mode 100644 sense2vec/_strings.pyx create mode 100644 sense2vec/cfile.pxd create mode 100644 sense2vec/cfile.pyx diff --git a/sense2vec/_strings.pxd b/sense2vec/_strings.pxd new file mode 100644 index 0000000..fa88a6d --- /dev/null +++ b/sense2vec/_strings.pxd @@ -0,0 +1,31 @@ +from libc.stdint cimport int64_t, uint64_t, int32_t + +from cymem.cymem cimport Pool +from preshed.maps cimport PreshMap +from murmurhash.mrmr cimport hash64 + + +ctypedef uint64_t hash_t +ctypedef int32_t attr_t + + +cpdef hash_t hash_string(unicode string) except 0 + + +ctypedef union Utf8Str: + unsigned char[8] s + unsigned char* p + + +cdef class StringStore: + cdef Pool mem + cdef Utf8Str* c + cdef int64_t size + cdef bint is_frozen + + cdef public PreshMap _map + cdef public PreshMap _oov + cdef int64_t _resize_at + + cdef const Utf8Str* intern_unicode(self, unicode py_string) + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) diff --git a/sense2vec/_strings.pyx b/sense2vec/_strings.pyx new file mode 100644 index 0000000..b3c483d --- /dev/null +++ b/sense2vec/_strings.pyx @@ -0,0 +1,293 @@ +# cython: infer_types=True +# coding: utf8 +from __future__ import unicode_literals, absolute_import + +cimport cython +from libc.string cimport memcpy +from libc.stdint cimport uint64_t, uint32_t +from murmurhash.mrmr cimport hash64, hash32 +from preshed.maps cimport map_iter, key_t +import ujson +import os + + +cpdef hash_t hash_string(unicode string) except 0: + chars = string.encode('utf8') + return hash_utf8(chars, len(chars)) + + +cdef hash_t hash_utf8(char* utf8_string, int length) nogil: + return hash64(utf8_string, length, 1) + + +cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: + return hash32(utf8_string, length, 1) + + +cdef unicode _decode(const Utf8Str* string): + cdef int i, length + if string.s[0] < sizeof(string.s) and string.s[0] != 0: + return string.s[1:string.s[0]+1].decode('utf8') + elif string.p[0] < 255: + return string.p[1:string.p[0]+1].decode('utf8') + else: + i = 0 + length = 0 + while string.p[i] == 255: + i += 1 + length += 255 + length += string.p[i] + i += 1 + return string.p[i:length + i].decode('utf8') + + +cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: + cdef int n_length_bytes + cdef int i + cdef Utf8Str string + cdef uint32_t ulength = length + if length < sizeof(string.s): + string.s[0] = length + memcpy(&string.s[1], chars, length) + return string + elif length < 255: + string.p = mem.alloc(length + 1, sizeof(unsigned char)) + string.p[0] = length + memcpy(&string.p[1], chars, length) + assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] + return string + else: + i = 0 + n_length_bytes = (length // 255) + 1 + string.p = mem.alloc(length + n_length_bytes, sizeof(unsigned char)) + for i in range(n_length_bytes-1): + string.p[i] = 255 + string.p[n_length_bytes-1] = length % 255 + memcpy(&string.p[n_length_bytes], chars, length) + assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] + return string + + +cdef class StringStore: + """ + Map strings to and from 
integer IDs. + """ + def __init__(self, strings=None, freeze=False): + """ + Create the StringStore. + + Arguments: + strings: A sequence of unicode strings to add to the store. + """ + self.mem = Pool() + self._map = PreshMap() + self._oov = PreshMap() + self._resize_at = 10000 + self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) + self.size = 1 + self.is_frozen = freeze + if strings is not None: + for string in strings: + _ = self[string] + + property size: + def __get__(self): + return self.size -1 + + def __reduce__(self): + # TODO: OOV words, for the is_frozen stuff? + if self.is_frozen: + raise NotImplementedError( + "Currently missing support for pickling StringStore when " + "is_frozen=True") + return (StringStore, (list(self),)) + + def __len__(self): + """ + The number of strings in the store. + + Returns: + int The number of strings in the store. + """ + return self.size-1 + + def __getitem__(self, object string_or_id): + """ + Retrieve a string from a given integer ID, or vice versa. + + Arguments: + string_or_id (bytes or unicode or int): + The value to encode. + Returns: + unicode or int: The value to retrieved. + """ + if isinstance(string_or_id, basestring) and len(string_or_id) == 0: + return 0 + elif string_or_id == 0: + return u'' + + cdef bytes byte_string + cdef const Utf8Str* utf8str + cdef uint64_t int_id + cdef uint32_t oov_id + if isinstance(string_or_id, (int, long)): + int_id = string_or_id + oov_id = string_or_id + if int_id < self.size: + return _decode(&self.c[int_id]) + else: + utf8str = self._oov.get(oov_id) + if utf8str is not NULL: + return _decode(utf8str) + else: + raise IndexError(string_or_id) + else: + if isinstance(string_or_id, bytes): + byte_string = string_or_id + elif isinstance(string_or_id, unicode): + byte_string = (string_or_id).encode('utf8') + else: + raise TypeError(type(string_or_id)) + utf8str = self._intern_utf8(byte_string, len(byte_string)) + if utf8str is NULL: + # TODO: We need to use 32 bit here, for compatibility with the + # vocabulary values. This makes birthday paradox probabilities + # pretty bad. + # We could also get unlucky here, and hash into a value that + # collides with the 'real' strings. + return hash32_utf8(byte_string, len(byte_string)) + else: + return utf8str - self.c + + def __contains__(self, unicode string not None): + """ + Check whether a string is in the store. + + Arguments: + string (unicode): The string to check. + Returns bool: + Whether the store contains the string. + """ + if len(string) == 0: + return True + cdef hash_t key = hash_string(string) + return self._map.get(key) is not NULL + + def __iter__(self): + """ + Iterate over the strings in the store, in order. + + Yields: unicode A string in the store. + """ + cdef int i + for i in range(self.size): + yield _decode(&self.c[i]) if i > 0 else u'' + # TODO: Iterate OOV here? + + def __reduce__(self): + strings = [""] + for i in range(1, self.size): + string = &self.c[i] + py_string = _decode(string) + strings.append(py_string) + return (StringStore, (strings,), None, None, None) + + def set_frozen(self, bint is_frozen): + # TODO + self.is_frozen = is_frozen + + def flush_oov(self): + self._oov = PreshMap() + + cdef const Utf8Str* intern_unicode(self, unicode py_string): + # 0 means missing, but we don't bother offsetting the index. 
+ cdef bytes byte_string = py_string.encode('utf8') + return self._intern_utf8(byte_string, len(byte_string)) + + @cython.final + cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length): + # TODO: This function's API/behaviour is an unholy mess... + # 0 means missing, but we don't bother offsetting the index. + cdef hash_t key = hash_utf8(utf8_string, length) + cdef Utf8Str* value = self._map.get(key) + if value is not NULL: + return value + value = self._oov.get(key) + if value is not NULL: + return value + if self.is_frozen: + # OOV store uses 32 bit hashes. Pretty ugly :( + key32 = hash32_utf8(utf8_string, length) + # Important: Make the OOV store own the memory. That way it's trivial + # to flush them all. + value = self._oov.mem.alloc(1, sizeof(Utf8Str)) + value[0] = _allocate(self._oov.mem, utf8_string, length) + self._oov.set(key32, value) + return NULL + + if self.size == self._resize_at: + self._realloc() + self.c[self.size] = _allocate(self.mem, utf8_string, length) + self._map.set(key, &self.c[self.size]) + self.size += 1 + return &self.c[self.size-1] + + def to_disk(self, path): + """ + Save the strings to a JSON file. + + Arguments: + path (unicode / Path): The file path to save the strings. + Returns: + None + """ + with open(path, 'w') as file_: + string_data = ujson.dumps(list(self)) + if not isinstance(string_data, unicode): + string_data = string_data.decode('utf8') + file_.write(string_data) + + def from_disk(self, path): + """ + Load the strings from a JSON file. + + Arguments: + path (unicode / Path): The file from which to load the strings. + Returns: + None + """ + with open(path) as file_: + strings = ujson.load(file_) + if strings == ['']: + return None + cdef unicode string + for string in strings: + # explicit None/len check instead of simple truth testing + # (bug in Cython <= 0.23.4) + if string is not None and len(string): + self.intern_unicode(string) + + def _realloc(self): + # We want to map straight to pointers, but they'll be invalidated if + # we resize our array. So, first we remap to indices, then we resize, + # then we can acquire the new pointers. 
+ cdef Pool tmp_mem = Pool() + keys = tmp_mem.alloc(self.size, sizeof(key_t)) + cdef key_t key + cdef void* value + cdef const Utf8Str ptr + cdef int i = 0 + cdef size_t offset + while map_iter(self._map.c_map, &i, &key, &value): + # Find array index with pointer arithmetic + offset = ((value) - self.c) + keys[offset] = key + + self._resize_at *= 2 + cdef size_t new_size = self._resize_at * sizeof(Utf8Str) + self.c = self.mem.realloc(self.c, new_size) + + self._map = PreshMap(self.size) + for i in range(self.size): + if keys[i]: + self._map.set(keys[i], &self.c[i]) diff --git a/sense2vec/cfile.pxd b/sense2vec/cfile.pxd new file mode 100644 index 0000000..2ca9e29 --- /dev/null +++ b/sense2vec/cfile.pxd @@ -0,0 +1,12 @@ +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE +from cymem.cymem cimport Pool + +cdef class CFile: + cdef FILE* fp + cdef bint is_open + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/sense2vec/cfile.pyx b/sense2vec/cfile.pyx new file mode 100644 index 0000000..7817594 --- /dev/null +++ b/sense2vec/cfile.pyx @@ -0,0 +1,42 @@ +from libc.stdio cimport fopen, fclose, fread, fwrite, FILE + + +cdef class CFile: + def __init__(self, loc, mode): + if isinstance(mode, unicode): + mode_str = mode.encode('ascii') + else: + mode_str = mode + cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc + self.fp = fopen(bytes_loc, mode_str) + if self.fp == NULL: + raise IOError("Could not open binary file %s" % bytes_loc) + self.is_open = True + + def __dealloc__(self): + if self.is_open: + fclose(self.fp) + + def close(self): + fclose(self.fp) + self.is_open = False + + cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: + st = fread(dest, elem_size, number, self.fp) + if st != number: + raise IOError + + cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: + st = fwrite(src, elem_size, number, self.fp) + if st != number: + raise IOError + + cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: + cdef void* dest = mem.alloc(number, elem_size) + self.read_into(dest, number, elem_size) + return dest + + def write_unicode(self, unicode value): + cdef bytes py_bytes = value.encode('utf8') + cdef char* chars = py_bytes + self.write(sizeof(char), len(py_bytes), chars) diff --git a/sense2vec/vectors.pxd b/sense2vec/vectors.pxd index c3104a7..9c98aa4 100644 --- a/sense2vec/vectors.pxd +++ b/sense2vec/vectors.pxd @@ -1,8 +1,8 @@ from libcpp.vector cimport vector from preshed.maps cimport PreshMap -from spacy.strings cimport StringStore, hash_string from murmurhash.mrmr cimport hash64 from cymem.cymem cimport Pool +from ._strings cimport StringStore, hash_string cdef class VectorMap: diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx index 3296f69..6543b31 100644 --- a/sense2vec/vectors.pyx +++ b/sense2vec/vectors.pyx @@ -12,9 +12,7 @@ from libc.math cimport sqrt from libcpp.pair cimport pair from libcpp.queue cimport priority_queue from libcpp.vector cimport vector -from spacy.cfile cimport CFile from preshed.maps cimport PreshMap -from spacy.strings cimport StringStore, hash_string from murmurhash.mrmr cimport hash64 from cymem.cymem cimport Pool @@ -26,6 +24,9 @@ try: except ImportError: import json +from .cfile cimport CFile +from ._strings cimport StringStore, 
hash_string + ctypedef pair[float, int] Entry ctypedef priority_queue[Entry] Queue @@ -184,8 +185,7 @@ cdef class VectorMap: * data_dir/freqs.json --- The frequencies. * data_dir/vectors.bin --- The vectors. ''' - with open(path.join(data_dir, 'strings.json'), 'w') as file_: - self.strings.dump(file_) + self.strings.to_disk(path.join(data_dir, 'strings.json')) self.data.save(path.join(data_dir, 'vectors.bin')) freqs = [] cdef uint64_t hashed @@ -206,8 +206,7 @@ cdef class VectorMap: * data_dir/vectors.bin --- The vectors. ''' self.data.load(path.join(data_dir, 'vectors.bin')) - with open(path.join(data_dir, 'strings.json')) as file_: - self.strings.load(file_) + self.strings.from_disk(path.join(data_dir, 'strings.json')) with open(path.join(data_dir, 'freqs.json')) as file_: freqs = json.load(file_) cdef uint64_t hashed @@ -232,7 +231,7 @@ cdef class VectorStore: def __getitem__(self, int i): cdef float* ptr = self.vectors.at(i) cv = ptr - return numpy.asarray(cv) + return numpy.asarray(cv, dtype='float32') def add(self, float[:] vec): assert len(vec) == self.nr_dim From fb915238fada8992e56b359966e6a06fd8132c36 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:32:26 +0200 Subject: [PATCH 009/297] Update modules in setup.py --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5ad02b7..fb080ba 100644 --- a/setup.py +++ b/setup.py @@ -22,7 +22,9 @@ ] MOD_NAMES = [ - 'sense2vec.vectors' + 'sense2vec.vectors', + 'sense2vec.cfile', + 'sense2vec._strings' ] From d68c57c0ff9b09623fcc03c31d357232fd411ac3 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:32:50 +0200 Subject: [PATCH 010/297] Remove old Sputnik and downloading/loading mess --- sense2vec/__init__.py | 6 ++---- sense2vec/download.py | 38 -------------------------------------- sense2vec/util.py | 26 -------------------------- 3 files changed, 2 insertions(+), 68 deletions(-) delete mode 100644 sense2vec/download.py delete mode 100644 sense2vec/util.py diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 2a999ab..61f63d2 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,4 +1,3 @@ -from . import util # coding: utf8 from __future__ import unicode_literals @@ -6,8 +5,7 @@ from .about import __version__ -def load(name=None, via=None): - package = util.get_package_by_name(name, via=via) +def load(vectors_path): vector_map = VectorMap(128) - vector_map.load(package.path) + vector_map.load(vectors_path) return vector_map diff --git a/sense2vec/download.py b/sense2vec/download.py deleted file mode 100644 index 11e990b..0000000 --- a/sense2vec/download.py +++ /dev/null @@ -1,38 +0,0 @@ -from __future__ import print_function -import sys - -import plac -import sputnik -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) - -from sense2vec import about - - -@plac.annotations( - force=("Force overwrite", "flag", "f", bool), -) -def main(force=False): - if force: - sputnik.purge(about.__title__, about.__version__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - print("Model already installed. Please run '%s --force to reinstall." 
% sys.argv[0], file=sys.stderr) - sys.exit(1) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - pass - - package = sputnik.install(about.__title__, about.__version__, about.__default_model__) - - try: - sputnik.package(about.__title__, about.__version__, about.__default_model__) - except (PackageNotFoundException, CompatiblePackageNotFoundException): - print("Model failed to install. Please run '%s --force." % sys.argv[0], file=sys.stderr) - sys.exit(1) - - print("Model successfully installed.", file=sys.stderr) - - -if __name__ == '__main__': - plac.call(main) diff --git a/sense2vec/util.py b/sense2vec/util.py deleted file mode 100644 index de356e9..0000000 --- a/sense2vec/util.py +++ /dev/null @@ -1,26 +0,0 @@ -import sputnik -from sputnik.dir_package import DirPackage -from sputnik.package_list import (PackageNotFoundException, - CompatiblePackageNotFoundException) - -from . import about - - -def get_package(data_dir): - if not isinstance(data_dir, six.string_types): - raise RuntimeError('data_dir must be a string') - return DirPackage(data_dir) - - -def get_package_by_name(name=None, via=None): - try: - return sputnik.package(about.title, about.version, - name or about.default_model, data_path=via) - except PackageNotFoundException as e: - raise RuntimeError("Model not installed. Please run 'python -m " - "sense2vec.download' to install latest compatible " - "model.") - except CompatiblePackageNotFoundException as e: - raise RuntimeError("Installed model is not compatible with sense2vec " - "version. Please run 'python -m sense2vec.download " - "--force' to install latest compatible model.") From 00acfe234e963afad230d442bf318f305c201730 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:32:58 +0200 Subject: [PATCH 011/297] Update model test --- sense2vec/tests/test_sense2vec.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sense2vec/tests/test_sense2vec.py b/sense2vec/tests/test_sense2vec.py index 537fb66..3529ac2 100644 --- a/sense2vec/tests/test_sense2vec.py +++ b/sense2vec/tests/test_sense2vec.py @@ -2,13 +2,18 @@ from __future__ import unicode_literals import pytest +from os import path -import sense2vec +from .. import load + + +data_path = path.join(path.dirname(__file__), '..', '..', 'data') @pytest.mark.models -def test_sample(): - s2v = sense2vec.load('reddit_vectors') +@pytest.mark.parametrize('model', ['reddit_vectors-1.1.0']) +def test_sample(model): + s2v = load(path.join(data_path, model)) freq, query_vector = s2v[u"beekeepers|NOUN"] assert freq is not None assert s2v.most_similar(query_vector, 3)[0] == \ From b7cb4bdfe41fb00d49234222b3d18c598c61af10 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:35:29 +0200 Subject: [PATCH 012/297] Add pipeline component for spaCy v2.x --- sense2vec/__init__.py | 78 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 61f63d2..6fca047 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -9,3 +9,81 @@ def load(vectors_path): vector_map = VectorMap(128) vector_map.load(vectors_path) return vector_map + + +class Sense2VecComponent(object): + """ + spaCy v2.0 pipeline component. 
+ + USAGE: + >>> import spacy + >>> from sense2vec import Sense2VecComponent + >>> nlp = spacy.load('en') + >>> s2v = Sense2VecComponent('/path/to/model') + >>> nlp.add_pipe(s2v) + >>> doc = nlp(u"A text about natural language processing.") + >>> assert doc[3].text == 'natural language processing' + >>> assert doc[3]._.in_s2v + >>> print(doc[3]._.s2v_most_similar(20)) + """ + name = 'sense2vec' + + def __init__(self, vectors_path): + self.s2v = load(vectors_path) + self.first_run = True + + def __call__(self, doc): + if self.first_run: + self.init_component(doc) + self.first_run = False + if not doc.is_tagged: + raise ValueError("Can't run sense2vec: document not tagged.") + for ent in doc.ents: + ent.merge(tag=ent.root.tag_, lemma=ent.root.lemma_, + ent_type=ent.label_) + for np in doc.noun_chunks: + while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): + np = np[1:] + np.merge(tag=np.root.tag_, lemma=np.root.lemma_, + ent_type=np.root.ent_type_) + return doc + + def init_component(self, doc): + # initialise the attributes here only if the component is added to the + # pipeline and used – otherwise, tokens will still get the attributes + # even if the component is only created and not added + Token = doc[0].__class__ + Span = doc[:1].__class__ + Token.set_extension('in_s2v', getter=lambda t: self.in_s2v(t)) + Token.set_extension('s2v_freq', getter=lambda t: self.s2v_freq(t)) + Token.set_extension('s2v_vec', getter=lambda t: self.s2v_vec(t)) + Token.set_extension('s2v_most_similar', method=lambda t, n: self.s2v_most_sim(t, n)) + Span.set_extension('in_s2v', getter=lambda s: self.in_s2v(s, 'ent')) + Span.set_extension('s2v_freq', getter=lambda s: self.s2v_freq(s, 'ent')) + Span.set_extension('s2v_vec', getter=lambda s: self.s2v_vec(s, 'ent')) + Span.set_extension('s2v_most_similar', method=lambda s, n: self.s2v_most_sim(s, n, 'ent')) + + def in_s2v(self, obj, attr='pos'): + return self._get_query(obj, attr) in self.s2v + + def s2v_freq(self, obj, attr='pos'): + freq, _ = self.s2v[self._get_query(obj, attr)] + return freq + + def s2v_vec(self, obj, attr='pos'): + _, vector = self.s2v[self._get_query(obj, attr)] + return vector + + def s2v_most_sim(self, obj, n_similar=10, attr='pos'): + _, vector = self.s2v[self._get_query(obj, attr)] + words, scores = self.s2v.most_similar(vector, n_similar) + words = [word.replace('_', ' ') for word in words] + words = [tuple(word.rsplit('|', 1)) for word in words] + return list(zip(words, scores)) + + def _get_query(self, obj, attr='pos'): + # no pos_ and label_ shouldn't happen – unless it's an unmerged + # non-entity Span (in which case we just use the root's pos) + pos = obj.pos_ if hasattr(obj, 'pos_') else obj.root.pos_ + sense = obj.label_ if (attr == 'ent' and obj.label_) else pos + return obj.text.replace(' ', '_') + '|' + sense From 91369d44096197cd9098b14b182b51866f36a146 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:35:38 +0200 Subject: [PATCH 013/297] Update README --- README.rst | 405 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 362 insertions(+), 43 deletions(-) diff --git a/README.rst b/README.rst index 7b94950..79f7859 100644 --- a/README.rst +++ b/README.rst @@ -1,76 +1,395 @@ -sense2vec: Use spaCy to go beyond vanilla word2vec -************************************************** +sense2vec: Use NLP to go beyond vanilla word2vec +************************************************ -Read about sense2vec in our `blog post `_. 
You can try an online demo of the technology `here `_ and use the open-source `REST server `_. +sense2vec (`Trask et. al `_, 2015) is a nice +twist on `word2vec `_ that lets you +learn more interesting, detailed and context-sensitive word vectors. For an +interactive example of the technology, see our +`sense2vec demo `_ that lets you explore +semantic similarities across all Reddit comments of 2015. -.. image:: https://travis-ci.org/explosion/sense2vec.svg?branch=master +This library is a simple Python/Cython implementation that lets you load +and query sense2vec vectors. While it's best used in combination with `spaCy `_, the ``sense2vec`` library itself is very lightweight +and can be used as a standalone module. See below for usage details. + +.. image:: https://img.shields.io/travis/explosion/sense2vec/master.svg?style=flat-square :target: https://travis-ci.org/explosion/sense2vec :alt: Build Status -.. image:: https://img.shields.io/pypi/v/sense2vec.svg +.. image:: https://img.shields.io/github/release/explosion/sense2vec.svg?style=flat-square + :target: https://github.com/explosion/sense2vec/releases + :alt: Current Release Version + +.. image:: https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square :target: https://pypi.python.org/pypi/sense2vec :alt: pypi Version +Usage Examples +============== -Overview -======== +Usage with spaCy +---------------- -There are three relevant files in this repository: +.. code:: python -``bin/merge_text.py`` ------------------ + import spacy + from sense2vec import Sense2VecComponent -This script pre-processes text using spaCy, so that the sense2vec model can be trained using Gensim. + nlp = spacy.load('en') + s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0') + nlp.add_pipe(s2v) -``bin/train_word2vec.py`` ---------------------- + doc = nlp(u"A sentence about natural language processing.") + assert doc[3].text == u"natural language processing" + freq = doc[3]._.s2v_freq + vector = doc[3]._.s2v_vec + most_similar = doc[3]._.s2v_most_similar(3) + # [(('natural language processing', 'NOUN'), 1.0), + # (('machine learning', 'NOUN'), 0.8986966609954834), + # (('computer vision', 'NOUN'), 0.8636297583580017)] -This script reads a directory of text files, and then trains a word2vec model using Gensim. The script includes its own -vocabulary counting code, because Gensim's vocabulary count is a bit slow for our large, sparse vocabulary. +Standalone usage without spaCy +------------------------------ -``sense2vec/vectors.pyx`` ---------------------- +.. code:: python -To serve the similarity queries, we wrote a small vector-store class in Cython. This made it easier to add an efficient -cache in front of the service. It also less memory than Gensim's Word2Vec class, as it doesn't hold the keys as Python -unicode strings. + import sense2vec -Similarity queries could be faster, if we had made all vectors contiguous in memory, instead of holding them -as an array of pointers. However, we wanted to allow a ``.borrow()`` method, so that vectors can be added to the store -by reference, without copying the data. 
+ s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') + query = 'natural_language_processing|NOUN' + assert query in s2v + freq, vector = s2v[query] + words, scores = s2v.most_similar(vector, 3) + most_similar = list(zip(words, scores)) + # [('natural language processing|NOUN', 1.0), + # ('machine learning|NOUN', 0.8986966609954834), + # ('computer vision|NOUN', 0.8636297583580017)] -Installation -============ +Installation & Setup +==================== -Until there is a PyPI release you can install sense2vec by: +==================== === +**Operating system** macOS / OS X, Linux, Windows (Cygwin, MinGW, Visual Studio) +**Python version** CPython 2.7, 3.4+. Only 64 bit. +**Package managers** `pip `_ (source packages only) +==================== === -1. cloning the repository -2. run ``pip install -r requirements.txt`` -3. ``pip install -e .`` -4. install the latest model via ``sputnik --name sense2vec --repository-url http://index.spacy.io install reddit_vectors`` +sense2vec releases are available as source packages on pip: + +.. code:: bash + + pip install sense2vec + +The Reddit vectors model is attached to the +`latest release `_. To load it +in, download the ``.tar.gz`` archive, unpack it and point ``sense2vec.load`` to +the extracted data directory: + +.. code:: python -You might also be tempted to simply run ``pip install -e git+git://github.com/spacy-io/sense2vec.git#egg=sense2vec`` instead of steps 1-3, but it expects `Cython `_ to be present. + import sense2vec + s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') Usage ===== +Usage with spaCy v2.x +--------------------- + +The easiest way to use the library and vectors is to plug it into your spaCy +pipeline. Note that ``sense2vec`` doesn't depend on spaCy, so you'll have to +install it separately and download the English model. + +.. code:: bash + + pip install -U spacy + python -m spacy download en + +The ``sense2vec`` package exposes a ``Sense2VecComponent``, which can be +initialised with the data path and added to your spaCy pipeline as a +`custom pipeline component `_. +By default, components are added to the *end of the pipeline*, which is the +recommended position for this component, since it needs access to the dependency +parse and, if available, named entities. + .. code:: python - import sense2vec - model = sense2vec.load() - freq, query_vector = model["natural_language_processing|NOUN"] - model.most_similar(query_vector, n=3) + import spacy + from sense2vec import Sense2VecComponent + + nlp = spacy.load('en') + s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0') + nlp.add_pipe(s2v) + +The pipeline component will **merge noun phrases and entities** according to +the same schema used when training the sense2vec models (e.g. noun chunks +without determiners like "the"). This ensures that you'll be able to retrieve +meaningful vectors for phrases in your text. The component will also add +serveral `extension attributes and methods `_ +to spaCy's ``Token`` and ``Span`` objects that let you retrieve vectors and +frequencies, as well as most similar terms. .. code:: python - (['natural_language_processing|NOUN', 'machine_learning|NOUN', 'computer_vision|NOUN'], ) + doc = nlp(u"A sentence about natural language processing.") + assert doc[3].text == u"natural language processing" + assert doc[3]._.in_s2v + freq = doc[3]._.s2v_freq + vector = doc[3]._.s2v_vec + most_similar = doc[3]._.s2v_most_similar(10) + +For entities, the entity labels are used as the "sense" (instead of the +token's part-of-speech tag): + +.. 
code:: python + + doc = nlp(u"A sentence about Facebook and Google.") + for ent in doc.ents: + assert ent._.in_s2v + most_similar = ent._.s2v_most_similar(3) + +Available attributes +^^^^^^^^^^^^^^^^^^^^ + +The following attributes are available via the `._` property – for example +``token._.in_s2v``: + +==================== ============== ==================== === +Name Attribute Type Type Description +==================== ============== ==================== === +``in_s2v`` property bool Whether a key exists in the vector map. +``s2v_freq`` property int The frequency of the given key. +``s2v_vec`` property ``ndarray[float32]`` The vector of the given key. +``s2v_most_similar`` method list Get the ``n`` most similar terms. Returns a list of ``((word, sense), score)`` tuples. +==================== ============== ==================== === + +**A note on span attributes:** Under the hood, entities in ``doc.ents`` are +``Span`` objects. This is why the pipeline component also adds attributes and +methods to spans and not just tokens. However, it's not recommended to use the +sense2vec attributes on arbitrary slices of the document, since the model likely +won't have a key for the respective text. ``Span`` objects also don't have a +part-of-speech tag, so if no entity label is present, the "sense" defaults to +the root's part-of-speech tag. + +Standalone usage +---------------- + +To use only the ``sense2vec`` library, you can import the package and then call +its ``load()`` method to load in the vectors. + +.. code:: python + + import sense2vec + s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') + +``sense2vec.load`` returns an instance of the ``VectorMap`` class, which you +can interact with via the following methods: + +``VectorMap.__len__`` +^^^^^^^^^^^^^^^^^^^^^ + +The total number of entries in the map. + +=========== ==== === +Argument Type Description +=========== ==== === +**RETURNS** int The number of entries in the map. +=========== ==== === + +.. code:: python + + s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') + assert len(s2v) == 1195261 + +``VectorMap.__contains__`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Check whether the ``VectorMap`` has a given key. Keys consist of the word +string, a pipe and the "sense", i.e. the part-of-speech tag or entity label. +For example: ``'duck|NOUN'`` or ``'duck|VERB'``. See the section on "Senses" +below for more details. Also note that the underlying vector table is +**case-sensitive**. + +=========== ======= === +Argument Type Description +=========== ======= === +``string`` unicode The key to check. +**RETURNS** bool Whether the key is part of the map. +=========== ======= === + +.. code:: python + + assert 'duck|NOUN' in s2v + assert 'duck|VERB' in s2v + assert 'dkdksl|VERB' not in s2v + +``VectorMap.__getitem__`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Retrieve a ``(frequency, vector)`` tuple from the vector map. The frequency is +an integer, the vector a ``numpy.ndarray(dtype='float32')``. If the key is not +found, a ``KeyError`` is raised. + +=========== ======= === +Argument Type Description +=========== ======= === +``string`` unicode The key to retrieve the frequency and vector for. +**RETURNS** tuple The ``(frequency, vector)`` tuple. +=========== ======= === + +.. code:: python + + freq, vector = s2v['duck|NOUN'] + +``VectorMap.__setitem__`` +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Assign a ``(frequency, vector)`` tuple to the vector map. The frequency should +be an integer, the vector a ``numpy.ndarray(dtype='float32')``. 
+ +=========== ======= === +Argument Type Description +=========== ======= === +``key`` unicode The key to assign the frequency and vector to. +``value`` tuple The ``(frequency, vector)`` tuple to assign. +=========== ======= === + +.. code:: python + + freq, vector = s2v['avocado|NOUN'] + s2v['🥑|NOUN'] = (freq, vector) + +``VectorMap.__iter__``, ``VectorMap.keys`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Iterate over the keys in the map, in order of insertion. + +=========== ======= === +Argument Type Description +=========== ======= === +**YIELDS** unicode The keys in the map. +=========== ======= === + +``VectorMap.values`` +^^^^^^^^^^^^^^^^^^^^ + +Iterate over the values in the map, in order of insertion and yield +``(frequency, vector)`` tuples from the vector map. The frequency is an integer, +the vector a ``numpy.ndarray(dtype='float32')`` + +=========== ======= === +Argument Type Description +=========== ======= === +**YIELDS** tuple The values in the map. +=========== ======= === + +``VectorMap.items`` +^^^^^^^^^^^^^^^^^^^ + +Iterate over the items in the map, in order of insertion and yield +``(key, (frequency, vector))`` tuples from the vector map. The frequency is an integer, the vector a ``numpy.ndarray(dtype='float32')`` + +=========== ======= === +Argument Type Description +=========== ======= === +**YIELDS** tuple The items in the map. +=========== ======= === + +``VectorMap.most_similar`` +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Find the keys of the ``n`` most similar entries, given a vector. Note that +the *most* similar entry with a score of ``1.0`` will be the key of the query +vector itself. + +=========== ================================== === +Argument Type Description +=========== ================================== === +``vector`` ``numpy.ndarray(dtype='float32')`` The vector to compare to. +``n`` int The number of entries to return. Defaults to ``10``. +**RETURNS** tuple A ``(words, scores)`` tuple. +=========== ================================== === + +.. code:: python + + freq, vector = s2v['avocado|NOUN'] + words, scores = s2v.most_similar(vector, n=3) + for word, score in zip(words, scores): + print(word, score) + # avocado|NOUN 1.0 + # avacado|NOUN 0.970944344997406 + # spinach|NOUN 0.962776780128479 + +``VectorMap.save`` +^^^^^^^^^^^^^^^^^^ + +Serialize the model to a directory. This will export three files to the output +directory: a ``strings.json`` containing the keys in insertion order, a +``freqs.json`` containing the frequencies and a ``vectors.bin`` containing the +vectors. + +============ ======= === +Argument Type Description +============ ======= === +``data_dir`` unicode The path to the output directory. +============ ======= === + +``VectorMap.load`` +^^^^^^^^^^^^^^^^^^ + +Load a model from a directory. Expects three files in the directory (see +``VectorMap.save`` for details). + +============ ======= === +Argument Type Description +============ ======= === +``data_dir`` unicode The path to load the model from. +============ ======= === + +Senses +====== + +The pre-trained Reddit vectors support the following "senses", either +part-of-speech tags or entity labels. For more details, see spaCy's +`annotation scheme overview `_. 
+ +========= ========================== === +Tag Description Examples +========= ========================== === +``ADJ`` adjective big, old, green +``ADP`` adposition in, to, during +``ADV`` adverb very, tomorrow, down, where +``AUX`` auxiliary is, has (done), will (do) +``CONJ`` conjunction and, or, but +``DET`` determiner a, an, the +``INTJ`` interjection psst, ouch, bravo, hello +``NOUN`` noun girl, cat, tree, air, beauty +``NUM`` numeral 1, 2017, one, seventy-seven, MMXIV +``PART`` particle 's, not +``PRON`` pronoun I, you, he, she, myself, somebody +``PROPN`` proper noun Mary, John, London, NATO, HBO +``PUNCT`` punctuation , ? ( ) +``SCONJ`` subordinating conjunction if, while, that +``SYM`` symbol $, %, =, :), 😝 +``VERB`` verb run, runs, running, eat, ate, eating +========= ========================== === -For additional performance experimental support for BLAS can be enabled by setting the `USE_BLAS` environment variable before installing (e.g. ``USE_BLAS=1 pip install ...``). This requires an up-to-date BLAS/OpenBlas/Atlas installation. +=============== === +Entity Label Description +=============== === +``PERSON`` People, including fictional. +``NORP`` Nationalities or religious or political groups. +``FACILITY`` Buildings, airports, highways, bridges, etc. +``ORG`` Companies, agencies, institutions, etc. +``GPE`` Countries, cities, states. +``LOC`` Non-GPE locations, mountain ranges, bodies of water. +``PRODUCT`` Objects, vehicles, foods, etc. (Not services.) +``EVENT`` Named hurricanes, battles, wars, sports events, etc. +``WORK_OF_ART`` Titles of books, songs, etc. +``LANGUAGE`` Any named language. +=============== === -Support -======= +Training a sense2vec model +========================== -* CPython 2.6, 2.7, 3.3, 3.4, 3.5 (only 64 bit) -* OSX -* Linux -* Windows +**🚧 TODO:** Update training scripts for spaCy v2.x. From 6b4dd685f74c73bc8fcb2202dab76d8b630a1262 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 16:39:15 +0200 Subject: [PATCH 014/297] Install dev requirements on Travis --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index d4ca784..9696454 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ python: install: - pip install -U numpy - pip install -r requirements.txt + - pip install -r requirements-dev.txt - pip install -e . script: From 7db0b6c86e60aa6419f575e838ba788e38a1c0ed Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 17:08:47 +0200 Subject: [PATCH 015/297] Update README --- README.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 79f7859..ad79237 100644 --- a/README.rst +++ b/README.rst @@ -8,9 +8,10 @@ interactive example of the technology, see our `sense2vec demo `_ that lets you explore semantic similarities across all Reddit comments of 2015. -This library is a simple Python/Cython implementation that lets you load -and query sense2vec vectors. While it's best used in combination with `spaCy `_, the ``sense2vec`` library itself is very lightweight -and can be used as a standalone module. See below for usage details. +This library is a simple Python/Cython implementation for loading and querying +sense2vec models. While it's best used in combination with +`spaCy `_, the ``sense2vec`` library itself is very lightweight +and can also be used as a standalone module. See below for usage details. .. 
image:: https://img.shields.io/travis/explosion/sense2vec/master.svg?style=flat-square :target: https://travis-ci.org/explosion/sense2vec From 80e51f033b07f4d65634c1c2565b94d992b8cba4 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 17:25:09 +0200 Subject: [PATCH 016/297] Fix requirements in setup.py --- setup.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index fb080ba..c66049e 100644 --- a/setup.py +++ b/setup.py @@ -110,10 +110,6 @@ def setup_package(): get_python_inc(plat_specific=True), os.path.join(root, 'include')] - # Read in requirements and split into packages and URLs - with open(os.path.join(root, 'requirements.txt')) as f: - requirements = [line.strip() for line in f] - if (ccompiler.new_compiler().compiler_type == 'msvc' and msvccompiler.get_build_version() == 9): include_dirs.append(os.path.join(root, 'include', 'msvc9')) @@ -140,7 +136,13 @@ def setup_package(): url=about['__uri__'], license=about['__license__'], ext_modules=ext_modules, - install_requires=requirements, + install_requires=[ + 'numpy>=1.7', + 'ujson>=1.35', + 'preshed>=1.0.0,<2.0.0', + 'murmurhash>=0.28,<0.29', + 'cymem>=1.30,<1.32' + ], classifiers=[ 'Development Status :: 4 - Beta', 'Environment :: Console', From 030dadfbaf93a12a70446371df9741652a60d79f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Apr 2018 20:28:23 +0200 Subject: [PATCH 017/297] Update README.rst --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index ad79237..952bf69 100644 --- a/README.rst +++ b/README.rst @@ -13,6 +13,8 @@ sense2vec models. While it's best used in combination with `spaCy `_, the ``sense2vec`` library itself is very lightweight and can also be used as a standalone module. See below for usage details. +🦆 **Version 1.0 alpha out now!** `Read the release notes here. `_ + .. image:: https://img.shields.io/travis/explosion/sense2vec/master.svg?style=flat-square :target: https://travis-ci.org/explosion/sense2vec :alt: Build Status From 4c5ddfc79ad6f1776368d22e96468d80e9ac60ce Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 8 Apr 2018 20:34:51 +0200 Subject: [PATCH 018/297] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 952bf69..3d1f5bb 100644 --- a/README.rst +++ b/README.rst @@ -19,7 +19,7 @@ and can also be used as a standalone module. See below for usage details. :target: https://travis-ci.org/explosion/sense2vec :alt: Build Status -.. image:: https://img.shields.io/github/release/explosion/sense2vec.svg?style=flat-square +.. 
image:: https://img.shields.io/github/release/explosion/sense2vec/all.svg?style=flat-square :target: https://github.com/explosion/sense2vec/releases :alt: Current Release Version From 87240f58982fdbe78ef0bd159e0f48c75bba1a57 Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 20:41:22 +0200 Subject: [PATCH 019/297] Use explicit unicode strings in examples --- README.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/README.rst b/README.rst index 3d1f5bb..9288f3c 100644 --- a/README.rst +++ b/README.rst @@ -43,7 +43,7 @@ Usage with spaCy nlp.add_pipe(s2v) doc = nlp(u"A sentence about natural language processing.") - assert doc[3].text == u"natural language processing" + assert doc[3].text == u'natural language processing' freq = doc[3]._.s2v_freq vector = doc[3]._.s2v_vec most_similar = doc[3]._.s2v_most_similar(3) @@ -59,7 +59,7 @@ Standalone usage without spaCy import sense2vec s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') - query = 'natural_language_processing|NOUN' + query = u'natural_language_processing|NOUN' assert query in s2v freq, vector = s2v[query] words, scores = s2v.most_similar(vector, 3) @@ -135,7 +135,7 @@ frequencies, as well as most similar terms. .. code:: python doc = nlp(u"A sentence about natural language processing.") - assert doc[3].text == u"natural language processing" + assert doc[3].text == u'natural language processing' assert doc[3]._.in_s2v freq = doc[3]._.s2v_freq vector = doc[3]._.s2v_vec @@ -222,9 +222,9 @@ Argument Type Description .. code:: python - assert 'duck|NOUN' in s2v - assert 'duck|VERB' in s2v - assert 'dkdksl|VERB' not in s2v + assert u'duck|NOUN' in s2v + assert u'duck|VERB' in s2v + assert u'dkdksl|VERB' not in s2v ``VectorMap.__getitem__`` ^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -242,7 +242,7 @@ Argument Type Description .. code:: python - freq, vector = s2v['duck|NOUN'] + freq, vector = s2v[u'duck|NOUN'] ``VectorMap.__setitem__`` ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -259,8 +259,8 @@ Argument Type Description .. code:: python - freq, vector = s2v['avocado|NOUN'] - s2v['🥑|NOUN'] = (freq, vector) + freq, vector = s2v[u'avocado|NOUN'] + s2v[u'🥑|NOUN'] = (freq, vector) ``VectorMap.__iter__``, ``VectorMap.keys`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -315,7 +315,7 @@ Argument Type Description .. 
code:: python - freq, vector = s2v['avocado|NOUN'] + freq, vector = s2v[u'avocado|NOUN'] words, scores = s2v.most_similar(vector, n=3) for word, score in zip(words, scores): print(word, score) From d5d34e34da2ff51dd5715a871cc903f64f8e17df Mon Sep 17 00:00:00 2001 From: ines Date: Sun, 8 Apr 2018 22:22:17 +0200 Subject: [PATCH 020/297] Remove deleted file from manifest --- MANIFEST.in | 1 - 1 file changed, 1 deletion(-) diff --git a/MANIFEST.in b/MANIFEST.in index e15d9de..6977488 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,3 @@ recursive-include include *.h -include buildbot.json include LICENSE include README.rst From a196a6267d01c179aab52cf25bd082f585ef42e1 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 9 Apr 2018 00:15:47 +0200 Subject: [PATCH 021/297] Fix key format and add note --- README.rst | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 9288f3c..c2c6c81 100644 --- a/README.rst +++ b/README.rst @@ -64,9 +64,9 @@ Standalone usage without spaCy freq, vector = s2v[query] words, scores = s2v.most_similar(vector, 3) most_similar = list(zip(words, scores)) - # [('natural language processing|NOUN', 1.0), - # ('machine learning|NOUN', 0.8986966609954834), - # ('computer vision|NOUN', 0.8636297583580017)] + # [('natural_language_processing|NOUN', 1.0), + # ('machine_learning|NOUN', 0.8986966609954834), + # ('computer_vision|NOUN', 0.8636297583580017)] Installation & Setup ==================== @@ -186,7 +186,13 @@ its ``load()`` method to load in the vectors. s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') ``sense2vec.load`` returns an instance of the ``VectorMap`` class, which you -can interact with via the following methods: +can interact with via the following methods. + +⚠️ **Important note:** When interacting with the ``VectorMap`` directly, the +keys need to follow the scheme of ``phrase_text|SENSE`` (note the ``_`` instead +of spaces and the ``|`` before the tag or label) – for example, +``machine_learning|NOUN``. Also note that the underlying vector table is +case-sensitive. ``VectorMap.__len__`` ^^^^^^^^^^^^^^^^^^^^^ From d2854af24dcfe951e144236a3efa121ff29b36c4 Mon Sep 17 00:00:00 2001 From: ines Date: Tue, 10 Apr 2018 15:22:25 +0200 Subject: [PATCH 022/297] Raise error early if vectors path doesn't exist (see #44) --- sense2vec/__init__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 6fca047..0ded5a3 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,11 +1,15 @@ # coding: utf8 from __future__ import unicode_literals +from os import path + from .vectors import VectorMap from .about import __version__ def load(vectors_path): + if not path.exists(vectors_path): + raise IOError("Can't find data directory: {}".format(vectors_path)) vector_map = VectorMap(128) vector_map.load(vectors_path) return vector_map From 702d9a1e10916e656b6c0cd938fc4ffe12eaeb59 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 19 Apr 2018 17:59:12 -0400 Subject: [PATCH 023/297] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index c2c6c81..5838c9f 100644 --- a/README.rst +++ b/README.rst @@ -81,7 +81,7 @@ sense2vec releases are available as source packages on pip: .. code:: bash - pip install sense2vec + pip install sense2vec==1.0.0a0 The Reddit vectors model is attached to the `latest release `_. 
To load it From 198a7c3fa3a4447a1ecc0bea057ab1b96e431999 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 29 Apr 2018 01:49:04 +0200 Subject: [PATCH 024/297] Add encoding See explosion/spaCy/#2271 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c66049e..e585116 100644 --- a/setup.py +++ b/setup.py @@ -103,7 +103,7 @@ def setup_package(): about = {} exec(f.read(), about) - with open(os.path.join(root, 'README.rst')) as f: + with open(os.path.join(root, 'README.rst'), encoding='utf8') as f: readme = f.read() include_dirs = [ From b0174715a5e764f16acb19197c99ecc7a425de93 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 29 Apr 2018 01:53:16 +0200 Subject: [PATCH 025/297] Fix open calls and use io.open --- setup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index e585116..8395695 100644 --- a/setup.py +++ b/setup.py @@ -2,6 +2,7 @@ from __future__ import print_function import os +import io import shutil import subprocess import sys @@ -99,11 +100,11 @@ def setup_package(): return clean(root) with chdir(root): - with open(os.path.join(root, src_path, 'about.py')) as f: + with io.open(os.path.join(root, src_path, 'about.py'), encoding='utf8') as f: about = {} exec(f.read(), about) - with open(os.path.join(root, 'README.rst'), encoding='utf8') as f: + with io.open(os.path.join(root, 'README.rst'), encoding='utf8') as f: readme = f.read() include_dirs = [ From 566e6cb2f8f23c83012eddd3f50659be8fb4cf86 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 1 Jun 2018 15:50:17 +0200 Subject: [PATCH 026/297] Ignore tmp directory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 55d3e33..e310d67 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ +tmp/ data/ cythonize.dat *.cpp From 237c2140eabfd556ee6e68792e6f9e7ded31b3a2 Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 1 Jun 2018 15:50:37 +0200 Subject: [PATCH 027/297] Move transform_doc out into own function --- sense2vec/__init__.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 0ded5a3..c78ebda 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -15,6 +15,24 @@ def load(vectors_path): return vector_map +def transform_doc(doc): + """ + Transform a spaCy Doc to match the sense2vec format: merge entities + into one token and merge noun chunks without determiners. + """ + #if not doc.is_tagged: + # raise ValueError("Can't run sense2vec: document not tagged.") + for ent in doc.ents: + ent.merge(tag=ent.root.tag_, lemma=ent.root.lemma_, + ent_type=ent.label_) + for np in doc.noun_chunks: + while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): + np = np[1:] + np.merge(tag=np.root.tag_, lemma=np.root.lemma_, + ent_type=np.root.ent_type_) + return doc + + class Sense2VecComponent(object): """ spaCy v2.0 pipeline component. 
@@ -40,16 +58,7 @@ def __call__(self, doc): if self.first_run: self.init_component(doc) self.first_run = False - if not doc.is_tagged: - raise ValueError("Can't run sense2vec: document not tagged.") - for ent in doc.ents: - ent.merge(tag=ent.root.tag_, lemma=ent.root.lemma_, - ent_type=ent.label_) - for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): - np = np[1:] - np.merge(tag=np.root.tag_, lemma=np.root.lemma_, - ent_type=np.root.ent_type_) + doc = transform_doc(doc) return doc def init_component(self, doc): From 2ec352672e00e0e6692eb671f00223e56d1139cd Mon Sep 17 00:00:00 2001 From: ines Date: Fri, 1 Jun 2018 16:00:12 +0200 Subject: [PATCH 028/297] Add preprocessing script WIP --- bin/preprocess.py | 73 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) create mode 100644 bin/preprocess.py diff --git a/bin/preprocess.py b/bin/preprocess.py new file mode 100644 index 0000000..f3ad1e9 --- /dev/null +++ b/bin/preprocess.py @@ -0,0 +1,73 @@ +#!/usr/bin/env python +# coding: utf-8 +"""This script can be used to preprocess a corpus for training a sense2vec +model. It take text file with one sentence per line, and outputs a text file +with one sentence per line in the expected sense2vec format (merged noun +phrases, concatenated phrases with underscores and added "senses"). + +Example input: +Rats, mould and broken furniture: the scandal of the UK's refugee housing + +Example output: +Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT +the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN + +DISCLAIMER: The sense2vec training and preprocessing tools are still a work in +progress. Please note that this script hasn't been optimised for efficiency yet +and doesn't paralellize or batch up any of the work, so you might have to +add this functionality yourself for now. +""" +from __future__ import print_function, unicode_literals + +from sense2vec import transform_doc +import spacy +from pathlib import Path +from tqdm import tqdm +import re +import plac + + +def represent_word(word): + if word.like_url: + return '%%URL|X' + text = re.sub(r'\s', '_', word.text) + tag = word.ent_type_ or word.pos_ or '?' 
+ return text + '|' + tag + + +def represent_doc(doc): + strings = [] + for sent in doc.sents: + if sent.text.strip(): + words = ' '.join(represent_word(w) for w in sent if not w.is_space) + strings.append(words) + return '\n'.join(strings) + '\n' if strings else '' + + +@plac.annotations( + in_file=("Path to input file", "positional", None, str), + out_file=("Path to output file", "positional", None, str), + spacy_model=("Name of spaCy model to use", "positional", None, str), + n_workers=("Number of workers", "option", "n", int)) +def main(in_file, out_file, spacy_model='en_core_web_sm', n_workers=4): + input_path = Path(in_file) + output_path = Path(out_file) + if not input_path.exists(): + raise IOError("Can't find input file: {}".format(input_path)) + nlp = spacy.load(spacy_model) + print("Using spaCy model {}".format(spacy_model)) + nlp.add_pipe(transform_doc, name='sense2vec') + lines_count = 0 + with input_path.open('r', encoding='utf8') as texts: + docs = nlp.pipe(texts, n_threads=n_workers) + lines = (represent_doc(doc) for doc in docs) + with output_path.open('w', encoding='utf8') as f: + for line in tqdm(lines, desc='Lines', unit=''): + lines_count += 1 + f.write(line) + print("Successfully preprocessed {} lines".format(lines_count)) + print("{}".format(output_path.resolve())) + + +if __name__ == '__main__': + plac.call(main) From 367f1ebfc0608486ef6633f61d3270b9d93494f7 Mon Sep 17 00:00:00 2001 From: Niek Bartholomeus Date: Fri, 20 Jul 2018 12:23:28 +0200 Subject: [PATCH 029/297] fix the bug that hard codes 128 dimensions in vectors.pyx --- sense2vec/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx index 6543b31..63c9769 100644 --- a/sense2vec/vectors.pyx +++ b/sense2vec/vectors.pyx @@ -313,7 +313,7 @@ cdef class VectorStore: for i in range(nr_vector): cfile.read_into(&tmp[0], self.nr_dim, sizeof(tmp[0])) ptr = &tmp[0] - cv = ptr + cv = ptr if i >= 1: self.add(cv) cfile.close() From 5ff720b3fc5b2ad90d25946cfcc03c667ef3c9be Mon Sep 17 00:00:00 2001 From: Niek Bartholomeus Date: Fri, 20 Jul 2018 13:20:12 +0200 Subject: [PATCH 030/297] fix the bug that hard codes 128 dimensions in vectors.pyx --- sense2vec/vectors.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx index 63c9769..0944ac3 100644 --- a/sense2vec/vectors.pyx +++ b/sense2vec/vectors.pyx @@ -313,7 +313,7 @@ cdef class VectorStore: for i in range(nr_vector): cfile.read_into(&tmp[0], self.nr_dim, sizeof(tmp[0])) ptr = &tmp[0] - cv = ptr + cv = ptr if i >= 1: self.add(cv) cfile.close() From a1cdec183c39138ca24106a0e072c33fabb4809f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 May 2019 11:02:59 +0200 Subject: [PATCH 031/297] WIP: add training script --- bin/train.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 bin/train.py diff --git a/bin/train.py b/bin/train.py new file mode 100644 index 0000000..6f3a121 --- /dev/null +++ b/bin/train.py @@ -0,0 +1,42 @@ +from gensim.models import Word2Vec +from gensim.models.word2vec import PathLineSentences +from sense2vec.vectors import VectorMap +import plac + + +@plac.annotations( + in_dir=("Location of input directory", "positional", None, str), + out_file=("Location of output file", "positional", None, str), + n_workers=("Number of workers", "option", "n", int), + size=("Dimension of the word vectors", "option", "d", int), + window=("Context window size", "option", "w", int), + 
min_count=("Min count", "option", "m", int), + negative=("Number of negative samples", "option", "g", int), + nr_iter=("Number of iterations", "option", "i", int),) +def train(in_dir, out_file, negative=5, n_workers=4, window=5, size=128, + min_count=10, nr_iter=2): + w2v_model = Word2Vec(size=size, window=window, min_count=min_count, + workers=workers, sample=1e-5, negative=negative, + iter=epochs) + sentences = PathLineSentences(in_dir) + print("Building the vocabulary...") + w2v_model.build_vocab(sentences) + print("Training the model...") + w2v_model.train(sentences, total_examples=w2v_model.corpus_count, + epochs=w2v_model.iter) + print("Creating the sense2vec model...") + vector_map = VectorMap(size) + for string in w2v_model.wv.vocab: + vocab = w2v_model.wv.vocab[string] + freq, idx = vocab.count, vocab.index + if freq < min_count: + continue + vector = w2v_model.wv.vectors[idx] + vector_map.borrow(string, freq, vector) + print("Saving the model...") + vector_map.save(out_file) + print("Saved model to file: ", out_file) + + +if __name__ == '__main__': + plac.call(main) From 59726a751dee51363778b83fcb6728b5cf4867e7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 May 2019 11:03:04 +0200 Subject: [PATCH 032/297] Auto-format --- setup.py | 148 +++++++++++++++++++++++++++---------------------------- 1 file changed, 73 insertions(+), 75 deletions(-) diff --git a/setup.py b/setup.py index 8395695..346a30e 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,6 @@ import os import io -import shutil import subprocess import sys import contextlib @@ -17,64 +16,58 @@ from distutils.core import Extension, setup -PACKAGES = [ - 'sense2vec', - 'sense2vec.tests' -] +PACKAGES = ["sense2vec", "sense2vec.tests"] -MOD_NAMES = [ - 'sense2vec.vectors', - 'sense2vec.cfile', - 'sense2vec._strings' -] +MOD_NAMES = ["sense2vec.vectors", "sense2vec.cfile", "sense2vec._strings"] # By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options # http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used -compile_options = {'msvc' : ['/Ox', '/EHsc'], - 'other' : ['-O3', '-Wno-unused-function', - '-fno-stack-protector']} -link_options = {'msvc' : [], - 'other' : ['-fno-stack-protector']} - - -if os.environ.get('USE_BLAS') == '1': - compile_options['other'].extend([ - '-DUSE_BLAS=1', - '-fopenmp']) -#else: +compile_options = { + "msvc": ["/Ox", "/EHsc"], + "other": ["-O3", "-Wno-unused-function", "-fno-stack-protector"], +} +link_options = {"msvc": [], "other": ["-fno-stack-protector"]} + + +if os.environ.get("USE_BLAS") == "1": + compile_options["other"].extend(["-DUSE_BLAS=1", "-fopenmp"]) +# else: # link_options['other'].extend([ # '-fopenmp']) # + class build_ext_subclass(build_ext): def build_extensions(self): for e in self.extensions: e.extra_compile_args = compile_options.get( - self.compiler.compiler_type, compile_options['other']) + self.compiler.compiler_type, compile_options["other"] + ) for e in self.extensions: e.extra_link_args = link_options.get( - self.compiler.compiler_type, link_options['other']) + self.compiler.compiler_type, link_options["other"] + ) build_ext.build_extensions(self) def generate_cython(root, source): - print('Cythonizing sources') - p = subprocess.call([sys.executable, - os.path.join(root, 'bin', 'cythonize.py'), - source]) + print("Cythonizing sources") + p = subprocess.call( + [sys.executable, os.path.join(root, "bin", "cythonize.py"), source] + ) if p != 0: - 
raise RuntimeError('Running cythonize failed') + raise RuntimeError("Running cythonize failed") def is_source_release(path): - return os.path.exists(os.path.join(path, 'PKG-INFO')) + return os.path.exists(os.path.join(path, "PKG-INFO")) def clean(path): for name in MOD_NAMES: - name = name.replace('.', '/') - for ext in ['.so', '.html', '.cpp', '.c']: + name = name.replace(".", "/") + for ext in [".so", ".html", ".cpp", ".c"]: file_path = os.path.join(path, name + ext) if os.path.exists(file_path): os.unlink(file_path) @@ -94,77 +87,82 @@ def chdir(new_dir): def setup_package(): root = os.path.abspath(os.path.dirname(__file__)) - src_path = 'sense2vec' + src_path = "sense2vec" - if len(sys.argv) > 1 and sys.argv[1] == 'clean': + if len(sys.argv) > 1 and sys.argv[1] == "clean": return clean(root) with chdir(root): - with io.open(os.path.join(root, src_path, 'about.py'), encoding='utf8') as f: + with io.open(os.path.join(root, src_path, "about.py"), encoding="utf8") as f: about = {} exec(f.read(), about) - with io.open(os.path.join(root, 'README.rst'), encoding='utf8') as f: + with io.open(os.path.join(root, "README.rst"), encoding="utf8") as f: readme = f.read() include_dirs = [ get_python_inc(plat_specific=True), - os.path.join(root, 'include')] + os.path.join(root, "include"), + ] - if (ccompiler.new_compiler().compiler_type == 'msvc' - and msvccompiler.get_build_version() == 9): - include_dirs.append(os.path.join(root, 'include', 'msvc9')) + if ( + ccompiler.new_compiler().compiler_type == "msvc" + and msvccompiler.get_build_version() == 9 + ): + include_dirs.append(os.path.join(root, "include", "msvc9")) ext_modules = [] for mod_name in MOD_NAMES: - mod_path = mod_name.replace('.', '/') + '.cpp' + mod_path = mod_name.replace(".", "/") + ".cpp" ext_modules.append( - Extension(mod_name, [mod_path], - language='c++', include_dirs=include_dirs)) + Extension( + mod_name, [mod_path], language="c++", include_dirs=include_dirs + ) + ) if not is_source_release(root): generate_cython(root, src_path) setup( - name=about['__title__'], + name=about["__title__"], zip_safe=False, packages=PACKAGES, - package_data={'': ['*.pyx', '*.pxd', '*.h']}, - description=about['__summary__'], - author=about['__author__'], - author_email=about['__email__'], - version=about['__version__'], - url=about['__uri__'], - license=about['__license__'], + package_data={"": ["*.pyx", "*.pxd", "*.h"]}, + description=about["__summary__"], + author=about["__author__"], + author_email=about["__email__"], + version=about["__version__"], + url=about["__uri__"], + license=about["__license__"], ext_modules=ext_modules, install_requires=[ - 'numpy>=1.7', - 'ujson>=1.35', - 'preshed>=1.0.0,<2.0.0', - 'murmurhash>=0.28,<0.29', - 'cymem>=1.30,<1.32' + "numpy>=1.7", + "ujson>=1.35", + "preshed>=1.0.0,<2.0.0", + "murmurhash>=0.28,<0.29", + "cymem>=1.30,<1.32", ], classifiers=[ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: MIT License', - 'Operating System :: POSIX :: Linux', - 'Operating System :: MacOS :: MacOS X', - 'Operating System :: Microsoft :: Windows', - 'Programming Language :: Cython', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', - 'Programming Language :: Python :: 3.5', - 'Programming Language :: Python :: 3.6', - 'Topic :: Scientific/Engineering'], - cmdclass = { - 'build_ext': 
build_ext_subclass}, + "Development Status :: 4 - Beta", + "Environment :: Console", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows", + "Programming Language :: Cython", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Topic :: Scientific/Engineering", + ], + cmdclass={"build_ext": build_ext_subclass}, ) -if __name__ == '__main__': +if __name__ == "__main__": setup_package() From c03d43bf96ddbc16bd4a70ffa8f930b0331223ea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 29 May 2019 11:03:13 +0200 Subject: [PATCH 033/297] Use string name for GitHub's dependents parser --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 346a30e..6f7a817 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,7 @@ def setup_package(): generate_cython(root, src_path) setup( - name=about["__title__"], + name="sense2vec", zip_safe=False, packages=PACKAGES, package_data={"": ["*.pyx", "*.pxd", "*.h"]}, From ed7993aca7034bc267c8b329f7d1965ba6d7ad71 Mon Sep 17 00:00:00 2001 From: vincent d warmerdam Date: Fri, 5 Jul 2019 11:42:20 +0200 Subject: [PATCH 034/297] readme update fix the spacy version to prevent errors --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 5838c9f..3de2730 100644 --- a/README.rst +++ b/README.rst @@ -105,7 +105,7 @@ install it separately and download the English model. .. 
code:: bash - pip install -U spacy + pip install -U spacy==2.0.0 python -m spacy download en The ``sense2vec`` package exposes a ``Sense2VecComponent``, which can be From 76ecdaac0ad2fe864ec9b0904df3b8de27a25cc1 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 5 Sep 2019 13:39:56 +0200 Subject: [PATCH 035/297] Increment versions --- requirements.txt | 9 ++++----- sense2vec/about.py | 2 +- setup.py | 10 +++++----- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/requirements.txt b/requirements.txt index 3c9b44d..cc8c356 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -numpy>=1.7 -ujson>=1.35 -preshed>=1.0.0,<2.0.0 -murmurhash>=0.28,<0.29 -cymem>=1.30,<1.32 +numpy>=1.15.0 +cymem>=2.0.2,<2.1.0 +murmurhash>=0.28.0,<1.1.0 +preshed>=2.0.1,<2.1.0 diff --git a/sense2vec/about.py b/sense2vec/about.py index 47a994d..0393db4 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,5 +1,5 @@ __title__ = 'sense2vec' -__version__ = '1.0.0a0' +__version__ = '1.1.0' __summary__ = 'Use NLP to go beyond vanilla word2vec' __uri__ = 'https://github.com/explosion/sense2vec' __author__ = 'Explosion AI' diff --git a/setup.py b/setup.py index 6f7a817..37a1c85 100644 --- a/setup.py +++ b/setup.py @@ -136,11 +136,11 @@ def setup_package(): license=about["__license__"], ext_modules=ext_modules, install_requires=[ - "numpy>=1.7", - "ujson>=1.35", - "preshed>=1.0.0,<2.0.0", - "murmurhash>=0.28,<0.29", - "cymem>=1.30,<1.32", + "numpy>=1.15.0", + "srsly>=0.1.0,<1.1.0", + "preshed>=2.0.1,<2.1.0", + "murmurhash>=0.28.0,<1.1.0", + "cymem>=2.0.2,<2.1.0", ], classifiers=[ "Development Status :: 4 - Beta", From a4b26d6190cd20e92ae375ab7320d05670482e89 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 5 Sep 2019 13:47:00 +0200 Subject: [PATCH 036/297] Require srsly --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index cc8c356..c1c4cb2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ numpy>=1.15.0 cymem>=2.0.2,<2.1.0 murmurhash>=0.28.0,<1.1.0 preshed>=2.0.1,<2.1.0 +srsly>=0.1.0,<1.1.0 From c3e7780d68c30c5853add3e00b09546d21cddfc8 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 5 Sep 2019 13:47:13 +0200 Subject: [PATCH 037/297] Use srsly instead of ujson --- sense2vec/_strings.pyx | 7 +++---- sense2vec/vectors.pyx | 11 +++-------- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/sense2vec/_strings.pyx b/sense2vec/_strings.pyx index b3c483d..1f9da11 100644 --- a/sense2vec/_strings.pyx +++ b/sense2vec/_strings.pyx @@ -3,11 +3,11 @@ from __future__ import unicode_literals, absolute_import cimport cython +import srsly from libc.string cimport memcpy from libc.stdint cimport uint64_t, uint32_t from murmurhash.mrmr cimport hash64, hash32 from preshed.maps cimport map_iter, key_t -import ujson import os @@ -242,7 +242,7 @@ cdef class StringStore: None """ with open(path, 'w') as file_: - string_data = ujson.dumps(list(self)) + string_data = srsly.json_dumps(list(self)) if not isinstance(string_data, unicode): string_data = string_data.decode('utf8') file_.write(string_data) @@ -256,8 +256,7 @@ cdef class StringStore: Returns: None """ - with open(path) as file_: - strings = ujson.load(file_) + strings = srsly.read_json(path) if strings == ['']: return None cdef unicode string diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx index 6543b31..652cf43 100644 --- a/sense2vec/vectors.pyx +++ b/sense2vec/vectors.pyx @@ -18,11 +18,8 @@ from murmurhash.mrmr cimport 
hash64 from cymem.cymem cimport Pool cimport numpy as np import numpy +import srsly from os import path -try: - import ujson as json -except ImportError: - import json from .cfile cimport CFile from ._strings cimport StringStore, hash_string @@ -195,8 +192,7 @@ cdef class VectorMap: if not freq: continue freqs.append([string, freq]) - with open(path.join(data_dir, 'freqs.json'), 'w') as file_: - json.dump(freqs, file_) + srsly.write_json(path.join(data_dir, "freqs.json")) def load(self, data_dir): '''Load from a directory: @@ -207,8 +203,7 @@ cdef class VectorMap: ''' self.data.load(path.join(data_dir, 'vectors.bin')) self.strings.from_disk(path.join(data_dir, 'strings.json')) - with open(path.join(data_dir, 'freqs.json')) as file_: - freqs = json.load(file_) + freqs = srsly.read_json(path.join(data_dir, "freqs.json")) cdef uint64_t hashed for string, freq in freqs: hashed = hash_string(string) From db7962184a2a33dc3b82c63d34df12cd55e390f3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 14:04:31 +0200 Subject: [PATCH 038/297] Tidy up and merge requirements --- .travis.yml | 15 --------------- requirements-dev.txt | 2 -- requirements.txt | 3 +++ 3 files changed, 3 insertions(+), 17 deletions(-) delete mode 100644 .travis.yml delete mode 100644 requirements-dev.txt diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9696454..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -language: python - -python: - - "2.7" - - "3.5" - - "3.6" - -install: - - pip install -U numpy - - pip install -r requirements.txt - - pip install -r requirements-dev.txt - - pip install -e . - -script: - - python -m pytest sense2vec diff --git a/requirements-dev.txt b/requirements-dev.txt deleted file mode 100644 index d44b81f..0000000 --- a/requirements-dev.txt +++ /dev/null @@ -1,2 +0,0 @@ -cython>=0.24,<0.28.0 -pytest>=3.0.6,<4.0.0 diff --git a/requirements.txt b/requirements.txt index c1c4cb2..2a38b3a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,3 +3,6 @@ cymem>=2.0.2,<2.1.0 murmurhash>=0.28.0,<1.1.0 preshed>=2.0.1,<2.1.0 srsly>=0.1.0,<1.1.0 +# Development requirements +cython>=0.24,<0.28.0 +pytest>=3.0.6,<4.0.0 From 2ded85701c95e4747f7e1df5ab6eb073fdfd14bd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 14:05:03 +0200 Subject: [PATCH 039/297] Set up CI with Azure Pipelines [skip ci] --- azure-pipelines.yml | 61 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 azure-pipelines.yml diff --git a/azure-pipelines.yml b/azure-pipelines.yml new file mode 100644 index 0000000..e6e7df7 --- /dev/null +++ b/azure-pipelines.yml @@ -0,0 +1,61 @@ +trigger: + batch: true + branches: + include: + - '*' + +jobs: +- job: 'Test' + strategy: + matrix: + Python27Linux: + imageName: 'ubuntu-16.04' + python.version: '2.7' + Python27Mac: + imageName: 'macos-10.13' + python.version: '2.7' + Python35Linux: + imageName: 'ubuntu-16.04' + python.version: '3.5' + Python35Windows: + imageName: 'vs2017-win2016' + python.version: '3.5' + Python35Mac: + imageName: 'macos-10.13' + python.version: '3.5' + Python36Linux: + imageName: 'ubuntu-16.04' + python.version: '3.6' + Python36Windows: + imageName: 'vs2017-win2016' + python.version: '3.6' + Python36Mac: + imageName: 'macos-10.13' + python.version: '3.6' + Python37Linux: + imageName: 'ubuntu-16.04' + python.version: '3.7' + Python37Windows: + imageName: 'vs2017-win2016' + python.version: '3.7' + Python37Mac: + imageName: 'macos-10.13' + python.version: '3.7' + maxParallel: 4 + 
pool: + vmImage: $(imageName) + + steps: + - task: UsePythonVersion@0 + inputs: + versionSpec: '$(python.version)' + architecture: 'x64' + + - script: pip install -r requirements.txt + displayName: 'Install dependencies' + + - script: pip install -e . + displayName: 'Build and install' + + - script: python -m pytest sense2vec + displayName: 'Run tests' From 592a1b34d86476f9a60fbe31b55cc74973da52a4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 14:30:47 +0200 Subject: [PATCH 040/297] Update README --- MANIFEST.in | 2 +- README.md | 347 ++++++++++++++++++++++++++++++++++++++++++++ README.rst | 404 ---------------------------------------------------- setup.py | 4 +- 4 files changed, 351 insertions(+), 406 deletions(-) create mode 100644 README.md delete mode 100644 README.rst diff --git a/MANIFEST.in b/MANIFEST.in index 6977488..43d36be 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,3 @@ recursive-include include *.h include LICENSE -include README.rst +include README.md diff --git a/README.md b/README.md new file mode 100644 index 0000000..81c4371 --- /dev/null +++ b/README.md @@ -0,0 +1,347 @@ + + +# sense2vec: Use NLP to go beyond vanilla word2vec + +sense2vec [Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice +twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you +learn more interesting, detailed and context-sensitive word vectors. For an +interactive example of the technology, see our +[sense2vec demo](https://demos.explosion.ai/sense2vec) that lets you explore +semantic similarities across all Reddit comments of 2015. + +This library is a simple Python/Cython implementation for loading and querying +sense2vec models. While it's best used in combination with +[spaCy](https://spacy.io), the `sense2vec` library itself is very lightweight +and can also be used as a standalone module. See below for usage details. 
+ +🦆 **Version 1.0 alpha out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) + +[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) +[![Current Release Version](https://img.shields.io/github/v/release/explosion/sense2vec.svg?style=flat-square&include_prereleases)](https://github.com/explosion/sense2vec/releases) +[![pypi Version](https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square)](https://pypi.org/project/sense2vec/) + +## Usage Examples + +### Usage with spaCy + +```python +import spacy +from sense2vec import Sense2VecComponent + +nlp = spacy.load("en_core_web_sm") +s2v = Sense2VecComponent("/path/to/reddit_vectors-1.1.0") +nlp.add_pipe(s2v) + +doc = nlp("A sentence about natural language processing.") +assert doc[3].text == "natural language processing" +freq = doc[3]._.s2v_freq +vector = doc[3]._.s2v_vec +most_similar = doc[3]._.s2v_most_similar(3) +# [(('natural language processing', 'NOUN'), 1.0), +# (('machine learning', 'NOUN'), 0.8986966609954834), +# (('computer vision', 'NOUN'), 0.8636297583580017)] +``` + +### Standalone usage without spaCy + +```python +import sense2vec + +s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") +query = "natural_language_processing|NOUN" +assert query in s2v +freq, vector = s2v[query] +words, scores = s2v.most_similar(vector, 3) +most_similar = list(zip(words, scores)) +# [('natural_language_processing|NOUN', 1.0), +# ('machine_learning|NOUN', 0.8986966609954834), +# ('computer_vision|NOUN', 0.8636297583580017)] +``` + +## Installation & Setup + +sense2vec releases are available on pip: + +```bash +pip install sense2vec==1.0.0a0 +``` + +The Reddit vectors model is attached to the +[latest release](https://github.com/explosion/sense2vec/releases). To load it +in, download the `.tar.gz` archive, unpack it and point `sense2vec.load` to +the extracted data directory: + +```python +import sense2vec +s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") +``` + +## Usage + +## Usage with spaCy v2.x + +The easiest way to use the library and vectors is to plug it into your spaCy +pipeline. Note that `sense2vec` doesn't depend on spaCy, so you'll have to +install it separately and download the English model. + +```bash +pip install -U spacy==2.0.0 +python -m spacy download en_core_web_sm +``` + +The `sense2vec` package exposes a `Sense2VecComponent`, which can be +initialised with the data path and added to your spaCy pipeline as a +[custom pipeline component](https://spacy.io/usage/processing-pipelines#custom-components). +By default, components are added to the _end of the pipeline_, which is the +recommended position for this component, since it needs access to the dependency +parse and, if available, named entities. + +```python +import spacy +from sense2vec import Sense2VecComponent + +nlp = spacy.load("en_core_web_sm") +s2v = Sense2VecComponent("/path/to/reddit_vectors-1.1.0") +nlp.add_pipe(s2v) +``` + +The pipeline component will **merge noun phrases and entities** according to +the same schema used when training the sense2vec models (e.g. noun chunks +without determiners like "the"). This ensures that you'll be able to retrieve +meaningful vectors for phrases in your text. 
The component will also add +serveral [extension attributes and methods](https://spacy.io/usage/processing-pipelines#custom-components-attributes) +to spaCy's `Token` and `Span` objects that let you retrieve vectors and +frequencies, as well as most similar terms. + +```python +doc = nlp("A sentence about natural language processing.") +assert doc[3].text == "natural language processing" +assert doc[3]._.in_s2v +freq = doc[3]._.s2v_freq +vector = doc[3]._.s2v_vec +most_similar = doc[3]._.s2v_most_similar(10) +``` + +For entities, the entity labels are used as the "sense" (instead of the +token's part-of-speech tag): + +```python +doc = nlp("A sentence about Facebook and Google.") +for ent in doc.ents: + assert ent._.in_s2v + most_similar = ent._.s2v_most_similar(3) +``` + +### Available attributes + +The following attributes are available via the `._` property – for example +`token._.in_s2v`: + +| Name | Attribute Type | Type | Description | +| ------------------ | -------------- | ------------------ | ---------------------------------------------------------------------------------- | +| `in_s2v` | property | bool | Whether a key exists in the vector map. | +| `s2v_freq` | property | int | The frequency of the given key. | +| `s2v_vec` | property | `ndarray[float32]` | The vector of the given key. | +| `s2v_most_similar` | method | list | Get the `n` most similar terms. Returns a list of `((word, sense), score)` tuples. | + +> ⚠️ **A note on span attributes:** Under the hood, entities in `doc.ents` are +> `Span` objects. This is why the pipeline component also adds attributes and +> methods to spans and not just tokens. However, it's not recommended to use the +> sense2vec attributes on arbitrary slices of the document, since the model likely +> won't have a key for the respective text. `Span` objects also don't have a +> part-of-speech tag, so if no entity label is present, the "sense" defaults to +> the root's part-of-speech tag. + +### Standalone usage + +To use only the `sense2vec` library, you can import the package and then call +its `load()` method to load in the vectors. + +```python +import sense2vec +s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") +``` + +`sense2vec.load` returns an instance of the `VectorMap` class, which you +can interact with via the following methods. + +> ⚠️ **Important note:** When interacting with the `VectorMap` directly, the +> keys need to follow the scheme of `phrase_text|SENSE` (note the `_` instead +> of spaces and the `|` before the tag or label) – for example, +> `machine_learning|NOUN`. Also note that the underlying vector table is +> case-sensitive. + +#### method `VectorMap.__len__` + +The total number of entries in the map. + +| Argument | Type | Description | +| ----------- | ---- | --------------------------------- | +| **RETURNS** | int | The number of entries in the map. | + +```python +s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") +assert len(s2v) == 1195261 +``` + +#### method `VectorMap.__contains__` + +Check whether the `VectorMap` has a given key. Keys consist of the word +string, a pipe and the "sense", i.e. the part-of-speech tag or entity label. +For example: `'duck|NOUN'` or `'duck|VERB'`. See the section on "Senses" +below for more details. Also note that the underlying vector table is +**case-sensitive**. + +| Argument | Type | Description | +| ----------- | ------- | ----------------------------------- | +| `string` | unicode | The key to check. | +| **RETURNS** | bool | Whether the key is part of the map. 
| + +```python +assert "duck|NOUN" in s2v +assert "duck|VERB" in s2v +assert "dkdksl|VERB" not in s2v +``` + +#### method `VectorMap.__getitem__` + +Retrieve a `(frequency, vector)` tuple from the vector map. The frequency is +an integer, the vector a `numpy.ndarray(dtype='float32')`. If the key is not +found, a `KeyError` is raised. + +| Argument | Type | Description | +| ----------- | ------- | ------------------------------------------------- | +| `string` | unicode | The key to retrieve the frequency and vector for. | +| **RETURNS** | tuple | The `(frequency, vector)` tuple. | + +```python +freq, vector = s2v["duck|NOUN"] +``` + +#### method `VectorMap.__setitem__` + +Assign a `(frequency, vector)` tuple to the vector map. The frequency should +be an integer, the vector a `numpy.ndarray(dtype='float32')`. + +| Argument | Type | Description | +| -------- | ------- | ---------------------------------------------- | +| `key` | unicode | The key to assign the frequency and vector to. | +| `value` | tuple | The `(frequency, vector)` tuple to assign. | + +```python +freq, vector = s2v["avocado|NOUN"] +s2v["🥑|NOUN"] = (freq, vector) +``` + +#### method `VectorMap.__iter__`, `VectorMap.keys` + +Iterate over the keys in the map, in order of insertion. + +| Argument | Type | Description | +| ---------- | ------- | -------------------- | +| **YIELDS** | unicode | The keys in the map. | + +#### method `VectorMap.values` + +Iterate over the values in the map, in order of insertion and yield +`(frequency, vector)` tuples from the vector map. The frequency is an integer, +the vector a `numpy.ndarray(dtype='float32')` + +| Argument | Type | Description | +| ---------- | ----- | ---------------------- | +| **YIELDS** | tuple | The values in the map. | + +#### method `VectorMap.items` + +Iterate over the items in the map, in order of insertion and yield +`(key, (frequency, vector))` tuples from the vector map. The frequency is an +integer, the vector a `numpy.ndarray(dtype='float32')` + +| Argument | Type | Description | +| ---------- | ----- | --------------------- | +| **YIELDS** | tuple | The items in the map. | + +#### method `VectorMap.most_similar` + +Find the keys of the `n` most similar entries, given a vector. Note that +the _most_ similar entry with a score of `1.0` will be the key of the query +vector itself. + +| Argument | Type | Description | +| ----------- | -------------------------------- | -------------------------------------------------- | +| `vector` | `numpy.ndarray(dtype='float32')` | The vector to compare to. | +| `n` | int | The number of entries to return. Defaults to `10`. | +| **RETURNS** | tuple | A `(words, scores)` tuple. | + +```python +freq, vector = s2v["avocado|NOUN"] +words, scores = s2v.most_similar(vector, n=3) +for word, score in zip(words, scores): + print(word, score) +# avocado|NOUN 1.0 +# avacado|NOUN 0.970944344997406 +# spinach|NOUN 0.962776780128479 +``` + +#### method `VectorMap.save` + +Serialize the model to a directory. This will export three files to the output +directory: a `strings.json` containing the keys in insertion order, a +`freqs.json` containing the frequencies and a `vectors.bin` containing the +vectors. + +| Argument | Type | Description | +| ---------- | ------- | --------------------------------- | +| `data_dir` | unicode | The path to the output directory. | + +#### method `VectorMap.load` + +Load a model from a directory. Expects three files in the directory (see +`VectorMap.save` for details). 
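(A round-trip sketch, not part of the original patch: it assumes the `s2v` map loaded in the examples above and an existing, writable output directory; the path is a placeholder.)

```python
import sense2vec

s2v.save("/path/to/my_s2v_model")  # writes strings.json, freqs.json, vectors.bin
s2v_reloaded = sense2vec.load("/path/to/my_s2v_model")
assert "duck|NOUN" in s2v_reloaded
```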
+ +| Argument | Type | Description | +| ---------- | ------- | -------------------------------- | +| `data_dir` | unicode | The path to load the model from. | + +## Senses + +The pre-trained Reddit vectors support the following "senses", either +part-of-speech tags or entity labels. For more details, see spaCy's +[annotation scheme overview](https://spacy.io/api/annotation). + +| Tag | Description | Examples | +| ------- | ------------------------- | ------------------------------------ | +| `ADJ` | adjective | big, old, green | +| `ADP` | adposition | in, to, during | +| `ADV` | adverb | very, tomorrow, down, where | +| `AUX` | auxiliary  | is, has (done), will (do) | +| `CONJ` | conjunction | and, or, but | +| `DET` | determiner | a, an, the | +| `INTJ` | interjection | psst, ouch, bravo, hello | +| `NOUN` | noun | girl, cat, tree, air, beauty | +| `NUM` | numeral | 1, 2017, one, seventy-seven, MMXIV | +| `PART` | particle | 's, not | +| `PRON` | pronoun | I, you, he, she, myself, somebody | +| `PROPN` | proper noun | Mary, John, London, NATO, HBO | +| `PUNCT` | punctuation | , ? ( ) | +| `SCONJ` | subordinating conjunction | if, while, that | +| `SYM` | symbol | \$, %, =, :), 😝 | +| `VERB` | verb | run, runs, running, eat, ate, eating | + +| Entity Label | Description | +| ------------- | ---------------------------------------------------- | +| `PERSON` | People, including fictional. | +| `NORP` | Nationalities or religious or political groups. | +| `FACILITY` | Buildings, airports, highways, bridges, etc. | +| `ORG` | Companies, agencies, institutions, etc. | +| `GPE` | Countries, cities, states. | +| `LOC` | Non-GPE locations, mountain ranges, bodies of water. | +| `PRODUCT` | Objects, vehicles, foods, etc. (Not services.) | +| `EVENT` | Named hurricanes, battles, wars, sports events, etc. | +| `WORK_OF_ART` | Titles of books, songs, etc. | +| `LANGUAGE` | Any named language. | + +# Training a sense2vec model + +> **🚧 Under construction:** We're currently updating the training scripts for +> spaCy v2.x. diff --git a/README.rst b/README.rst deleted file mode 100644 index 3de2730..0000000 --- a/README.rst +++ /dev/null @@ -1,404 +0,0 @@ -sense2vec: Use NLP to go beyond vanilla word2vec -************************************************ - -sense2vec (`Trask et. al `_, 2015) is a nice -twist on `word2vec `_ that lets you -learn more interesting, detailed and context-sensitive word vectors. For an -interactive example of the technology, see our -`sense2vec demo `_ that lets you explore -semantic similarities across all Reddit comments of 2015. - -This library is a simple Python/Cython implementation for loading and querying -sense2vec models. While it's best used in combination with -`spaCy `_, the ``sense2vec`` library itself is very lightweight -and can also be used as a standalone module. See below for usage details. - -🦆 **Version 1.0 alpha out now!** `Read the release notes here. `_ - -.. image:: https://img.shields.io/travis/explosion/sense2vec/master.svg?style=flat-square - :target: https://travis-ci.org/explosion/sense2vec - :alt: Build Status - -.. image:: https://img.shields.io/github/release/explosion/sense2vec/all.svg?style=flat-square - :target: https://github.com/explosion/sense2vec/releases - :alt: Current Release Version - -.. image:: https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square - :target: https://pypi.python.org/pypi/sense2vec - :alt: pypi Version - -Usage Examples -============== - -Usage with spaCy ----------------- - -.. 
code:: python - - import spacy - from sense2vec import Sense2VecComponent - - nlp = spacy.load('en') - s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0') - nlp.add_pipe(s2v) - - doc = nlp(u"A sentence about natural language processing.") - assert doc[3].text == u'natural language processing' - freq = doc[3]._.s2v_freq - vector = doc[3]._.s2v_vec - most_similar = doc[3]._.s2v_most_similar(3) - # [(('natural language processing', 'NOUN'), 1.0), - # (('machine learning', 'NOUN'), 0.8986966609954834), - # (('computer vision', 'NOUN'), 0.8636297583580017)] - -Standalone usage without spaCy ------------------------------- - -.. code:: python - - import sense2vec - - s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') - query = u'natural_language_processing|NOUN' - assert query in s2v - freq, vector = s2v[query] - words, scores = s2v.most_similar(vector, 3) - most_similar = list(zip(words, scores)) - # [('natural_language_processing|NOUN', 1.0), - # ('machine_learning|NOUN', 0.8986966609954834), - # ('computer_vision|NOUN', 0.8636297583580017)] - -Installation & Setup -==================== - -==================== === -**Operating system** macOS / OS X, Linux, Windows (Cygwin, MinGW, Visual Studio) -**Python version** CPython 2.7, 3.4+. Only 64 bit. -**Package managers** `pip `_ (source packages only) -==================== === - -sense2vec releases are available as source packages on pip: - -.. code:: bash - - pip install sense2vec==1.0.0a0 - -The Reddit vectors model is attached to the -`latest release `_. To load it -in, download the ``.tar.gz`` archive, unpack it and point ``sense2vec.load`` to -the extracted data directory: - -.. code:: python - - import sense2vec - s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') - -Usage -===== - -Usage with spaCy v2.x ---------------------- - -The easiest way to use the library and vectors is to plug it into your spaCy -pipeline. Note that ``sense2vec`` doesn't depend on spaCy, so you'll have to -install it separately and download the English model. - -.. code:: bash - - pip install -U spacy==2.0.0 - python -m spacy download en - -The ``sense2vec`` package exposes a ``Sense2VecComponent``, which can be -initialised with the data path and added to your spaCy pipeline as a -`custom pipeline component `_. -By default, components are added to the *end of the pipeline*, which is the -recommended position for this component, since it needs access to the dependency -parse and, if available, named entities. - -.. code:: python - - import spacy - from sense2vec import Sense2VecComponent - - nlp = spacy.load('en') - s2v = Sense2VecComponent('/path/to/reddit_vectors-1.1.0') - nlp.add_pipe(s2v) - -The pipeline component will **merge noun phrases and entities** according to -the same schema used when training the sense2vec models (e.g. noun chunks -without determiners like "the"). This ensures that you'll be able to retrieve -meaningful vectors for phrases in your text. The component will also add -serveral `extension attributes and methods `_ -to spaCy's ``Token`` and ``Span`` objects that let you retrieve vectors and -frequencies, as well as most similar terms. - -.. code:: python - - doc = nlp(u"A sentence about natural language processing.") - assert doc[3].text == u'natural language processing' - assert doc[3]._.in_s2v - freq = doc[3]._.s2v_freq - vector = doc[3]._.s2v_vec - most_similar = doc[3]._.s2v_most_similar(10) - -For entities, the entity labels are used as the "sense" (instead of the -token's part-of-speech tag): - -.. 
code:: python - - doc = nlp(u"A sentence about Facebook and Google.") - for ent in doc.ents: - assert ent._.in_s2v - most_similar = ent._.s2v_most_similar(3) - -Available attributes -^^^^^^^^^^^^^^^^^^^^ - -The following attributes are available via the `._` property – for example -``token._.in_s2v``: - -==================== ============== ==================== === -Name Attribute Type Type Description -==================== ============== ==================== === -``in_s2v`` property bool Whether a key exists in the vector map. -``s2v_freq`` property int The frequency of the given key. -``s2v_vec`` property ``ndarray[float32]`` The vector of the given key. -``s2v_most_similar`` method list Get the ``n`` most similar terms. Returns a list of ``((word, sense), score)`` tuples. -==================== ============== ==================== === - -**A note on span attributes:** Under the hood, entities in ``doc.ents`` are -``Span`` objects. This is why the pipeline component also adds attributes and -methods to spans and not just tokens. However, it's not recommended to use the -sense2vec attributes on arbitrary slices of the document, since the model likely -won't have a key for the respective text. ``Span`` objects also don't have a -part-of-speech tag, so if no entity label is present, the "sense" defaults to -the root's part-of-speech tag. - -Standalone usage ----------------- - -To use only the ``sense2vec`` library, you can import the package and then call -its ``load()`` method to load in the vectors. - -.. code:: python - - import sense2vec - s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') - -``sense2vec.load`` returns an instance of the ``VectorMap`` class, which you -can interact with via the following methods. - -⚠️ **Important note:** When interacting with the ``VectorMap`` directly, the -keys need to follow the scheme of ``phrase_text|SENSE`` (note the ``_`` instead -of spaces and the ``|`` before the tag or label) – for example, -``machine_learning|NOUN``. Also note that the underlying vector table is -case-sensitive. - -``VectorMap.__len__`` -^^^^^^^^^^^^^^^^^^^^^ - -The total number of entries in the map. - -=========== ==== === -Argument Type Description -=========== ==== === -**RETURNS** int The number of entries in the map. -=========== ==== === - -.. code:: python - - s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0') - assert len(s2v) == 1195261 - -``VectorMap.__contains__`` -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Check whether the ``VectorMap`` has a given key. Keys consist of the word -string, a pipe and the "sense", i.e. the part-of-speech tag or entity label. -For example: ``'duck|NOUN'`` or ``'duck|VERB'``. See the section on "Senses" -below for more details. Also note that the underlying vector table is -**case-sensitive**. - -=========== ======= === -Argument Type Description -=========== ======= === -``string`` unicode The key to check. -**RETURNS** bool Whether the key is part of the map. -=========== ======= === - -.. code:: python - - assert u'duck|NOUN' in s2v - assert u'duck|VERB' in s2v - assert u'dkdksl|VERB' not in s2v - -``VectorMap.__getitem__`` -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Retrieve a ``(frequency, vector)`` tuple from the vector map. The frequency is -an integer, the vector a ``numpy.ndarray(dtype='float32')``. If the key is not -found, a ``KeyError`` is raised. - -=========== ======= === -Argument Type Description -=========== ======= === -``string`` unicode The key to retrieve the frequency and vector for. 
-**RETURNS** tuple The ``(frequency, vector)`` tuple. -=========== ======= === - -.. code:: python - - freq, vector = s2v[u'duck|NOUN'] - -``VectorMap.__setitem__`` -^^^^^^^^^^^^^^^^^^^^^^^^^ - -Assign a ``(frequency, vector)`` tuple to the vector map. The frequency should -be an integer, the vector a ``numpy.ndarray(dtype='float32')``. - -=========== ======= === -Argument Type Description -=========== ======= === -``key`` unicode The key to assign the frequency and vector to. -``value`` tuple The ``(frequency, vector)`` tuple to assign. -=========== ======= === - -.. code:: python - - freq, vector = s2v[u'avocado|NOUN'] - s2v[u'🥑|NOUN'] = (freq, vector) - -``VectorMap.__iter__``, ``VectorMap.keys`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Iterate over the keys in the map, in order of insertion. - -=========== ======= === -Argument Type Description -=========== ======= === -**YIELDS** unicode The keys in the map. -=========== ======= === - -``VectorMap.values`` -^^^^^^^^^^^^^^^^^^^^ - -Iterate over the values in the map, in order of insertion and yield -``(frequency, vector)`` tuples from the vector map. The frequency is an integer, -the vector a ``numpy.ndarray(dtype='float32')`` - -=========== ======= === -Argument Type Description -=========== ======= === -**YIELDS** tuple The values in the map. -=========== ======= === - -``VectorMap.items`` -^^^^^^^^^^^^^^^^^^^ - -Iterate over the items in the map, in order of insertion and yield -``(key, (frequency, vector))`` tuples from the vector map. The frequency is an integer, the vector a ``numpy.ndarray(dtype='float32')`` - -=========== ======= === -Argument Type Description -=========== ======= === -**YIELDS** tuple The items in the map. -=========== ======= === - -``VectorMap.most_similar`` -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Find the keys of the ``n`` most similar entries, given a vector. Note that -the *most* similar entry with a score of ``1.0`` will be the key of the query -vector itself. - -=========== ================================== === -Argument Type Description -=========== ================================== === -``vector`` ``numpy.ndarray(dtype='float32')`` The vector to compare to. -``n`` int The number of entries to return. Defaults to ``10``. -**RETURNS** tuple A ``(words, scores)`` tuple. -=========== ================================== === - -.. code:: python - - freq, vector = s2v[u'avocado|NOUN'] - words, scores = s2v.most_similar(vector, n=3) - for word, score in zip(words, scores): - print(word, score) - # avocado|NOUN 1.0 - # avacado|NOUN 0.970944344997406 - # spinach|NOUN 0.962776780128479 - -``VectorMap.save`` -^^^^^^^^^^^^^^^^^^ - -Serialize the model to a directory. This will export three files to the output -directory: a ``strings.json`` containing the keys in insertion order, a -``freqs.json`` containing the frequencies and a ``vectors.bin`` containing the -vectors. - -============ ======= === -Argument Type Description -============ ======= === -``data_dir`` unicode The path to the output directory. -============ ======= === - -``VectorMap.load`` -^^^^^^^^^^^^^^^^^^ - -Load a model from a directory. Expects three files in the directory (see -``VectorMap.save`` for details). - -============ ======= === -Argument Type Description -============ ======= === -``data_dir`` unicode The path to load the model from. -============ ======= === - -Senses -====== - -The pre-trained Reddit vectors support the following "senses", either -part-of-speech tags or entity labels. 
For more details, see spaCy's -`annotation scheme overview `_. - -========= ========================== === -Tag Description Examples -========= ========================== === -``ADJ`` adjective big, old, green -``ADP`` adposition in, to, during -``ADV`` adverb very, tomorrow, down, where -``AUX`` auxiliary is, has (done), will (do) -``CONJ`` conjunction and, or, but -``DET`` determiner a, an, the -``INTJ`` interjection psst, ouch, bravo, hello -``NOUN`` noun girl, cat, tree, air, beauty -``NUM`` numeral 1, 2017, one, seventy-seven, MMXIV -``PART`` particle 's, not -``PRON`` pronoun I, you, he, she, myself, somebody -``PROPN`` proper noun Mary, John, London, NATO, HBO -``PUNCT`` punctuation , ? ( ) -``SCONJ`` subordinating conjunction if, while, that -``SYM`` symbol $, %, =, :), 😝 -``VERB`` verb run, runs, running, eat, ate, eating -========= ========================== === - -=============== === -Entity Label Description -=============== === -``PERSON`` People, including fictional. -``NORP`` Nationalities or religious or political groups. -``FACILITY`` Buildings, airports, highways, bridges, etc. -``ORG`` Companies, agencies, institutions, etc. -``GPE`` Countries, cities, states. -``LOC`` Non-GPE locations, mountain ranges, bodies of water. -``PRODUCT`` Objects, vehicles, foods, etc. (Not services.) -``EVENT`` Named hurricanes, battles, wars, sports events, etc. -``WORK_OF_ART`` Titles of books, songs, etc. -``LANGUAGE`` Any named language. -=============== === - -Training a sense2vec model -========================== - -**🚧 TODO:** Update training scripts for spaCy v2.x. diff --git a/setup.py b/setup.py index 37a1c85..e2bcb5e 100644 --- a/setup.py +++ b/setup.py @@ -97,7 +97,7 @@ def setup_package(): about = {} exec(f.read(), about) - with io.open(os.path.join(root, "README.rst"), encoding="utf8") as f: + with io.open(os.path.join(root, "README.md"), encoding="utf8") as f: readme = f.read() include_dirs = [ @@ -129,6 +129,8 @@ def setup_package(): packages=PACKAGES, package_data={"": ["*.pyx", "*.pxd", "*.h"]}, description=about["__summary__"], + long_description=readme, + long_description_content_type="text/markdown", author=about["__author__"], author_email=about["__email__"], version=about["__version__"], From 6e18804b2727ecd1c840824079d0daae8961e975 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 14:31:23 +0200 Subject: [PATCH 041/297] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 81c4371..6abd17c 100644 --- a/README.md +++ b/README.md @@ -341,7 +341,7 @@ part-of-speech tags or entity labels. For more details, see spaCy's | `WORK_OF_ART` | Titles of books, songs, etc. | | `LANGUAGE` | Any named language. | -# Training a sense2vec model +## Training a sense2vec model > **🚧 Under construction:** We're currently updating the training scripts for > spaCy v2.x. 
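Taken together, the `VectorMap` methods documented above (the `phrase_text|SENSE` key scheme, `__getitem__`, `__setitem__`, `__contains__` and `save`) support a simple round trip. The following is a minimal sketch, assuming the `reddit_vectors-1.1.0` archive has been unpacked locally; the paths and the `ML|NOUN` alias key are placeholders for illustration.

```python
import sense2vec

# Load the pre-trained Reddit vectors (returns a VectorMap).
s2v = sense2vec.load('/path/to/reddit_vectors-1.1.0')

# Keys follow the "phrase_text|SENSE" scheme and are case-sensitive.
freq, vector = s2v['machine_learning|NOUN']

# Assign the same frequency and vector to a placeholder alias key ...
s2v['ML|NOUN'] = (freq, vector)
assert 'ML|NOUN' in s2v

# ... then serialize the map: save() writes strings.json, freqs.json and
# vectors.bin to the given directory, which sense2vec.load() can read back.
s2v.save('/path/to/output_dir')
```
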
From 01d33ce67ceb7312e780f604dd0ab1f2eabb7e84 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 14:48:35 +0200 Subject: [PATCH 042/297] Auto-format --- sense2vec/__init__.py | 55 ++++++++++++++++--------------- sense2vec/about.py | 14 ++++---- sense2vec/tests/conftest.py | 7 ++-- sense2vec/tests/test_sense2vec.py | 13 +++++--- sense2vec/tests/test_vectors.py | 26 +++++++-------- 5 files changed, 61 insertions(+), 54 deletions(-) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index c78ebda..774beb3 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -20,16 +20,14 @@ def transform_doc(doc): Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. """ - #if not doc.is_tagged: + # if not doc.is_tagged: # raise ValueError("Can't run sense2vec: document not tagged.") for ent in doc.ents: - ent.merge(tag=ent.root.tag_, lemma=ent.root.lemma_, - ent_type=ent.label_) + ent.merge(tag=ent.root.tag_, lemma=ent.root.lemma_, ent_type=ent.label_) for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ('advmod', 'amod', 'compound'): + while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): np = np[1:] - np.merge(tag=np.root.tag_, lemma=np.root.lemma_, - ent_type=np.root.ent_type_) + np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_) return doc @@ -48,7 +46,8 @@ class Sense2VecComponent(object): >>> assert doc[3]._.in_s2v >>> print(doc[3]._.s2v_most_similar(20)) """ - name = 'sense2vec' + + name = "sense2vec" def __init__(self, vectors_path): self.s2v = load(vectors_path) @@ -67,36 +66,40 @@ def init_component(self, doc): # even if the component is only created and not added Token = doc[0].__class__ Span = doc[:1].__class__ - Token.set_extension('in_s2v', getter=lambda t: self.in_s2v(t)) - Token.set_extension('s2v_freq', getter=lambda t: self.s2v_freq(t)) - Token.set_extension('s2v_vec', getter=lambda t: self.s2v_vec(t)) - Token.set_extension('s2v_most_similar', method=lambda t, n: self.s2v_most_sim(t, n)) - Span.set_extension('in_s2v', getter=lambda s: self.in_s2v(s, 'ent')) - Span.set_extension('s2v_freq', getter=lambda s: self.s2v_freq(s, 'ent')) - Span.set_extension('s2v_vec', getter=lambda s: self.s2v_vec(s, 'ent')) - Span.set_extension('s2v_most_similar', method=lambda s, n: self.s2v_most_sim(s, n, 'ent')) - - def in_s2v(self, obj, attr='pos'): + Token.set_extension("in_s2v", getter=lambda t: self.in_s2v(t)) + Token.set_extension("s2v_freq", getter=lambda t: self.s2v_freq(t)) + Token.set_extension("s2v_vec", getter=lambda t: self.s2v_vec(t)) + Token.set_extension( + "s2v_most_similar", method=lambda t, n: self.s2v_most_sim(t, n) + ) + Span.set_extension("in_s2v", getter=lambda s: self.in_s2v(s, "ent")) + Span.set_extension("s2v_freq", getter=lambda s: self.s2v_freq(s, "ent")) + Span.set_extension("s2v_vec", getter=lambda s: self.s2v_vec(s, "ent")) + Span.set_extension( + "s2v_most_similar", method=lambda s, n: self.s2v_most_sim(s, n, "ent") + ) + + def in_s2v(self, obj, attr="pos"): return self._get_query(obj, attr) in self.s2v - def s2v_freq(self, obj, attr='pos'): + def s2v_freq(self, obj, attr="pos"): freq, _ = self.s2v[self._get_query(obj, attr)] return freq - def s2v_vec(self, obj, attr='pos'): + def s2v_vec(self, obj, attr="pos"): _, vector = self.s2v[self._get_query(obj, attr)] return vector - def s2v_most_sim(self, obj, n_similar=10, attr='pos'): + def s2v_most_sim(self, obj, n_similar=10, attr="pos"): _, vector = 
self.s2v[self._get_query(obj, attr)] words, scores = self.s2v.most_similar(vector, n_similar) - words = [word.replace('_', ' ') for word in words] - words = [tuple(word.rsplit('|', 1)) for word in words] + words = [word.replace("_", " ") for word in words] + words = [tuple(word.rsplit("|", 1)) for word in words] return list(zip(words, scores)) - def _get_query(self, obj, attr='pos'): + def _get_query(self, obj, attr="pos"): # no pos_ and label_ shouldn't happen – unless it's an unmerged # non-entity Span (in which case we just use the root's pos) - pos = obj.pos_ if hasattr(obj, 'pos_') else obj.root.pos_ - sense = obj.label_ if (attr == 'ent' and obj.label_) else pos - return obj.text.replace(' ', '_') + '|' + sense + pos = obj.pos_ if hasattr(obj, "pos_") else obj.root.pos_ + sense = obj.label_ if (attr == "ent" and obj.label_) else pos + return obj.text.replace(" ", "_") + "|" + sense diff --git a/sense2vec/about.py b/sense2vec/about.py index 0393db4..ac8899d 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,8 +1,8 @@ -__title__ = 'sense2vec' -__version__ = '1.1.0' -__summary__ = 'Use NLP to go beyond vanilla word2vec' -__uri__ = 'https://github.com/explosion/sense2vec' -__author__ = 'Explosion AI' -__email__ = 'contact@explosion.ai' -__license__ = 'MIT' +__title__ = "sense2vec" +__version__ = "1.1.0" +__summary__ = "Use NLP to go beyond vanilla word2vec" +__uri__ = "https://github.com/explosion/sense2vec" +__author__ = "Explosion AI" +__email__ = "contact@explosion.ai" +__license__ = "MIT" __release__ = True diff --git a/sense2vec/tests/conftest.py b/sense2vec/tests/conftest.py index acbb8fa..fc66ee7 100644 --- a/sense2vec/tests/conftest.py +++ b/sense2vec/tests/conftest.py @@ -5,11 +5,12 @@ def pytest_addoption(parser): - parser.addoption("--models", action="http://github.com/store_true", - help="include tests that require full models") + parser.addoption( + "--models", action="http://github.com/store_true", help="include tests that require full models" + ) def pytest_runtest_setup(item): - for opt in ['models']: + for opt in ["models"]: if opt in item.keywords and not item.config.getoption("--%s" % opt): pytest.skip("need --%s option to run" % opt) diff --git a/sense2vec/tests/test_sense2vec.py b/sense2vec/tests/test_sense2vec.py index 3529ac2..2460e23 100644 --- a/sense2vec/tests/test_sense2vec.py +++ b/sense2vec/tests/test_sense2vec.py @@ -7,14 +7,17 @@ from .. 
import load -data_path = path.join(path.dirname(__file__), '..', '..', 'data') +data_path = path.join(path.dirname(__file__), "..", "..", "data") @pytest.mark.models -@pytest.mark.parametrize('model', ['reddit_vectors-1.1.0']) +@pytest.mark.parametrize("model", ["reddit_vectors-1.1.0"]) def test_sample(model): s2v = load(path.join(data_path, model)) - freq, query_vector = s2v[u"beekeepers|NOUN"] + freq, query_vector = s2v["beekeepers|NOUN"] assert freq is not None - assert s2v.most_similar(query_vector, 3)[0] == \ - [u'beekeepers|NOUN', u'honey_bees|NOUN', u'Beekeepers|NOUN'] + assert s2v.most_similar(query_vector, 3)[0] == [ + "beekeepers|NOUN", + "honey_bees|NOUN", + "Beekeepers|NOUN", + ] diff --git a/sense2vec/tests/test_vectors.py b/sense2vec/tests/test_vectors.py index c7fb97a..64825f1 100644 --- a/sense2vec/tests/test_vectors.py +++ b/sense2vec/tests/test_vectors.py @@ -16,9 +16,9 @@ def test_init(): def test_add(): vecs = VectorStore(128) - good = numpy.ndarray(shape=(vecs.nr_dim,), dtype='float32') + good = numpy.ndarray(shape=(vecs.nr_dim,), dtype="float32") vecs.add(good) - bad = numpy.ndarray(shape=(vecs.nr_dim+1,), dtype='float32') + bad = numpy.ndarray(shape=(vecs.nr_dim + 1,), dtype="float32") with pytest.raises(AssertionError) as excinfo: vecs.add(bad) @@ -26,9 +26,9 @@ def test_add(): @pytest.mark.xfail def test_borrow(): vecs = VectorStore(128) - good = numpy.ndarray(shape=(vecs.nr_dim,), dtype='float32') + good = numpy.ndarray(shape=(vecs.nr_dim,), dtype="float32") vecs.borrow(good) - bad = numpy.ndarray(shape=(vecs.nr_dim+1,), dtype='float32') + bad = numpy.ndarray(shape=(vecs.nr_dim + 1,), dtype="float32") with pytest.raises(AssertionError) as excinfo: vecs.borrow(bad) @@ -36,15 +36,15 @@ def test_borrow(): @pytest.mark.xfail def test_most_similar(): vecs = VectorStore(4) - vecs.add(numpy.asarray([4,2,2,2], dtype='float32')) - vecs.add(numpy.asarray([4,4,2,2], dtype='float32')) - vecs.add(numpy.asarray([4,4,4,2], dtype='float32')) - vecs.add(numpy.asarray([4,4,4,4], dtype='float32')) + vecs.add(numpy.asarray([4, 2, 2, 2], dtype="float32")) + vecs.add(numpy.asarray([4, 4, 2, 2], dtype="float32")) + vecs.add(numpy.asarray([4, 4, 4, 2], dtype="float32")) + vecs.add(numpy.asarray([4, 4, 4, 4], dtype="float32")) - indices, scores = vecs.most_similar( - numpy.asarray([4,2,2,2], dtype='float32'), 4) + indices, scores = vecs.most_similar(numpy.asarray([4, 2, 2, 2], dtype="float32"), 4) print(list(scores)) - assert list(indices) == [0,1] + assert list(indices) == [0, 1] indices, scores = vecs.most_similar( - numpy.asarray([0.1,1,1,1], dtype='float32'), 4) - assert list(indices) == [4,3] + numpy.asarray([0.1, 1, 1, 1], dtype="float32"), 4 + ) + assert list(indices) == [4, 3] From cd1c91a5ac5d1cee53a498edb77fc5a831ed8940 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 15:00:04 +0200 Subject: [PATCH 043/297] Create .flake8 --- .flake8 | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .flake8 diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..55b597f --- /dev/null +++ b/.flake8 @@ -0,0 +1,8 @@ +[flake8] +ignore = E203, E266, E501, E731, W503 +max-line-length = 80 +select = B,C,E,F,W,T4,B9 +exclude = + .env, + .git, + __pycache__, From 10e234740e0b87bba30c76a51b93c9ca22d6d479 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 15:00:34 +0200 Subject: [PATCH 044/297] Rewrite transform_doc logic --- sense2vec/__init__.py | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git 
a/sense2vec/__init__.py b/sense2vec/__init__.py index 774beb3..45d6edb 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -15,19 +15,35 @@ def load(vectors_path): return vector_map +def filter_spans(spans): + # Filter a sequence of spans so they don't contain overlaps + get_sort_key = lambda span: (span.end - span.start, span.start) + sorted_spans = sorted(spans, key=get_sort_key, reverse=True) + result = [] + seen_tokens = set() + for span in sorted_spans: + if span.start not in seen_tokens and span.end - 1 not in seen_tokens: + result.append(span) + seen_tokens.update(range(span.start, span.end)) + return result + + def transform_doc(doc): """ Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. """ - # if not doc.is_tagged: - # raise ValueError("Can't run sense2vec: document not tagged.") - for ent in doc.ents: - ent.merge(tag=ent.root.tag_, lemma=ent.root.lemma_, ent_type=ent.label_) + spans = list(doc.ents) for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): np = np[1:] - np.merge(tag=np.root.tag_, lemma=np.root.lemma_, ent_type=np.root.ent_type_) + spans.append(np) + spans = filter_spans(spans) + with doc.retokenize() as retokenizer: + for span in spans: + root = span.root + attrs = {"tag": root.tag_, "lemma": root.lemma_, "ent_type": root.ent_type_} + retokenizer.merge(span, attrs=attrs) return doc From b3aee4159d3b0683a1b7334ae13311cda093001d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 5 Sep 2019 15:02:49 +0200 Subject: [PATCH 045/297] Move helpers to util --- sense2vec/__init__.py | 35 ++--------------------------------- sense2vec/util.py | 34 ++++++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 33 deletions(-) create mode 100644 sense2vec/util.py diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 45d6edb..a7523d9 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -4,7 +4,8 @@ from os import path from .vectors import VectorMap -from .about import __version__ +from .util import transform_doc +from .about import __version__ # noqa: F401 def load(vectors_path): @@ -15,38 +16,6 @@ def load(vectors_path): return vector_map -def filter_spans(spans): - # Filter a sequence of spans so they don't contain overlaps - get_sort_key = lambda span: (span.end - span.start, span.start) - sorted_spans = sorted(spans, key=get_sort_key, reverse=True) - result = [] - seen_tokens = set() - for span in sorted_spans: - if span.start not in seen_tokens and span.end - 1 not in seen_tokens: - result.append(span) - seen_tokens.update(range(span.start, span.end)) - return result - - -def transform_doc(doc): - """ - Transform a spaCy Doc to match the sense2vec format: merge entities - into one token and merge noun chunks without determiners. - """ - spans = list(doc.ents) - for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): - np = np[1:] - spans.append(np) - spans = filter_spans(spans) - with doc.retokenize() as retokenizer: - for span in spans: - root = span.root - attrs = {"tag": root.tag_, "lemma": root.lemma_, "ent_type": root.ent_type_} - retokenizer.merge(span, attrs=attrs) - return doc - - class Sense2VecComponent(object): """ spaCy v2.0 pipeline component. 
diff --git a/sense2vec/util.py b/sense2vec/util.py new file mode 100644 index 0000000..e6be9a2 --- /dev/null +++ b/sense2vec/util.py @@ -0,0 +1,34 @@ +# coding: utf8 +from __future__ import unicode_literals + + +def filter_spans(spans): + # Filter a sequence of spans so they don't contain overlaps + get_sort_key = lambda span: (span.end - span.start, span.start) + sorted_spans = sorted(spans, key=get_sort_key, reverse=True) + result = [] + seen_tokens = set() + for span in sorted_spans: + if span.start not in seen_tokens and span.end - 1 not in seen_tokens: + result.append(span) + seen_tokens.update(range(span.start, span.end)) + return result + + +def transform_doc(doc): + """ + Transform a spaCy Doc to match the sense2vec format: merge entities + into one token and merge noun chunks without determiners. + """ + spans = list(doc.ents) + for np in doc.noun_chunks: + while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): + np = np[1:] + spans.append(np) + spans = filter_spans(spans) + with doc.retokenize() as retokenizer: + for span in spans: + root = span.root + attrs = {"tag": root.tag_, "lemma": root.lemma_, "ent_type": root.ent_type_} + retokenizer.merge(span, attrs=attrs) + return doc From d6e67cfc9a42ab5924d1bb76b83c18eeb936584a Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2019 12:37:42 +0200 Subject: [PATCH 046/297] Set version to 1.1.0a0 --- sense2vec/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/about.py b/sense2vec/about.py index ac8899d..ee24854 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,5 +1,5 @@ __title__ = "sense2vec" -__version__ = "1.1.0" +__version__ = "1.1.0a0" __summary__ = "Use NLP to go beyond vanilla word2vec" __uri__ = "https://github.com/explosion/sense2vec" __author__ = "Explosion AI" From f9f6374be9683e2299583ff9382dfde21bbc949f Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2019 12:38:08 +0200 Subject: [PATCH 047/297] Add push-tag script --- bin/push-tag.sh | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100755 bin/push-tag.sh diff --git a/bin/push-tag.sh b/bin/push-tag.sh new file mode 100755 index 0000000..50b50c9 --- /dev/null +++ b/bin/push-tag.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash + +set -e + +# Insist repository is clean +git diff-index --quiet HEAD + +git checkout $1 +git pull origin $1 +git push origin $1 + +version=$(grep "__version__ = " spacy/about.py) +version=${version/__version__ = } +version=${version/\'/} +version=${version/\'/} +version=${version/\"/} +version=${version/\"/} +git tag "v$version" +git push origin "v$version" From 97537442628883597f1b15c3e562b6eb0d74bf55 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 9 Sep 2019 12:39:05 +0200 Subject: [PATCH 048/297] Fix name --- bin/push-tag.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/push-tag.sh b/bin/push-tag.sh index 50b50c9..a5a29c9 100755 --- a/bin/push-tag.sh +++ b/bin/push-tag.sh @@ -9,7 +9,7 @@ git checkout $1 git pull origin $1 git push origin $1 -version=$(grep "__version__ = " spacy/about.py) +version=$(grep "__version__ = " sense2vec/about.py) version=${version/__version__ = } version=${version/\'/} version=${version/\'/} From 580f0ca1c852c0af670fd2bb3e61e5af660eec11 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 9 Sep 2019 15:14:53 +0200 Subject: [PATCH 049/297] Set version to 1.0.0a1 --- sense2vec/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/about.py 
b/sense2vec/about.py index ee24854..c1171b5 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,5 +1,5 @@ __title__ = "sense2vec" -__version__ = "1.1.0a0" +__version__ = "1.0.0a1" __summary__ = "Use NLP to go beyond vanilla word2vec" __uri__ = "https://github.com/explosion/sense2vec" __author__ = "Explosion AI" From fff3cec5bf2f2f28ba5189f8f7542acdedd84c2d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 12 Sep 2019 19:31:09 +0200 Subject: [PATCH 050/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6abd17c..1838013 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ most_similar = list(zip(words, scores)) sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a0 +pip install sense2vec==1.0.0a1 ``` The Reddit vectors model is attached to the From ff4a9744a41681f1702b817c5cf7a2ce401bb47a Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Sun, 22 Sep 2019 10:57:08 -0700 Subject: [PATCH 051/297] Adding sense2vec prodigy recipe --- prodigy_recipes/phrases.py | 196 +++++++++++++++++++++++++++++++++++++ 1 file changed, 196 insertions(+) create mode 100644 prodigy_recipes/phrases.py diff --git a/prodigy_recipes/phrases.py b/prodigy_recipes/phrases.py new file mode 100644 index 0000000..6fc5bab --- /dev/null +++ b/prodigy_recipes/phrases.py @@ -0,0 +1,196 @@ +# coding: utf8 +from __future__ import unicode_literals +import sys +from pathlib import Path + +import prodigy +from prodigy.core import recipe_args +from prodigy.components.db import connect +from prodigy.components.sorters import Probability +from prodigy.util import log, prints, split_string, set_hashes +import requests +import sense2vec +from spacy.lang.en import English +import srsly + + +@prodigy.recipe('phrases.teach', + dataset=recipe_args["dataset"], + vectors_path=("Path to pretrained sense2vec vectors"), + seeds=("One or more comma-separated seed terms", "option", "se", split_string), + threshold=("Similarity threshold for sense2vec", "option", "t", float), + resume=("Resume from existing phrases dataset", "flag", "R", bool) +) +def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, resume=False): + """ + Bootstrap a terminology list with word vectors and seeds terms. Prodigy + will suggest similar terms based on the word vectors, and update the + target vector accordingly. 
+ """ + SENSES = ["auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", + "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", + "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", + "WORK_OF_ART", "LANGUAGE"] + + print("Loading") + LEMMATIZER = English().vocab.morphology.lemmatizer + S2V = sense2vec.load(vectors_path) + print("Loaded!") + + DB = connect() + seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds] + DB.add_examples(seed_tasks, datasets=[dataset]) + + accept_phrases = seeds + reject_phrases = [] + + seen = set(accept_phrases) + sensed = set() + + if resume: + prev = DB.get_dataset(dataset) + prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"] + prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"] + accept_phrases += prev_accept + reject_phrases += prev_reject + + seen.update(set(accept_phrases)) + seen.update(set(reject_phrases)) + + def format_for_s2v(word, sense): + return word.replace(" ", "_") + "|" + sense + + def get_best(word, sense): + if sense != "auto": # if sense is specified, find respective entry + if format_for_s2v(word, sense) in S2V: + return (word, sense) + return (None, None) + freqs = [] + casings = [word, word.upper(), word.title()] if word.islower() else [word] + for text in casings: # try options + for tag in SENSES: + query = format_for_s2v(text, tag) + if query in S2V: + freqs.append((S2V[query][0], (text, tag))) + return max(freqs)[1] if freqs else (None, None) + + def get_similar(word, sense, n=100): + query = format_for_s2v(word, sense) + if query not in S2V: + return [] + freq, query_vector = S2V[query] + words, scores = S2V.most_similar(query_vector, n) + words = [word.rsplit("|", 1) for word in words] + # Don't know why we'd be getting unsensed entries, but fix. 
+ words = [entry for entry in words if len(entry) == 2] + words = [(word.replace("_", " "), sense) for word, sense in words] + return zip(words, scores) + + def find_similar(word: str, sense: str = "auto", n_results: int = 200): + """Find similar terms for a given term and optional sense.""" + best_word, best_sense = get_best(word, sense) + results = [] + if not word or not best_word: + return results + seen = set([best_word, min(LEMMATIZER(best_word, best_sense))]) + similar = get_similar(best_word, best_sense, n_results) + for (word_entry, sense_entry), score in similar: + head = min(LEMMATIZER(word_entry, sense_entry)) + if head not in seen and score > threshold: + freq, _ = S2V[format_for_s2v(word_entry, sense_entry)] + results.append((score, word_entry)) + seen.add(head) + if len(results) >= n_results: + break + return results + + def update(answers): + """Updates accept_phrases so that the stream can find new phrases""" + for answer in answers: + if answer['answer'] == 'accept': + accept_phrases.append(answer['text']) + elif answer['answer'] == 'reject': + reject_phrases.append(answer['text']) + + def get_stream(): + """Continue querying sense2vec whenever we get a new phrase and presenting + examples to the user with a similarity above the threshold parameter""" + while True: + seen.update(set([rp.lower() for rp in reject_phrases])) + for p in accept_phrases: + if p.lower() not in sensed: + sensed.add(p.lower()) + for score, phrase in find_similar(p): + if phrase.lower() not in seen: + seen.add(phrase.lower()) + yield score, {"text": phrase, 'meta': {'score': score}} + + stream = Probability(get_stream()) + + return { + 'view_id': 'text', + 'dataset': dataset, + 'stream': stream, + 'update': update + } + + +@prodigy.recipe( + "phrases.to-patterns", + dataset=recipe_args["dataset"], + label=recipe_args["label"], + output_file=recipe_args["output_file"], +) +def to_patterns(dataset=None, label=None, output_file=None): + """ + Convert a list of seed phrases to a list of match patterns that can be used + with ner.match. If no output file is specified, each pattern is printed + so the recipe's output can be piped forward to ner.match. + + This is pretty much an exact copy of terms.to-patterns. + The pattern for each example is just split on whitespace so instead of: + + {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]} + + + which won't match anything you'll get: + + {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]} + """ + if label is None: + prints( + "--label is a required argument", + "This is the label that will be assigned to all patterns " + "created from terms collected in this dataset. 
", + exits=1, + error=True, + ) + + DB = connect() + + def get_pattern(term, label): + return {"label": label, "pattern": [{"lower": t.lower()} for t in term["text"].split()]} + + log("RECIPE: Starting recipe terms.to-patterns", locals()) + if dataset is None: + log("RECIPE: Reading input terms from sys.stdin") + terms = (srsly.json_loads(line) for line in sys.stdin) + else: + if dataset not in DB: + prints("Can't find dataset '{}'".format(dataset), exits=1, error=True) + terms = DB.get_dataset(dataset) + log( + "RECIPE: Reading {} input terms from dataset {}".format(len(terms), dataset) + ) + if output_file: + patterns = [ + get_pattern(term, label) for term in terms if term["answer"] == "accept" + ] + log("RECIPE: Generated {} patterns".format(len(patterns))) + srsly.write_jsonl(output_file, patterns) + prints("Exported {} patterns".format(len(patterns)), output_file) + else: + log("RECIPE: Outputting patterns") + for term in terms: + if term["answer"] == "accept": + print(srsly.json_dumps(get_pattern(term, label))) From 7cb0841b8ab5d1677744ff1fb26ad3880793322c Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Mon, 23 Sep 2019 10:37:23 -0700 Subject: [PATCH 052/297] Add default smaller batch_size to avoid loading... forever. Remove probability --- prodigy_recipes/phrases.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/prodigy_recipes/phrases.py b/prodigy_recipes/phrases.py index 6fc5bab..88d6793 100644 --- a/prodigy_recipes/phrases.py +++ b/prodigy_recipes/phrases.py @@ -19,19 +19,20 @@ vectors_path=("Path to pretrained sense2vec vectors"), seeds=("One or more comma-separated seed terms", "option", "se", split_string), threshold=("Similarity threshold for sense2vec", "option", "t", float), + batch_size=("Batch size for submitting annotations", "option", "bs", int), resume=("Resume from existing phrases dataset", "flag", "R", bool) ) -def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, resume=False): +def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, batch_size=5, resume=False): """ - Bootstrap a terminology list with word vectors and seeds terms. Prodigy - will suggest similar terms based on the word vectors, and update the - target vector accordingly. + Bootstrap a terminology list sense2vec. 
Prodigy + will suggest similar terms based on the the most similar + phrases from sense2vec """ SENSES = ["auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"] - + print("Loading") LEMMATIZER = English().vocab.morphology.lemmatizer S2V = sense2vec.load(vectors_path) @@ -123,15 +124,18 @@ def get_stream(): for score, phrase in find_similar(p): if phrase.lower() not in seen: seen.add(phrase.lower()) - yield score, {"text": phrase, 'meta': {'score': score}} + yield {"text": phrase, 'meta': {'score': score}} - stream = Probability(get_stream()) + stream = get_stream() return { 'view_id': 'text', 'dataset': dataset, 'stream': stream, - 'update': update + 'update': update, + 'config': { + 'batch_size': batch_size + } } From 71cbe0256f53b37727c1c66a6fe4a5c94d36930c Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Mon, 23 Sep 2019 10:43:04 -0700 Subject: [PATCH 053/297] Updating some comments/logging, removing unused imports --- prodigy_recipes/phrases.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/prodigy_recipes/phrases.py b/prodigy_recipes/phrases.py index 88d6793..cbec585 100644 --- a/prodigy_recipes/phrases.py +++ b/prodigy_recipes/phrases.py @@ -6,9 +6,7 @@ import prodigy from prodigy.core import recipe_args from prodigy.components.db import connect -from prodigy.components.sorters import Probability from prodigy.util import log, prints, split_string, set_hashes -import requests import sense2vec from spacy.lang.en import English import srsly @@ -33,10 +31,10 @@ def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, batch_size=5, re "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART", "LANGUAGE"] - print("Loading") + log("RECIPE: Starting recipe phrases.to-patterns", locals()) LEMMATIZER = English().vocab.morphology.lemmatizer S2V = sense2vec.load(vectors_path) - print("Loaded!") + log("RECIPE: Finished loading sense2vec", locals()) DB = connect() seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds] @@ -57,6 +55,7 @@ def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, batch_size=5, re seen.update(set(accept_phrases)) seen.update(set(reject_phrases)) + log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}") def format_for_s2v(word, sense): return word.replace(" ", "_") + "|" + sense @@ -175,7 +174,7 @@ def to_patterns(dataset=None, label=None, output_file=None): def get_pattern(term, label): return {"label": label, "pattern": [{"lower": t.lower()} for t in term["text"].split()]} - log("RECIPE: Starting recipe terms.to-patterns", locals()) + log("RECIPE: Starting recipe phrases.to-patterns", locals()) if dataset is None: log("RECIPE: Reading input terms from sys.stdin") terms = (srsly.json_loads(line) for line in sys.stdin) @@ -184,7 +183,7 @@ def get_pattern(term, label): prints("Can't find dataset '{}'".format(dataset), exits=1, error=True) terms = DB.get_dataset(dataset) log( - "RECIPE: Reading {} input terms from dataset {}".format(len(terms), dataset) + "RECIPE: Reading {} input phrases from dataset {}".format(len(terms), dataset) ) if output_file: patterns = [ From e92df130e1c900c9d57317dde7f46dc252c20fc1 Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Mon, 23 Sep 2019 11:24:13 -0700 Subject: [PATCH 054/297] Removing f strings cause py 3.5, adding a top_n argument which is simpler than 
threshold in some cases --- prodigy_recipes/phrases.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/prodigy_recipes/phrases.py b/prodigy_recipes/phrases.py index cbec585..61c4882 100644 --- a/prodigy_recipes/phrases.py +++ b/prodigy_recipes/phrases.py @@ -17,10 +17,11 @@ vectors_path=("Path to pretrained sense2vec vectors"), seeds=("One or more comma-separated seed terms", "option", "se", split_string), threshold=("Similarity threshold for sense2vec", "option", "t", float), + top_n=("Only get the top n results for each accepted sense2vec term", "option", "n", int), batch_size=("Batch size for submitting annotations", "option", "bs", int), resume=("Resume from existing phrases dataset", "flag", "R", bool) ) -def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, batch_size=5, resume=False): +def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5, resume=False): """ Bootstrap a terminology list sense2vec. Prodigy will suggest similar terms based on the the most similar @@ -36,6 +37,10 @@ def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, batch_size=5, re S2V = sense2vec.load(vectors_path) log("RECIPE: Finished loading sense2vec", locals()) + # Seems to be a bug in sense2vec which gets < n similar senses not <= n + batch_size = min(batch_size, top_n * len(seeds)) + top_n = top_n + 1 + DB = connect() seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds] DB.add_examples(seed_tasks, datasets=[dataset]) @@ -55,7 +60,7 @@ def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, batch_size=5, re seen.update(set(accept_phrases)) seen.update(set(reject_phrases)) - log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}") + log("RECIPE: Resuming from {} previous examples in dataset {}".format(len(prev), dataset)) def format_for_s2v(word, sense): return word.replace(" ", "_") + "|" + sense @@ -86,7 +91,7 @@ def get_similar(word, sense, n=100): words = [(word.replace("_", " "), sense) for word, sense in words] return zip(words, scores) - def find_similar(word: str, sense: str = "auto", n_results: int = 200): + def find_similar(word: str, sense: str = "auto", n_results: int = top_n): """Find similar terms for a given term and optional sense.""" best_word, best_sense = get_best(word, sense) results = [] From bd36c9f923950290a3e6e7fac6242e1e393680b5 Mon Sep 17 00:00:00 2001 From: Kabir Khan Date: Thu, 26 Sep 2019 20:07:27 -0700 Subject: [PATCH 055/297] Resolving PR comments. 
Adding entry points for prodigy_recipes --- prodigy_recipes/phrases.py => prodigy_recipes.py | 6 +++--- setup.py | 6 ++++++ 2 files changed, 9 insertions(+), 3 deletions(-) rename prodigy_recipes/phrases.py => prodigy_recipes.py (97%) diff --git a/prodigy_recipes/phrases.py b/prodigy_recipes.py similarity index 97% rename from prodigy_recipes/phrases.py rename to prodigy_recipes.py index 61c4882..0cab5f1 100644 --- a/prodigy_recipes/phrases.py +++ b/prodigy_recipes.py @@ -12,7 +12,7 @@ import srsly -@prodigy.recipe('phrases.teach', +@prodigy.recipe('sense2vec.teach', dataset=recipe_args["dataset"], vectors_path=("Path to pretrained sense2vec vectors"), seeds=("One or more comma-separated seed terms", "option", "se", split_string), @@ -21,7 +21,7 @@ batch_size=("Batch size for submitting annotations", "option", "bs", int), resume=("Resume from existing phrases dataset", "flag", "R", bool) ) -def phrases_teach(dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5, resume=False): +def teach(dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5, resume=False): """ Bootstrap a terminology list sense2vec. Prodigy will suggest similar terms based on the the most similar @@ -144,7 +144,7 @@ def get_stream(): @prodigy.recipe( - "phrases.to-patterns", + "sense2vec.to-patterns", dataset=recipe_args["dataset"], label=recipe_args["label"], output_file=recipe_args["output_file"], diff --git a/setup.py b/setup.py index e2bcb5e..7a290fc 100644 --- a/setup.py +++ b/setup.py @@ -144,6 +144,12 @@ def setup_package(): "murmurhash>=0.28.0,<1.1.0", "cymem>=2.0.2,<2.1.0", ], + entry_points={ + "prodigy_recipes": [ + "teach = prodigy_recipes:teach", + "to_patterns = prodigy_recipes:to_patterns" + ] + }, classifiers=[ "Development Status :: 4 - Beta", "Environment :: Console", From f2135ab4566fdeff6ed3af7f966f1b2ce351b4e7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Sep 2019 13:59:27 +0200 Subject: [PATCH 056/297] Refactor --- bin/cythonize.py | 199 ----------------- requirements.txt | 7 +- sense2vec/__init__.pxd | 0 sense2vec/__init__.py | 212 ++++++++++++------ sense2vec/_strings.pxd | 31 --- sense2vec/_strings.pyx | 292 ------------------------ sense2vec/about.py | 5 +- sense2vec/cfile.pxd | 12 - sense2vec/cfile.pyx | 42 ---- sense2vec/tests/conftest.py | 16 -- sense2vec/tests/test_component.py | 69 ++++++ sense2vec/tests/test_sense2vec.py | 69 ++++-- sense2vec/tests/test_vectors.py | 50 ----- sense2vec/util.py | 56 +++-- sense2vec/vectors.pxd | 28 --- sense2vec/vectors.pyx | 358 ------------------------------ setup.py | 201 ++++------------- 17 files changed, 357 insertions(+), 1290 deletions(-) delete mode 100755 bin/cythonize.py delete mode 100644 sense2vec/__init__.pxd delete mode 100644 sense2vec/_strings.pxd delete mode 100644 sense2vec/_strings.pyx delete mode 100644 sense2vec/cfile.pxd delete mode 100644 sense2vec/cfile.pyx delete mode 100644 sense2vec/tests/conftest.py create mode 100644 sense2vec/tests/test_component.py delete mode 100644 sense2vec/tests/test_vectors.py delete mode 100644 sense2vec/vectors.pxd delete mode 100644 sense2vec/vectors.pyx diff --git a/bin/cythonize.py b/bin/cythonize.py deleted file mode 100755 index 2c18ae8..0000000 --- a/bin/cythonize.py +++ /dev/null @@ -1,199 +0,0 @@ -#!/usr/bin/env python -""" cythonize - -Cythonize pyx files into C files as needed. - -Usage: cythonize [root_dir] - -Default [root_dir] is 'spacy'. - -Checks pyx files to see if they have been changed relative to their -corresponding C files. 
If they have, then runs cython on these files to -recreate the C files. - -The script thinks that the pyx files have changed relative to the C files -by comparing hashes stored in a database file. - -Simple script to invoke Cython (and Tempita) on all .pyx (.pyx.in) -files; while waiting for a proper build system. Uses file hashes to -figure out if rebuild is needed. - -For now, this script should be run by developers when changing Cython files -only, and the resulting C files checked in, so that end-users (and Python-only -developers) do not get the Cython/Tempita dependencies. - -Originally written by Dag Sverre Seljebotn, and copied here from: - -https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py - -Note: this script does not check any of the dependent C libraries; it only -operates on the Cython .pyx files. -""" - -from __future__ import division, print_function, absolute_import - -import os -import re -import sys -import hashlib -import subprocess - -HASH_FILE = 'cythonize.dat' -DEFAULT_ROOT = 'spacy' -VENDOR = 'spaCy' - -# WindowsError is not defined on unix systems -try: - WindowsError -except NameError: - WindowsError = None - -# -# Rules -# -def process_pyx(fromfile, tofile): - try: - from Cython.Compiler.Version import version as cython_version - from distutils.version import LooseVersion - if LooseVersion(cython_version) < LooseVersion('0.19'): - raise Exception('Building %s requires Cython >= 0.19' % VENDOR) - - except ImportError: - pass - - flags = ['--fast-fail'] - if tofile.endswith('.cpp'): - flags += ['--cplus'] - - try: - try: - r = subprocess.call(['cython'] + flags + ["-o", tofile, fromfile]) - if r != 0: - raise Exception('Cython failed') - except OSError: - # There are ways of installing Cython that don't result in a cython - # executable on the path, see gh-2397. 
- r = subprocess.call([sys.executable, '-c', - 'import sys; from Cython.Compiler.Main import ' - 'setuptools_main as main; sys.exit(main())'] + flags + - ["-o", tofile, fromfile]) - if r != 0: - raise Exception('Cython failed') - except OSError: - raise OSError('Cython needs to be installed') - -def process_tempita_pyx(fromfile, tofile): - try: - try: - from Cython import Tempita as tempita - except ImportError: - import tempita - except ImportError: - raise Exception('Building %s requires Tempita: ' - 'pip install --user Tempita' % VENDOR) - with open(fromfile, "r") as f: - tmpl = f.read() - pyxcontent = tempita.sub(tmpl) - assert fromfile.endswith('.pyx.in') - pyxfile = fromfile[:-len('.pyx.in')] + '.pyx' - with open(pyxfile, "w") as f: - f.write(pyxcontent) - process_pyx(pyxfile, tofile) - -rules = { - # fromext : function - '.pyx' : process_pyx, - '.pyx.in' : process_tempita_pyx - } -# -# Hash db -# -def load_hashes(filename): - # Return { filename : (sha1 of input, sha1 of output) } - if os.path.isfile(filename): - hashes = {} - with open(filename, 'r') as f: - for line in f: - filename, inhash, outhash = line.split() - hashes[filename] = (inhash, outhash) - else: - hashes = {} - return hashes - -def save_hashes(hash_db, filename): - with open(filename, 'w') as f: - for key, value in sorted(hash_db.items()): - f.write("%s %s %s\n" % (key, value[0], value[1])) - -def sha1_of_file(filename): - h = hashlib.sha1() - with open(filename, "rb") as f: - h.update(f.read()) - return h.hexdigest() - -# -# Main program -# - -def normpath(path): - path = path.replace(os.sep, '/') - if path.startswith('./'): - path = path[2:] - return path - -def get_hash(frompath, topath): - from_hash = sha1_of_file(frompath) - to_hash = sha1_of_file(topath) if os.path.exists(topath) else None - return (from_hash, to_hash) - -def process(path, fromfile, tofile, processor_function, hash_db): - fullfrompath = os.path.join(path, fromfile) - fulltopath = os.path.join(path, tofile) - current_hash = get_hash(fullfrompath, fulltopath) - if current_hash == hash_db.get(normpath(fullfrompath), None): - print('%s has not changed' % fullfrompath) - return - - orig_cwd = os.getcwd() - try: - os.chdir(path) - print('Processing %s' % fullfrompath) - processor_function(fromfile, tofile) - finally: - os.chdir(orig_cwd) - # changed target file, recompute hash - current_hash = get_hash(fullfrompath, fulltopath) - # store hash in db - hash_db[normpath(fullfrompath)] = current_hash - - -def find_process_files(root_dir): - hash_db = load_hashes(HASH_FILE) - for cur_dir, dirs, files in os.walk(root_dir): - for filename in files: - in_file = os.path.join(cur_dir, filename + ".in") - if filename.endswith('.pyx') and os.path.isfile(in_file): - continue - for fromext, function in rules.items(): - if filename.endswith(fromext): - toext = ".cpp" - # with open(os.path.join(cur_dir, filename), 'rb') as f: - # data = f.read() - # m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M) - # if m: - # toext = ".cxx" - fromfile = filename - tofile = filename[:-len(fromext)] + toext - process(cur_dir, fromfile, tofile, function, hash_db) - save_hashes(hash_db, HASH_FILE) - -def main(): - try: - root_dir = sys.argv[1] - except IndexError: - root_dir = DEFAULT_ROOT - find_process_files(root_dir) - - -if __name__ == '__main__': - main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2a38b3a..320fcda 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,5 @@ +spacy>=2.1.0 
numpy>=1.15.0 -cymem>=2.0.2,<2.1.0 -murmurhash>=0.28.0,<1.1.0 -preshed>=2.0.1,<2.1.0 srsly>=0.1.0,<1.1.0 # Development requirements -cython>=0.24,<0.28.0 -pytest>=3.0.6,<4.0.0 +pytest>=4.1.0 diff --git a/sense2vec/__init__.pxd b/sense2vec/__init__.pxd deleted file mode 100644 index e69de29..0000000 diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index a7523d9..049ec1a 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,90 +1,170 @@ # coding: utf8 from __future__ import unicode_literals -from os import path - -from .vectors import VectorMap -from .util import transform_doc +from pathlib import Path +from spacy.vectors import Vectors +from spacy.strings import StringStore +from spacy.tokens import Doc, Token, Span +import numpy +import srsly + +from .util import transform_doc, get_phrases, make_key, split_key from .about import __version__ # noqa: F401 def load(vectors_path): - if not path.exists(vectors_path): - raise IOError("Can't find data directory: {}".format(vectors_path)) - vector_map = VectorMap(128) - vector_map.load(vectors_path) - return vector_map + vectors_path = Path(vectors_path) + if not vectors_path.exists(): + raise IOError("Can't find vectors: {}".format(vectors_path)) + return Sense2Vec().from_disk(vectors_path) + + +class Sense2Vec(object): + def __init__(self, shape=(1000, 128), strings=None): + self.vectors = Vectors(shape=shape) + self.strings = StringStore() if strings is None else strings + + def __len__(self): + return len(self.vectors) + + def __contains__(self, key): + key = key if isinstance(key, int) else self.strings[key] + return key in self.vectors + + def __getitem__(self, key): + key = key if isinstance(key, int) else self.strings[key] + if key in self.vectors: + return self.vectors[key] + + def __iter__(self): + yield from self.items() + + def add(self, key, vector): + if not isinstance(key, int): + key = self.strings.add(key) + self.vectors.add(key, vector=vector) + + def items(self): + for key, value in self.vectors.items(): + yield self.strings[key], value + + def keys(self): + for key in self.vectors.keys(): + yield self.strings[key] + + def values(self): + yield from self.vectors.values() + + def most_similar(self, keys, n_similar=10): + vecs = [self[key] for key in keys if key in self] + arr = numpy.asarray(vecs, dtype=numpy.float32) + result_keys, _, scores = self.vectors.most_similar(arr) + result = zip(result_keys, scores) + result = [(self.strings[key], score) for key, score in result if key] + result = [(key, score) for key, score in result if key not in keys] + # TODO: handle this better? 
+ return result[:n_similar] + + def to_bytes(self, exclude=tuple()): + data = {"vectors": self.vectors.to_bytes()} + if "strings" not in exclude: + data["strings"] = self.strings.to_bytes() + return srsly.msgpack_dumps(data) + + def from_bytes(self, bytes_data, exclude=tuple()): + data = srsly.msgpack_loads(bytes_data) + self.vectors = Vectors().from_bytes(data["vectors"]) + if "strings" not in exclude and "strings" in data: + self.strings = StringStore().from_bytes(data["strings"]) + return self + + def from_disk(self, path, exclude=tuple()): + path = Path(path) + strings_path = path / "strings.json" + self.vectors = Vectors().from_disk(path) + if "strings" not in exclude and strings_path.exists(): + self.strings = StringStore().from_disk(strings_path) + return self + + def to_disk(self, path, exclude=tuple()): + path = Path(path) + self.vectors.to_disk(path) + if "strings" not in exclude: + self.strings.to_disk(path / "strings.json") + return self class Sense2VecComponent(object): - """ - spaCy v2.0 pipeline component. - - USAGE: - >>> import spacy - >>> from sense2vec import Sense2VecComponent - >>> nlp = spacy.load('en') - >>> s2v = Sense2VecComponent('/path/to/model') - >>> nlp.add_pipe(s2v) - >>> doc = nlp(u"A text about natural language processing.") - >>> assert doc[3].text == 'natural language processing' - >>> assert doc[3]._.in_s2v - >>> print(doc[3]._.s2v_most_similar(20)) - """ - name = "sense2vec" - def __init__(self, vectors_path): - self.s2v = load(vectors_path) + def __init__( + self, + vocab=None, + shape=(1000, 128), + merge_phrases=False, + make_key=make_key, + split_key=split_key, + ): + strings = vocab.strings if vocab is not None else None + self.s2v = Sense2Vec(shape=shape, strings=strings) self.first_run = True + self.merge_phrases = merge_phrases + self.make_key = make_key + self.split_key = split_key + + @classmethod + def from_nlp(cls, nlp): + return cls(vocab=nlp.vocab) def __call__(self, doc): if self.first_run: self.init_component(doc) self.first_run = False - doc = transform_doc(doc) + # Store reference to s2v object on Doc to make sure it's right + doc._._s2v = self.s2v + if self.merge_phrases: + doc = transform_doc(doc) return doc def init_component(self, doc): # initialise the attributes here only if the component is added to the # pipeline and used – otherwise, tokens will still get the attributes # even if the component is only created and not added - Token = doc[0].__class__ - Span = doc[:1].__class__ - Token.set_extension("in_s2v", getter=lambda t: self.in_s2v(t)) - Token.set_extension("s2v_freq", getter=lambda t: self.s2v_freq(t)) - Token.set_extension("s2v_vec", getter=lambda t: self.s2v_vec(t)) - Token.set_extension( - "s2v_most_similar", method=lambda t, n: self.s2v_most_sim(t, n) - ) - Span.set_extension("in_s2v", getter=lambda s: self.in_s2v(s, "ent")) - Span.set_extension("s2v_freq", getter=lambda s: self.s2v_freq(s, "ent")) - Span.set_extension("s2v_vec", getter=lambda s: self.s2v_vec(s, "ent")) - Span.set_extension( - "s2v_most_similar", method=lambda s, n: self.s2v_most_sim(s, n, "ent") - ) - - def in_s2v(self, obj, attr="pos"): - return self._get_query(obj, attr) in self.s2v - - def s2v_freq(self, obj, attr="pos"): - freq, _ = self.s2v[self._get_query(obj, attr)] - return freq - - def s2v_vec(self, obj, attr="pos"): - _, vector = self.s2v[self._get_query(obj, attr)] - return vector - - def s2v_most_sim(self, obj, n_similar=10, attr="pos"): - _, vector = self.s2v[self._get_query(obj, attr)] - words, scores = self.s2v.most_similar(vector, 
n_similar) - words = [word.replace("_", " ") for word in words] - words = [tuple(word.rsplit("|", 1)) for word in words] - return list(zip(words, scores)) - - def _get_query(self, obj, attr="pos"): - # no pos_ and label_ shouldn't happen – unless it's an unmerged - # non-entity Span (in which case we just use the root's pos) - pos = obj.pos_ if hasattr(obj, "pos_") else obj.root.pos_ - sense = obj.label_ if (attr == "ent" and obj.label_) else pos - return obj.text.replace(" ", "_") + "|" + sense + Doc.set_extension("_s2v", default=None) + Doc.set_extension("s2v_phrases", getter=get_phrases) + Token.set_extension("s2v_key", getter=self.s2v_key) + Token.set_extension("in_s2v", getter=self.in_s2v) + Token.set_extension("s2v_vec", getter=self.s2v_vec) + Token.set_extension("s2v_most_similar", method=self.s2v_most_sim) + Span.set_extension("s2v_key", getter=self.s2v_key) + Span.set_extension("in_s2v", getter=self.in_s2v) + Span.set_extension("s2v_vec", getter=self.s2v_vec) + Span.set_extension("s2v_most_similar", method=self.s2v_most_sim) + + def in_s2v(self, obj): + return self.make_key(obj) in obj.doc._._s2v + + def s2v_vec(self, obj): + return obj.doc._._s2v[self.make_key(obj)] + + def s2v_key(self, obj): + return self.make_key(obj) + + def s2v_most_sim(self, obj, n_similar=10): + key = self.make_key(obj) + results = obj.doc._._s2v.most_similar([key], n_similar=n_similar) + return [(self.split_key(result), score) for result, score in results] + + def to_bytes(self): + return self.s2v.to_bytes(exclude=["strings"]) + + def from_bytes(self, bytes_data): + self.s2v = Sense2Vec().from_bytes(bytes_data, exclude=["strings"]) + return self + + def to_disk(self, path): + self.s2v.to_bytes(path, exclude=["strings"]) + + def from_disk(self, path): + self.s2v = Sense2Vec().from_disk(path, exclude=["strings"]) + return self diff --git a/sense2vec/_strings.pxd b/sense2vec/_strings.pxd deleted file mode 100644 index fa88a6d..0000000 --- a/sense2vec/_strings.pxd +++ /dev/null @@ -1,31 +0,0 @@ -from libc.stdint cimport int64_t, uint64_t, int32_t - -from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap -from murmurhash.mrmr cimport hash64 - - -ctypedef uint64_t hash_t -ctypedef int32_t attr_t - - -cpdef hash_t hash_string(unicode string) except 0 - - -ctypedef union Utf8Str: - unsigned char[8] s - unsigned char* p - - -cdef class StringStore: - cdef Pool mem - cdef Utf8Str* c - cdef int64_t size - cdef bint is_frozen - - cdef public PreshMap _map - cdef public PreshMap _oov - cdef int64_t _resize_at - - cdef const Utf8Str* intern_unicode(self, unicode py_string) - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length) diff --git a/sense2vec/_strings.pyx b/sense2vec/_strings.pyx deleted file mode 100644 index 1f9da11..0000000 --- a/sense2vec/_strings.pyx +++ /dev/null @@ -1,292 +0,0 @@ -# cython: infer_types=True -# coding: utf8 -from __future__ import unicode_literals, absolute_import - -cimport cython -import srsly -from libc.string cimport memcpy -from libc.stdint cimport uint64_t, uint32_t -from murmurhash.mrmr cimport hash64, hash32 -from preshed.maps cimport map_iter, key_t -import os - - -cpdef hash_t hash_string(unicode string) except 0: - chars = string.encode('utf8') - return hash_utf8(chars, len(chars)) - - -cdef hash_t hash_utf8(char* utf8_string, int length) nogil: - return hash64(utf8_string, length, 1) - - -cdef uint32_t hash32_utf8(char* utf8_string, int length) nogil: - return hash32(utf8_string, length, 1) - - -cdef unicode _decode(const Utf8Str* string): - cdef int 
i, length - if string.s[0] < sizeof(string.s) and string.s[0] != 0: - return string.s[1:string.s[0]+1].decode('utf8') - elif string.p[0] < 255: - return string.p[1:string.p[0]+1].decode('utf8') - else: - i = 0 - length = 0 - while string.p[i] == 255: - i += 1 - length += 255 - length += string.p[i] - i += 1 - return string.p[i:length + i].decode('utf8') - - -cdef Utf8Str _allocate(Pool mem, const unsigned char* chars, uint32_t length) except *: - cdef int n_length_bytes - cdef int i - cdef Utf8Str string - cdef uint32_t ulength = length - if length < sizeof(string.s): - string.s[0] = length - memcpy(&string.s[1], chars, length) - return string - elif length < 255: - string.p = mem.alloc(length + 1, sizeof(unsigned char)) - string.p[0] = length - memcpy(&string.p[1], chars, length) - assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] - return string - else: - i = 0 - n_length_bytes = (length // 255) + 1 - string.p = mem.alloc(length + n_length_bytes, sizeof(unsigned char)) - for i in range(n_length_bytes-1): - string.p[i] = 255 - string.p[n_length_bytes-1] = length % 255 - memcpy(&string.p[n_length_bytes], chars, length) - assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] - return string - - -cdef class StringStore: - """ - Map strings to and from integer IDs. - """ - def __init__(self, strings=None, freeze=False): - """ - Create the StringStore. - - Arguments: - strings: A sequence of unicode strings to add to the store. - """ - self.mem = Pool() - self._map = PreshMap() - self._oov = PreshMap() - self._resize_at = 10000 - self.c = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) - self.size = 1 - self.is_frozen = freeze - if strings is not None: - for string in strings: - _ = self[string] - - property size: - def __get__(self): - return self.size -1 - - def __reduce__(self): - # TODO: OOV words, for the is_frozen stuff? - if self.is_frozen: - raise NotImplementedError( - "Currently missing support for pickling StringStore when " - "is_frozen=True") - return (StringStore, (list(self),)) - - def __len__(self): - """ - The number of strings in the store. - - Returns: - int The number of strings in the store. - """ - return self.size-1 - - def __getitem__(self, object string_or_id): - """ - Retrieve a string from a given integer ID, or vice versa. - - Arguments: - string_or_id (bytes or unicode or int): - The value to encode. - Returns: - unicode or int: The value to retrieved. - """ - if isinstance(string_or_id, basestring) and len(string_or_id) == 0: - return 0 - elif string_or_id == 0: - return u'' - - cdef bytes byte_string - cdef const Utf8Str* utf8str - cdef uint64_t int_id - cdef uint32_t oov_id - if isinstance(string_or_id, (int, long)): - int_id = string_or_id - oov_id = string_or_id - if int_id < self.size: - return _decode(&self.c[int_id]) - else: - utf8str = self._oov.get(oov_id) - if utf8str is not NULL: - return _decode(utf8str) - else: - raise IndexError(string_or_id) - else: - if isinstance(string_or_id, bytes): - byte_string = string_or_id - elif isinstance(string_or_id, unicode): - byte_string = (string_or_id).encode('utf8') - else: - raise TypeError(type(string_or_id)) - utf8str = self._intern_utf8(byte_string, len(byte_string)) - if utf8str is NULL: - # TODO: We need to use 32 bit here, for compatibility with the - # vocabulary values. This makes birthday paradox probabilities - # pretty bad. - # We could also get unlucky here, and hash into a value that - # collides with the 'real' strings. 
- return hash32_utf8(byte_string, len(byte_string)) - else: - return utf8str - self.c - - def __contains__(self, unicode string not None): - """ - Check whether a string is in the store. - - Arguments: - string (unicode): The string to check. - Returns bool: - Whether the store contains the string. - """ - if len(string) == 0: - return True - cdef hash_t key = hash_string(string) - return self._map.get(key) is not NULL - - def __iter__(self): - """ - Iterate over the strings in the store, in order. - - Yields: unicode A string in the store. - """ - cdef int i - for i in range(self.size): - yield _decode(&self.c[i]) if i > 0 else u'' - # TODO: Iterate OOV here? - - def __reduce__(self): - strings = [""] - for i in range(1, self.size): - string = &self.c[i] - py_string = _decode(string) - strings.append(py_string) - return (StringStore, (strings,), None, None, None) - - def set_frozen(self, bint is_frozen): - # TODO - self.is_frozen = is_frozen - - def flush_oov(self): - self._oov = PreshMap() - - cdef const Utf8Str* intern_unicode(self, unicode py_string): - # 0 means missing, but we don't bother offsetting the index. - cdef bytes byte_string = py_string.encode('utf8') - return self._intern_utf8(byte_string, len(byte_string)) - - @cython.final - cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length): - # TODO: This function's API/behaviour is an unholy mess... - # 0 means missing, but we don't bother offsetting the index. - cdef hash_t key = hash_utf8(utf8_string, length) - cdef Utf8Str* value = self._map.get(key) - if value is not NULL: - return value - value = self._oov.get(key) - if value is not NULL: - return value - if self.is_frozen: - # OOV store uses 32 bit hashes. Pretty ugly :( - key32 = hash32_utf8(utf8_string, length) - # Important: Make the OOV store own the memory. That way it's trivial - # to flush them all. - value = self._oov.mem.alloc(1, sizeof(Utf8Str)) - value[0] = _allocate(self._oov.mem, utf8_string, length) - self._oov.set(key32, value) - return NULL - - if self.size == self._resize_at: - self._realloc() - self.c[self.size] = _allocate(self.mem, utf8_string, length) - self._map.set(key, &self.c[self.size]) - self.size += 1 - return &self.c[self.size-1] - - def to_disk(self, path): - """ - Save the strings to a JSON file. - - Arguments: - path (unicode / Path): The file path to save the strings. - Returns: - None - """ - with open(path, 'w') as file_: - string_data = srsly.json_dumps(list(self)) - if not isinstance(string_data, unicode): - string_data = string_data.decode('utf8') - file_.write(string_data) - - def from_disk(self, path): - """ - Load the strings from a JSON file. - - Arguments: - path (unicode / Path): The file from which to load the strings. - Returns: - None - """ - strings = srsly.read_json(path) - if strings == ['']: - return None - cdef unicode string - for string in strings: - # explicit None/len check instead of simple truth testing - # (bug in Cython <= 0.23.4) - if string is not None and len(string): - self.intern_unicode(string) - - def _realloc(self): - # We want to map straight to pointers, but they'll be invalidated if - # we resize our array. So, first we remap to indices, then we resize, - # then we can acquire the new pointers. 
- cdef Pool tmp_mem = Pool() - keys = tmp_mem.alloc(self.size, sizeof(key_t)) - cdef key_t key - cdef void* value - cdef const Utf8Str ptr - cdef int i = 0 - cdef size_t offset - while map_iter(self._map.c_map, &i, &key, &value): - # Find array index with pointer arithmetic - offset = ((value) - self.c) - keys[offset] = key - - self._resize_at *= 2 - cdef size_t new_size = self._resize_at * sizeof(Utf8Str) - self.c = self.mem.realloc(self.c, new_size) - - self._map = PreshMap(self.size) - for i in range(self.size): - if keys[i]: - self._map.set(keys[i], &self.c[i]) diff --git a/sense2vec/about.py b/sense2vec/about.py index c1171b5..bae518e 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,8 +1,7 @@ __title__ = "sense2vec" -__version__ = "1.0.0a1" +__version__ = "1.0.0a2" __summary__ = "Use NLP to go beyond vanilla word2vec" __uri__ = "https://github.com/explosion/sense2vec" -__author__ = "Explosion AI" +__author__ = "Explosion" __email__ = "contact@explosion.ai" __license__ = "MIT" -__release__ = True diff --git a/sense2vec/cfile.pxd b/sense2vec/cfile.pxd deleted file mode 100644 index 2ca9e29..0000000 --- a/sense2vec/cfile.pxd +++ /dev/null @@ -1,12 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE -from cymem.cymem cimport Pool - -cdef class CFile: - cdef FILE* fp - cdef bint is_open - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1 - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1 - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except * diff --git a/sense2vec/cfile.pyx b/sense2vec/cfile.pyx deleted file mode 100644 index 7817594..0000000 --- a/sense2vec/cfile.pyx +++ /dev/null @@ -1,42 +0,0 @@ -from libc.stdio cimport fopen, fclose, fread, fwrite, FILE - - -cdef class CFile: - def __init__(self, loc, mode): - if isinstance(mode, unicode): - mode_str = mode.encode('ascii') - else: - mode_str = mode - cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc - self.fp = fopen(bytes_loc, mode_str) - if self.fp == NULL: - raise IOError("Could not open binary file %s" % bytes_loc) - self.is_open = True - - def __dealloc__(self): - if self.is_open: - fclose(self.fp) - - def close(self): - fclose(self.fp) - self.is_open = False - - cdef int read_into(self, void* dest, size_t number, size_t elem_size) except -1: - st = fread(dest, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef int write_from(self, void* src, size_t number, size_t elem_size) except -1: - st = fwrite(src, elem_size, number, self.fp) - if st != number: - raise IOError - - cdef void* alloc_read(self, Pool mem, size_t number, size_t elem_size) except *: - cdef void* dest = mem.alloc(number, elem_size) - self.read_into(dest, number, elem_size) - return dest - - def write_unicode(self, unicode value): - cdef bytes py_bytes = value.encode('utf8') - cdef char* chars = py_bytes - self.write(sizeof(char), len(py_bytes), chars) diff --git a/sense2vec/tests/conftest.py b/sense2vec/tests/conftest.py deleted file mode 100644 index fc66ee7..0000000 --- a/sense2vec/tests/conftest.py +++ /dev/null @@ -1,16 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest - - -def pytest_addoption(parser): - parser.addoption( - "--models", action="http://github.com/store_true", help="include tests that require full models" - ) - - -def pytest_runtest_setup(item): - for opt in ["models"]: - if opt in item.keywords and not 
item.config.getoption("--%s" % opt): - pytest.skip("need --%s option to run" % opt) diff --git a/sense2vec/tests/test_component.py b/sense2vec/tests/test_component.py new file mode 100644 index 0000000..dc9a982 --- /dev/null +++ b/sense2vec/tests/test_component.py @@ -0,0 +1,69 @@ +# coding: utf8 +from __future__ import unicode_literals + +import pytest +import numpy +from spacy.vocab import Vocab +from spacy.tokens import Doc, Span +from sense2vec import Sense2VecComponent + + +@pytest.fixture +def doc(): + vocab = Vocab() + doc = Doc(vocab, words=["hello", "world"]) + doc[0].pos_ = "INTJ" + doc[1].pos_ = "NOUN" + return doc + + +def test_component_attributes(doc): + s2v = Sense2VecComponent(doc.vocab, shape=(10, 4)) + vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.s2v.add("world|NOUN", vector) + doc = s2v(doc) + assert doc[0]._.s2v_key == "hello|INTJ" + assert doc[1]._.s2v_key == "world|NOUN" + assert doc[0]._.in_s2v is False + assert doc[1]._.in_s2v is True + assert numpy.array_equal(doc[1]._.s2v_vec, vector) + + +def test_component_attributes_ents(doc): + s2v = Sense2VecComponent(doc.vocab, shape=(10, 4)) + s2v.first_run = False + vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.s2v.add("world|NOUN", vector) + s2v.s2v.add("world|GPE", vector) + doc = s2v(doc) + assert len(doc._.s2v_phrases) == 0 + doc.ents = [Span(doc, 1, 2, label="GPE")] + assert len(doc._.s2v_phrases) == 1 + phrase = doc._.s2v_phrases[0] + assert phrase._.s2v_key == "world|GPE" + assert phrase[0]._.s2v_key == "world|NOUN" + assert phrase._.in_s2v is True + assert phrase[0]._.in_s2v is True + + +def test_component_to_from_bytes(doc): + s2v = Sense2VecComponent(doc.vocab, shape=(1, 4)) + s2v.first_run = False + vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.s2v.add("world|NOUN", vector) + assert "world|NOUN" in s2v.s2v + assert "world|GPE" not in s2v.s2v + doc = s2v(doc) + assert doc[0]._.in_s2v is False + assert doc[1]._.in_s2v is True + s2v_bytes = s2v.to_bytes() + new_s2v = Sense2VecComponent(doc.vocab).from_bytes(s2v_bytes) + new_s2v.first_run = False + assert "world|NOUN" in new_s2v.s2v + assert numpy.array_equal(new_s2v.s2v["world|NOUN"], vector) + assert "world|GPE" not in new_s2v.s2v + new_s2v.s2v.vectors.resize((2, 4)) + new_s2v.s2v.add("hello|INTJ", vector) + assert doc[0]._.in_s2v is False + new_doc = new_s2v(doc) + assert new_doc[0]._.in_s2v is True diff --git a/sense2vec/tests/test_sense2vec.py b/sense2vec/tests/test_sense2vec.py index 2460e23..d051e23 100644 --- a/sense2vec/tests/test_sense2vec.py +++ b/sense2vec/tests/test_sense2vec.py @@ -2,22 +2,63 @@ from __future__ import unicode_literals import pytest -from os import path +import numpy +from sense2vec import Sense2Vec -from .. 
import load +def test_sense2vec_object(): + s2v = Sense2Vec(shape=(10, 4)) + assert s2v.vectors.shape == (10, 4) + assert len(s2v) == 10 + test_vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.add("test", test_vector) + assert "test" in s2v + assert isinstance(s2v.strings["test"], int) + assert s2v.strings["test"] in s2v + assert "foo" not in s2v + assert numpy.array_equal(s2v["test"], test_vector) + assert numpy.array_equal(s2v[s2v.strings["test"]], test_vector) + assert list(s2v.keys()) == ["test"] + s2v.add("test2", test_vector) + assert "test2" in s2v + assert sorted(list(s2v.keys())) == ["test", "test2"] -data_path = path.join(path.dirname(__file__), "..", "..", "data") +def test_sense2vec_most_similar(): + s2v = Sense2Vec(shape=(6, 4)) + s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) + s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32)) + s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32)) + s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32)) + s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) + s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32)) + result1 = s2v.most_similar(["x"]) + assert len(result1) + assert result1[0][0] == "a" + # assert result1[0][1] == 1.0 + result2 = s2v.most_similar(["y"]) + assert len(result2) == 0 -@pytest.mark.models -@pytest.mark.parametrize("model", ["reddit_vectors-1.1.0"]) -def test_sample(model): - s2v = load(path.join(data_path, model)) - freq, query_vector = s2v["beekeepers|NOUN"] - assert freq is not None - assert s2v.most_similar(query_vector, 3)[0] == [ - "beekeepers|NOUN", - "honey_bees|NOUN", - "Beekeepers|NOUN", - ] + +def test_sense2vec_to_from_bytes(): + s2v = Sense2Vec(shape=(2, 4)) + test_vector1 = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32) + test_vector2 = numpy.asarray([5, 6, 7, 8], dtype=numpy.float32) + s2v.add("test1", test_vector1) + s2v.add("test2", test_vector2) + s2v_bytes = s2v.to_bytes() + new_s2v = Sense2Vec().from_bytes(s2v_bytes) + assert len(new_s2v) == 2 + assert new_s2v.vectors.shape == (2, 4) + assert "test1" in new_s2v + assert "test2" in new_s2v + assert numpy.array_equal(new_s2v["test1"], test_vector1) + assert numpy.array_equal(new_s2v["test2"], test_vector2) + assert s2v_bytes == new_s2v.to_bytes() + s2v_bytes2 = s2v.to_bytes(exclude=["strings"]) + new_s2v2 = Sense2Vec().from_bytes(s2v_bytes2) + assert len(new_s2v2.strings) == 0 + assert "test1" in new_s2v2 + assert s2v.strings["test1"] in new_s2v2 + with pytest.raises(KeyError): # can't resolve hash + new_s2v2.strings[s2v.strings["test2"]] diff --git a/sense2vec/tests/test_vectors.py b/sense2vec/tests/test_vectors.py deleted file mode 100644 index 64825f1..0000000 --- a/sense2vec/tests/test_vectors.py +++ /dev/null @@ -1,50 +0,0 @@ -# coding: utf8 -from __future__ import unicode_literals - -import pytest -import numpy - -from ..vectors import VectorStore - - -def test_init(): - vec = VectorStore(128) - assert vec.mem is not None - with pytest.raises(AttributeError) as excinfo: - vec.mem = None - - -def test_add(): - vecs = VectorStore(128) - good = numpy.ndarray(shape=(vecs.nr_dim,), dtype="float32") - vecs.add(good) - bad = numpy.ndarray(shape=(vecs.nr_dim + 1,), dtype="float32") - with pytest.raises(AssertionError) as excinfo: - vecs.add(bad) - - -@pytest.mark.xfail -def test_borrow(): - vecs = VectorStore(128) - good = numpy.ndarray(shape=(vecs.nr_dim,), dtype="float32") - vecs.borrow(good) - bad = numpy.ndarray(shape=(vecs.nr_dim + 1,), dtype="float32") - with 
pytest.raises(AssertionError) as excinfo: - vecs.borrow(bad) - - -@pytest.mark.xfail -def test_most_similar(): - vecs = VectorStore(4) - vecs.add(numpy.asarray([4, 2, 2, 2], dtype="float32")) - vecs.add(numpy.asarray([4, 4, 2, 2], dtype="float32")) - vecs.add(numpy.asarray([4, 4, 4, 2], dtype="float32")) - vecs.add(numpy.asarray([4, 4, 4, 4], dtype="float32")) - - indices, scores = vecs.most_similar(numpy.asarray([4, 2, 2, 2], dtype="float32"), 4) - print(list(scores)) - assert list(indices) == [0, 1] - indices, scores = vecs.most_similar( - numpy.asarray([0.1, 1, 1, 1], dtype="float32"), 4 - ) - assert list(indices) == [4, 3] diff --git a/sense2vec/util.py b/sense2vec/util.py index e6be9a2..7f31cf0 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -1,18 +1,8 @@ # coding: utf8 from __future__ import unicode_literals - -def filter_spans(spans): - # Filter a sequence of spans so they don't contain overlaps - get_sort_key = lambda span: (span.end - span.start, span.start) - sorted_spans = sorted(spans, key=get_sort_key, reverse=True) - result = [] - seen_tokens = set() - for span in sorted_spans: - if span.start not in seen_tokens and span.end - 1 not in seen_tokens: - result.append(span) - seen_tokens.update(range(span.start, span.end)) - return result +from spacy.tokens import Token, Span +from spacy.util import filter_spans def transform_doc(doc): @@ -20,11 +10,7 @@ def transform_doc(doc): Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. """ - spans = list(doc.ents) - for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): - np = np[1:] - spans.append(np) + spans = get_phrases(doc) spans = filter_spans(spans) with doc.retokenize() as retokenizer: for span in spans: @@ -32,3 +18,39 @@ def transform_doc(doc): attrs = {"tag": root.tag_, "lemma": root.lemma_, "ent_type": root.ent_type_} retokenizer.merge(span, attrs=attrs) return doc + + +def make_key(obj): + text = obj.text.replace(" ", "_") + if isinstance(obj, Token): + return text + "|" + obj.pos_ + elif isinstance(obj, Span): + if obj.label_: + return text + "|" + obj.label_ + return text + "|" + obj.root.pos_ + return text + + +def split_key(key): + return tuple(key.replace("_", " ").rsplit("|", 1)) + + +def make_token_key(token): + return token.text.replace(" ", "_") + "|" + token.pos_ + + +def make_span_key(span): + text = span.text.replace(" ", "_") + if span.label_: + return text + "|" + span.label_ + return text + "|" + span.root.pos_ + + +def get_phrases(doc): + spans = list(doc.ents) + if doc.is_parsed: + for np in doc.noun_chunks: + while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): + np = np[1:] + spans.append(np) + return spans diff --git a/sense2vec/vectors.pxd b/sense2vec/vectors.pxd deleted file mode 100644 index 9c98aa4..0000000 --- a/sense2vec/vectors.pxd +++ /dev/null @@ -1,28 +0,0 @@ -from libcpp.vector cimport vector -from preshed.maps cimport PreshMap -from murmurhash.mrmr cimport hash64 -from cymem.cymem cimport Pool -from ._strings cimport StringStore, hash_string - - -cdef class VectorMap: - cdef readonly Pool mem - cdef readonly VectorStore data - cdef readonly StringStore strings - cdef readonly PreshMap freqs - - -cdef class VectorStore: - cdef readonly Pool mem - cdef readonly PreshMap cache - cdef vector[float*] vectors - cdef vector[float] norms - cdef vector[float] _similarities - cdef readonly int nr_dim - - -cdef float get_l2_norm(const float* vec, int n) 
nogil - - -cdef float cosine_similarity(const float* v1, const float* v2, - float norm1, float norm2, int n) nogil diff --git a/sense2vec/vectors.pyx b/sense2vec/vectors.pyx deleted file mode 100644 index 2afec80..0000000 --- a/sense2vec/vectors.pyx +++ /dev/null @@ -1,358 +0,0 @@ -# cython: profile=True -# cython: cdivision=True -# cython: infer_types=True -cimport cython.parallel -cimport cpython.array - -from libc.stdint cimport int32_t -from libc.stdint cimport uint64_t -from libc.string cimport memcpy -from libc.math cimport sqrt - -from libcpp.pair cimport pair -from libcpp.queue cimport priority_queue -from libcpp.vector cimport vector -from preshed.maps cimport PreshMap -from murmurhash.mrmr cimport hash64 - -from cymem.cymem cimport Pool -cimport numpy as np -import numpy -import srsly -from os import path - -from .cfile cimport CFile -from ._strings cimport StringStore, hash_string - - -ctypedef pair[float, int] Entry -ctypedef priority_queue[Entry] Queue -ctypedef float (*do_similarity_t)(const float* v1, const float* v2, - float nrm1, float nrm2, int nr_dim) nogil - - -cdef struct _CachedResult: - int* indices - float* scores - int n - - -cdef class VectorMap: - '''Provide key-based access into the VectorStore. Keys are unicode strings. - Also manage freqs.''' - def __init__(self, nr_dim): - self.data = VectorStore(nr_dim) - self.strings = StringStore() - self.freqs = PreshMap() - - @property - def nr_dim(self): - return self.data.nr_dim - - def __len__(self): - '''Number of entries in the map. - - Returns: length int >= 0 - ''' - return self.data.vectors.size() - - def __contains__(self, unicode string): - '''Check whether the VectorMap has a given key. - - Returns: has_key bool - ''' - cdef uint64_t hashed = hash_string(string) - return bool(self.freqs[hashed]) - - def __getitem__(self, unicode key): - '''Retrieve a (frequency, vector) tuple from the vector map, or - raise KeyError if the key is not found. - - Arguments: - key unicode - - Returns: - tuple[int, float32[:self.nr_dim]] - ''' - cdef uint64_t hashed = hash_string(key) - freq = self.freqs[hashed] - if not freq: - raise KeyError(key) - else: - i = self.strings[key] - return freq, self.data[i] - - def __setitem__(self, unicode key, value): - '''Assign a (frequency, vector) tuple to the vector map. - - Arguments: - key unicode - value tuple[int, float32[:self.nr_dim]] - Returns: - None - ''' - # TODO: Handle case where we're over-writing an existing entry. - cdef int freq - cdef float[:] vector - freq, vector = value - idx = self.strings[key] - cdef uint64_t hashed = hash_string(key) - self.freqs[hashed] = freq - assert self.data.vectors.size() == idx - self.data.add(vector) - - def __iter__(self): - '''Iterate over the keys in the map, in order of insertion. - - Generates: - key unicode - ''' - yield from self.strings - - def keys(self): - '''Iterate over the keys in the map, in order of insertion. - - Generates: - key unicode - ''' - yield from self.strings - - def values(self): - '''Iterate over the values in the map, in order of insertion. - - Generates: - (freq,vector) tuple[int, float32[:self.nr_dim]] - ''' - for key, value in self.items(): - yield value - - def items(self): - '''Iterate over the items in the map, in order of insertion. 
- - Generates: - (key, (freq,vector)): tuple[int, float32[:self.nr_dim]] - ''' - cdef uint64_t hashed - for i, string in enumerate(self.strings): - hashed = hash_string(string) - freq = self.freqs[hashed] - yield string, (freq, self.data[i]) - - - def most_similar(self, float[:] vector, int n=10): - '''Find the keys of the N most similar entries, given a vector. - - Arguments: - vector float[:] - n int default=10 - - Returns: - list[unicode] length<=n - ''' - indices, scores = self.data.most_similar(vector, n) - return [self.strings[idx] for idx in indices], scores - - def add(self, unicode string, int freq, float[:] vector): - '''Insert a vector into the map by value. Makes a copy of the vector. - ''' - idx = self.strings[string] - cdef uint64_t hashed = hash_string(string) - self.freqs[hashed] = freq - assert self.data.vectors.size() == idx - self.data.add(vector) - - def borrow(self, unicode string, int freq, float[:] vector): - '''Insert a vector into the map by reference. Does not copy the data, and - changes to the vector will be reflected in the VectorMap. - - The user is responsible for ensuring that another reference to the vector - is maintained --- otherwise, the Python interpreter will free the memory, - potentially resulting in an invalid read. - ''' - idx = self.strings[string] - cdef uint64_t hashed = hash_string(string) - self.freqs[hashed] = freq - assert self.data.vectors.size() == idx - self.data.borrow(vector) - - def save(self, data_dir): - '''Serialize to a directory. - - * data_dir/strings.json --- The keys, in insertion order. - * data_dir/freqs.json --- The frequencies. - * data_dir/vectors.bin --- The vectors. - ''' - self.strings.to_disk(path.join(data_dir, 'strings.json')) - self.data.save(path.join(data_dir, 'vectors.bin')) - freqs = [] - cdef uint64_t hashed - for string in self.strings: - hashed = hash_string(string) - freq = self.freqs[hashed] - if not freq: - continue - freqs.append([string, freq]) - srsly.write_json(path.join(data_dir, "freqs.json")) - - def load(self, data_dir): - '''Load from a directory: - - * data_dir/strings.json --- The keys, in insertion order. - * data_dir/freqs.json --- The frequencies. - * data_dir/vectors.bin --- The vectors. - ''' - self.data.load(path.join(data_dir, 'vectors.bin')) - self.strings.from_disk(path.join(data_dir, 'strings.json')) - freqs = srsly.read_json(path.join(data_dir, "freqs.json")) - cdef uint64_t hashed - for string, freq in freqs: - hashed = hash_string(string) - self.freqs[hashed] = freq - - -cdef class VectorStore: - '''Maintain an array of float* pointers for word vectors, which the - table may or may not own. Keys and frequencies sold separately --- - we're just a dumb vector of data, that knows how to run linear-scan - similarity queries.''' - def __init__(self, int nr_dim): - self.mem = Pool() - self.nr_dim = nr_dim - zeros = self.mem.alloc(self.nr_dim, sizeof(float)) - self.vectors.push_back(zeros) - self.norms.push_back(0) - self.cache = PreshMap(100000) - - def __getitem__(self, int i): - cdef float* ptr = self.vectors.at(i) - cv = ptr - return numpy.asarray(cv, dtype='float32') - - def add(self, float[:] vec): - assert len(vec) == self.nr_dim - ptr = self.mem.alloc(self.nr_dim, sizeof(float)) - memcpy(ptr, - &vec[0], sizeof(ptr[0]) * self.nr_dim) - self.norms.push_back(get_l2_norm(&ptr[0], self.nr_dim)) - self.vectors.push_back(ptr) - - def borrow(self, float[:] vec): - self.norms.push_back(get_l2_norm(&vec[0], self.nr_dim)) - # Danger! User must ensure this is memory contiguous! 
- self.vectors.push_back(&vec[0]) - - def similarity(self, float[:] v1, float[:] v2): - '''Measure the similarity between two vectors, using cosine. - - Arguments: - v1 float[:] - v2 float[:] - - Returns: - similarity_score -1self.cache.get(cache_key) - if cached_result is not NULL and cached_result.n == n: - memcpy(&indices[0], cached_result.indices, sizeof(indices[0]) * n) - memcpy(&scores[0], cached_result.scores, sizeof(scores[0]) * n) - else: - # This shouldn't happen. But handle it if it does - if cached_result is not NULL: - if cached_result.indices is not NULL: - self.mem.free(cached_result.indices) - if cached_result.scores is not NULL: - self.mem.free(cached_result.scores) - self.mem.free(cached_result) - self._similarities.resize(self.vectors.size()) - linear_similarity(&indices[0], &scores[0], &self._similarities[0], - n, &query[0], self.nr_dim, - &self.vectors[0], &self.norms[0], self.vectors.size(), - cosine_similarity) - cached_result = <_CachedResult*>self.mem.alloc(sizeof(_CachedResult), 1) - cached_result.n = n - cached_result.indices = self.mem.alloc( - sizeof(cached_result.indices[0]), n) - cached_result.scores = self.mem.alloc( - sizeof(cached_result.scores[0]), n) - self.cache.set(cache_key, cached_result) - memcpy(cached_result.indices, &indices[0], sizeof(indices[0]) * n) - memcpy(cached_result.scores, &scores[0], sizeof(scores[0]) * n) - return indices, scores - - def save(self, loc): - cdef CFile cfile = CFile(loc, 'wb') - cdef float* vec - cdef int32_t nr_vector = self.vectors.size() - cfile.write_from(&nr_vector, 1, sizeof(nr_vector)) - cfile.write_from(&self.nr_dim, 1, sizeof(self.nr_dim)) - for vec in self.vectors: - cfile.write_from(vec, self.nr_dim, sizeof(vec[0])) - cfile.close() - - def load(self, loc): - cdef CFile cfile = CFile(loc, 'rb') - cdef int32_t nr_vector - cfile.read_into(&nr_vector, 1, sizeof(nr_vector)) - cfile.read_into(&self.nr_dim, 1, sizeof(self.nr_dim)) - cdef vector[float] tmp - tmp.resize(self.nr_dim) - cdef float[:] cv - for i in range(nr_vector): - cfile.read_into(&tmp[0], self.nr_dim, sizeof(tmp[0])) - ptr = &tmp[0] - cv = ptr - if i >= 1: - self.add(cv) - cfile.close() - - -cdef void linear_similarity(int* indices, float* scores, float* tmp, - int nr_out, const float* query, int nr_dim, - const float* const* vectors, const float* norms, int nr_vector, - do_similarity_t get_similarity) nogil: - query_norm = get_l2_norm(query, nr_dim) - # Initialize the partially sorted heap - cdef int i - cdef float score - for i in cython.parallel.prange(nr_vector, nogil=True): - tmp[i] = get_similarity(query, vectors[i], query_norm, norms[i], nr_dim) - cdef priority_queue[pair[float, int]] queue - cdef float cutoff = 0 - for i in range(nr_vector): - score = tmp[i] - if score > cutoff: - queue.push(pair[float, int](-score, i)) - cutoff = -queue.top().first - if queue.size() > nr_out: - queue.pop() - # Fill the outputs - i = 0 - while i < nr_out and not queue.empty(): - entry = queue.top() - scores[nr_out-(i+1)] = -entry.first - indices[nr_out-(i+1)] = entry.second - queue.pop() - i += 1 - - -cdef float get_l2_norm(const float* vec, int n) nogil: - norm = 0.0 - for i in range(n): - norm += vec[i] ** 2 - return sqrt(norm) - - -cdef float cosine_similarity(const float* v1, const float* v2, - float norm1, float norm2, int n) nogil: - dot = 0.0 - for i in range(n): - dot += v1[i] * v2[i] - return dot / (norm1 * norm2) diff --git a/setup.py b/setup.py index e2bcb5e..6429f75 100644 --- a/setup.py +++ b/setup.py @@ -1,169 +1,56 @@ #!/usr/bin/env python 
-from __future__ import print_function +from __future__ import unicode_literals import os import io -import subprocess -import sys -import contextlib -from distutils.command.build_ext import build_ext -from distutils.sysconfig import get_python_inc -from distutils import ccompiler, msvccompiler - -try: - from setuptools import Extension, setup -except ImportError: - from distutils.core import Extension, setup - - -PACKAGES = ["sense2vec", "sense2vec.tests"] - -MOD_NAMES = ["sense2vec.vectors", "sense2vec.cfile", "sense2vec._strings"] - - -# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options -# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used -compile_options = { - "msvc": ["/Ox", "/EHsc"], - "other": ["-O3", "-Wno-unused-function", "-fno-stack-protector"], -} -link_options = {"msvc": [], "other": ["-fno-stack-protector"]} - - -if os.environ.get("USE_BLAS") == "1": - compile_options["other"].extend(["-DUSE_BLAS=1", "-fopenmp"]) -# else: -# link_options['other'].extend([ -# '-fopenmp']) -# - - -class build_ext_subclass(build_ext): - def build_extensions(self): - for e in self.extensions: - e.extra_compile_args = compile_options.get( - self.compiler.compiler_type, compile_options["other"] - ) - for e in self.extensions: - e.extra_link_args = link_options.get( - self.compiler.compiler_type, link_options["other"] - ) - build_ext.build_extensions(self) - - -def generate_cython(root, source): - print("Cythonizing sources") - p = subprocess.call( - [sys.executable, os.path.join(root, "bin", "cythonize.py"), source] - ) - if p != 0: - raise RuntimeError("Running cythonize failed") - - -def is_source_release(path): - return os.path.exists(os.path.join(path, "PKG-INFO")) - - -def clean(path): - for name in MOD_NAMES: - name = name.replace(".", "/") - for ext in [".so", ".html", ".cpp", ".c"]: - file_path = os.path.join(path, name + ext) - if os.path.exists(file_path): - os.unlink(file_path) - - -@contextlib.contextmanager -def chdir(new_dir): - old_dir = os.getcwd() - try: - os.chdir(new_dir) - sys.path.insert(0, new_dir) - yield - finally: - del sys.path[0] - os.chdir(old_dir) +from setuptools import setup, find_packages def setup_package(): + package_name = "sense2vec" root = os.path.abspath(os.path.dirname(__file__)) - src_path = "sense2vec" - - if len(sys.argv) > 1 and sys.argv[1] == "clean": - return clean(root) - - with chdir(root): - with io.open(os.path.join(root, src_path, "about.py"), encoding="utf8") as f: - about = {} - exec(f.read(), about) - with io.open(os.path.join(root, "README.md"), encoding="utf8") as f: - readme = f.read() - - include_dirs = [ - get_python_inc(plat_specific=True), - os.path.join(root, "include"), - ] - - if ( - ccompiler.new_compiler().compiler_type == "msvc" - and msvccompiler.get_build_version() == 9 - ): - include_dirs.append(os.path.join(root, "include", "msvc9")) - - ext_modules = [] - for mod_name in MOD_NAMES: - mod_path = mod_name.replace(".", "/") + ".cpp" - ext_modules.append( - Extension( - mod_name, [mod_path], language="c++", include_dirs=include_dirs - ) - ) - - if not is_source_release(root): - generate_cython(root, src_path) - - setup( - name="sense2vec", - zip_safe=False, - packages=PACKAGES, - package_data={"": ["*.pyx", "*.pxd", "*.h"]}, - description=about["__summary__"], - long_description=readme, - long_description_content_type="text/markdown", - author=about["__author__"], - author_email=about["__email__"], - 
version=about["__version__"], - url=about["__uri__"], - license=about["__license__"], - ext_modules=ext_modules, - install_requires=[ - "numpy>=1.15.0", - "srsly>=0.1.0,<1.1.0", - "preshed>=2.0.1,<2.1.0", - "murmurhash>=0.28.0,<1.1.0", - "cymem>=2.0.2,<2.1.0", - ], - classifiers=[ - "Development Status :: 4 - Beta", - "Environment :: Console", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: MIT License", - "Operating System :: POSIX :: Linux", - "Operating System :: MacOS :: MacOS X", - "Operating System :: Microsoft :: Windows", - "Programming Language :: Cython", - "Programming Language :: Python :: 2", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.4", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", - "Topic :: Scientific/Engineering", - ], - cmdclass={"build_ext": build_ext_subclass}, - ) + # Read in package meta from about.py + about_path = os.path.join(root, package_name, "about.py") + with io.open(about_path, encoding="utf8") as f: + about = {} + exec(f.read(), about) + + # Get readme + readme_path = os.path.join(root, "README.md") + with io.open(readme_path, encoding="utf8") as f: + readme = f.read() + + setup( + name="sense2vec", + description=about["__summary__"], + long_description=readme, + long_description_content_type="text/markdown", + author=about["__author__"], + author_email=about["__email__"], + url=about["__uri__"], + version=about["__version__"], + license=about["__license__"], + packages=find_packages(), + install_requires=["spacy>=2.1.0", "numpy>=1.15.0", "srsly>=0.1.0,<1.1.0"], + python_requires=">=3.6", + entry_points={ + "spacy_factories": ["sense2vec = sense2vec:Sense2VecComponent.from_nlp"] + }, + classifiers=[ + "Development Status :: 5 - Production/Stable", + "Intended Audience :: Developers", + "Intended Audience :: Science/Research", + "Topic :: Scientific/Engineering :: Artificial Intelligence", + "License :: OSI Approved :: MIT License", + "Operating System :: POSIX :: Linux", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + ], + zip_safe=False, + ) if __name__ == "__main__": From 2dcc0b62246d4ba1d9cfda8d5ef407ce469273d0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Sep 2019 15:14:11 +0200 Subject: [PATCH 057/297] Move tests --- azure-pipelines.yml | 2 +- {sense2vec/tests => tests}/__init__.py | 0 {sense2vec/tests => tests}/test_component.py | 0 {sense2vec/tests => tests}/test_sense2vec.py | 0 4 files changed, 1 insertion(+), 1 deletion(-) rename {sense2vec/tests => tests}/__init__.py (100%) rename {sense2vec/tests => tests}/test_component.py (100%) rename {sense2vec/tests => tests}/test_sense2vec.py (100%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e6e7df7..32759af 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -57,5 +57,5 @@ jobs: - script: pip install -e . 
displayName: 'Build and install' - - script: python -m pytest sense2vec + - script: python -m pytest tests displayName: 'Run tests' diff --git a/sense2vec/tests/__init__.py b/tests/__init__.py similarity index 100% rename from sense2vec/tests/__init__.py rename to tests/__init__.py diff --git a/sense2vec/tests/test_component.py b/tests/test_component.py similarity index 100% rename from sense2vec/tests/test_component.py rename to tests/test_component.py diff --git a/sense2vec/tests/test_sense2vec.py b/tests/test_sense2vec.py similarity index 100% rename from sense2vec/tests/test_sense2vec.py rename to tests/test_sense2vec.py From 7d5c1cc886ac1d296e572e768f9f60ca88308011 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Sep 2019 15:14:25 +0200 Subject: [PATCH 058/297] Add error message if keys is string etc. --- sense2vec/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 049ec1a..36f6d25 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -56,6 +56,8 @@ def values(self): yield from self.vectors.values() def most_similar(self, keys, n_similar=10): + if not isinstance(keys, (list, tuple)): + raise ValueError("Expected iterable of keys. Got: {}".format(type(keys))) vecs = [self[key] for key in keys if key in self] arr = numpy.asarray(vecs, dtype=numpy.float32) result_keys, _, scores = self.vectors.most_similar(arr) From e85fd9639f5a50c8f397b9b708c1e7e4ef2cc006 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Sep 2019 15:15:15 +0200 Subject: [PATCH 059/297] Add test data and model tests stub --- .gitignore | 1 - tests/data/key2row | Bin 0 -> 51 bytes tests/data/strings.json | 7 +++++++ tests/data/vectors | Bin 0 -> 2688 bytes tests/test_model.py | 17 +++++++++++++++++ 5 files changed, 24 insertions(+), 1 deletion(-) create mode 100644 tests/data/key2row create mode 100644 tests/data/strings.json create mode 100644 tests/data/vectors create mode 100644 tests/test_model.py diff --git a/.gitignore b/.gitignore index e310d67..e3f8bea 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ tmp/ -data/ cythonize.dat *.cpp .pytest_cache diff --git a/tests/data/key2row b/tests/data/key2row new file mode 100644 index 0000000000000000000000000000000000000000..9d868d159cda5c77bc6773f19941028773396124 GIT binary patch literal 51 zcmV-30L=e|&lmvJq|Quf%mB|xK5*66JlTK&&v1X1Og;0k90JcK>9i|?)TKrP&u9GU Ju$A;wD+HC18L|KX literal 0 HcmV?d00001 diff --git a/tests/data/strings.json b/tests/data/strings.json new file mode 100644 index 0000000..19b4ce3 --- /dev/null +++ b/tests/data/strings.json @@ -0,0 +1,7 @@ +[ + "beekeepers|NOUN", + "duck|VERB", + "honey_bees|NOUN", + "Beekeepers|NOUN", + "duck|NOUN" +] \ No newline at end of file diff --git a/tests/data/vectors b/tests/data/vectors new file mode 100644 index 0000000000000000000000000000000000000000..5c50142422d787bd3b78cdabcc88eb880d2598bd GIT binary patch literal 2688 zcmbW3=|9zL8^vjJGKDgf%1{|{5S7jmzx(<%5~58i(WFEZM>3?@uvN$`qDYeBlrdrN z``T?&6vSmY&C7zJ9)L-aG&Db>I8H+|upPk-fk2Bm3NZ_WtqVr*o?L$WwEY;=6pnT! 
[... remaining binary patch data for tests/data/vectors and the diff for tests/test_model.py truncated ...]

Date: Fri, 27 Sep 2019 16:40:39 +0200
Subject: [PATCH 060/297] Tidy up

---
 sense2vec/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py
index 36f6d25..c9c3882 100644
--- a/sense2vec/__init__.py
+++ b/sense2vec/__init__.py
@@ -59,8 +59,8 @@ def most_similar(self, keys, n_similar=10):
         if not isinstance(keys, (list, tuple)):
             raise ValueError("Expected iterable of keys. 
Got: {}".format(type(keys))) vecs = [self[key] for key in keys if key in self] - arr = numpy.asarray(vecs, dtype=numpy.float32) - result_keys, _, scores = self.vectors.most_similar(arr) + queries = numpy.asarray(vecs, dtype=numpy.float32) + result_keys, _, scores = self.vectors.most_similar(queries) result = zip(result_keys, scores) result = [(self.strings[key], score) for key, score in result if key] result = [(key, score) for key, score in result if key not in keys] @@ -115,7 +115,7 @@ def __init__( self.split_key = split_key @classmethod - def from_nlp(cls, nlp): + def from_nlp(cls, nlp, **kwargs): return cls(vocab=nlp.vocab) def __call__(self, doc): From d0d3c53e5a6da3cf77a6e6dd54535fbd8b5c7883 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 27 Sep 2019 20:15:54 +0200 Subject: [PATCH 061/297] Update, type hints and more methods --- .gitignore | 1 + LICENSE | 2 +- MANIFEST.in | 1 - azure-pipelines.yml | 15 - bin/preprocess.py | 76 +- bin/train.py | 51 +- include/cblas_shim.h | 76 - include/msvc9/stdint.h | 259 --- include/murmurhash/MurmurHash2.h | 22 - include/murmurhash/MurmurHash3.h | 28 - include/numpy/__multiarray_api.h | 1686 -------------- include/numpy/__ufunc_api.h | 323 --- include/numpy/_neighborhood_iterator_imp.h | 90 - include/numpy/_numpyconfig.h | 29 - include/numpy/arrayobject.h | 22 - include/numpy/arrayscalars.h | 175 -- include/numpy/halffloat.h | 69 - include/numpy/multiarray_api.txt | 2375 -------------------- include/numpy/ndarrayobject.h | 244 -- include/numpy/ndarraytypes.h | 1731 -------------- include/numpy/noprefix.h | 209 -- include/numpy/npy_3kcompat.h | 417 ---- include/numpy/npy_common.h | 930 -------- include/numpy/npy_cpu.h | 109 - include/numpy/npy_deprecated_api.h | 129 -- include/numpy/npy_endian.h | 46 - include/numpy/npy_interrupt.h | 117 - include/numpy/npy_math.h | 438 ---- include/numpy/npy_no_deprecated_api.h | 19 - include/numpy/npy_os.h | 30 - include/numpy/numpyconfig.h | 33 - include/numpy/old_defines.h | 187 -- include/numpy/oldnumeric.h | 23 - include/numpy/ufunc_api.txt | 312 --- include/numpy/ufuncobject.h | 448 ---- include/numpy/utils.h | 19 - requirements.txt | 2 +- sense2vec/__init__.py | 173 +- sense2vec/component.py | 92 + sense2vec/sense2vec.py | 125 ++ sense2vec/util.py | 53 +- setup.py | 2 +- tests/data/cfg | 1 + tests/test_sense2vec.py | 13 + 44 files changed, 339 insertions(+), 10863 deletions(-) delete mode 100644 include/cblas_shim.h delete mode 100644 include/msvc9/stdint.h delete mode 100644 include/murmurhash/MurmurHash2.h delete mode 100644 include/murmurhash/MurmurHash3.h delete mode 100644 include/numpy/__multiarray_api.h delete mode 100644 include/numpy/__ufunc_api.h delete mode 100644 include/numpy/_neighborhood_iterator_imp.h delete mode 100644 include/numpy/_numpyconfig.h delete mode 100644 include/numpy/arrayobject.h delete mode 100644 include/numpy/arrayscalars.h delete mode 100644 include/numpy/halffloat.h delete mode 100644 include/numpy/multiarray_api.txt delete mode 100644 include/numpy/ndarrayobject.h delete mode 100644 include/numpy/ndarraytypes.h delete mode 100644 include/numpy/noprefix.h delete mode 100644 include/numpy/npy_3kcompat.h delete mode 100644 include/numpy/npy_common.h delete mode 100644 include/numpy/npy_cpu.h delete mode 100644 include/numpy/npy_deprecated_api.h delete mode 100644 include/numpy/npy_endian.h delete mode 100644 include/numpy/npy_interrupt.h delete mode 100644 include/numpy/npy_math.h delete mode 100644 include/numpy/npy_no_deprecated_api.h delete mode 100644 
include/numpy/npy_os.h delete mode 100644 include/numpy/numpyconfig.h delete mode 100644 include/numpy/old_defines.h delete mode 100644 include/numpy/oldnumeric.h delete mode 100644 include/numpy/ufunc_api.txt delete mode 100644 include/numpy/ufuncobject.h delete mode 100644 include/numpy/utils.h create mode 100644 sense2vec/component.py create mode 100644 sense2vec/sense2vec.py create mode 100644 tests/data/cfg diff --git a/.gitignore b/.gitignore index e3f8bea..fade098 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ cythonize.dat *.cpp .pytest_cache .vscode +.mypy_cache # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/LICENSE b/LICENSE index 7d042a7..b8ba168 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2016 spaCy GmbH, 2016 ExplosionAI UG (haftungsbeschränkt) +Copyright (C) 2019 ExplosionAI GmbH Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/MANIFEST.in b/MANIFEST.in index 43d36be..c1a7121 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,2 @@ -recursive-include include *.h include LICENSE include README.md diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 32759af..dbd2d2d 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -8,21 +8,6 @@ jobs: - job: 'Test' strategy: matrix: - Python27Linux: - imageName: 'ubuntu-16.04' - python.version: '2.7' - Python27Mac: - imageName: 'macos-10.13' - python.version: '2.7' - Python35Linux: - imageName: 'ubuntu-16.04' - python.version: '3.5' - Python35Windows: - imageName: 'vs2017-win2016' - python.version: '3.5' - Python35Mac: - imageName: 'macos-10.13' - python.version: '3.5' Python36Linux: imageName: 'ubuntu-16.04' python.version: '3.6' diff --git a/bin/preprocess.py b/bin/preprocess.py index f3ad1e9..76dd6e3 100644 --- a/bin/preprocess.py +++ b/bin/preprocess.py @@ -1,73 +1,63 @@ #!/usr/bin/env python -# coding: utf-8 -"""This script can be used to preprocess a corpus for training a sense2vec -model. It take text file with one sentence per line, and outputs a text file -with one sentence per line in the expected sense2vec format (merged noun -phrases, concatenated phrases with underscores and added "senses"). - -Example input: -Rats, mould and broken furniture: the scandal of the UK's refugee housing - -Example output: -Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT -the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN - -DISCLAIMER: The sense2vec training and preprocessing tools are still a work in -progress. Please note that this script hasn't been optimised for efficiency yet -and doesn't paralellize or batch up any of the work, so you might have to -add this functionality yourself for now. -""" -from __future__ import print_function, unicode_literals - -from sense2vec import transform_doc +from sense2vec.util import merge_phrases, make_spacy_key import spacy from pathlib import Path from tqdm import tqdm -import re import plac -def represent_word(word): - if word.like_url: - return '%%URL|X' - text = re.sub(r'\s', '_', word.text) - tag = word.ent_type_ or word.pos_ or '?' 
-    return text + '|' + tag
-
-
 def represent_doc(doc):
     strings = []
     for sent in doc.sents:
         if sent.text.strip():
-            words = ' '.join(represent_word(w) for w in sent if not w.is_space)
+            words = " ".join(make_spacy_key(w) for w in sent if not w.is_space)
             strings.append(words)
-    return '\n'.join(strings) + '\n' if strings else ''
+    return "\n".join(strings) + "\n" if strings else ""
 
 
 @plac.annotations(
     in_file=("Path to input file", "positional", None, str),
     out_file=("Path to output file", "positional", None, str),
     spacy_model=("Name of spaCy model to use", "positional", None, str),
-    n_workers=("Number of workers", "option", "n", int))
-def main(in_file, out_file, spacy_model='en_core_web_sm', n_workers=4):
+    n_workers=("Number of workers", "option", "n", int),
+)
+def main(in_file, out_file, spacy_model="en_core_web_sm", n_workers=4):
+    """
+    This script can be used to preprocess a corpus for training a sense2vec
+    model. It takes a text file with one sentence per line, and outputs a text file
+    with one sentence per line in the expected sense2vec format (merged noun
+    phrases, concatenated phrases with underscores and added "senses").
+
+    Example input:
+    Rats, mould and broken furniture: the scandal of the UK's refugee housing
+
+    Example output:
+    Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
+    the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
+
+    DISCLAIMER: The sense2vec training and preprocessing tools are still a work
+    in progress. Please note that this script hasn't been optimised for
+    efficiency yet and doesn't parallelize or batch up any of the work, so you
+    might have to add this functionality yourself for now.
+    """
     input_path = Path(in_file)
     output_path = Path(out_file)
     if not input_path.exists():
-        raise IOError("Can't find input file: {}".format(input_path))
+        raise IOError(f"Can't find input file: {in_file}")
     nlp = spacy.load(spacy_model)
-    print("Using spaCy model {}".format(spacy_model))
-    nlp.add_pipe(transform_doc, name='sense2vec')
+    print(f"Using spaCy model {spacy_model}")
+    nlp.add_pipe(merge_phrases, name="merge_sense2vec_phrases")
     lines_count = 0
-    with input_path.open('r', encoding='utf8') as texts:
+    with input_path.open("r", encoding="utf8") as texts:
         docs = nlp.pipe(texts, n_threads=n_workers)
         lines = (represent_doc(doc) for doc in docs)
-        with output_path.open('w', encoding='utf8') as f:
-            for line in tqdm(lines, desc='Lines', unit=''):
+        with output_path.open("w", encoding="utf8") as f:
+            for line in tqdm(lines, desc="Lines", unit=""):
                 lines_count += 1
                 f.write(line)
-    print("Successfully preprocessed {} lines".format(lines_count))
-    print("{}".format(output_path.resolve()))
+    print(f"Successfully preprocessed {lines_count} lines")
+    print(output_path.resolve())
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     plac.call(main)
diff --git a/bin/train.py b/bin/train.py
index 6f3a121..4ac425c 100644
--- a/bin/train.py
+++ b/bin/train.py
@@ -1,6 +1,7 @@
 from gensim.models import Word2Vec
 from gensim.models.word2vec import PathLineSentences
-from sense2vec.vectors import VectorMap
+from sense2vec import Sense2Vec
+from sense2vec.util import split_key
 import plac
 
 
@@ -12,31 +13,53 @@
     window=("Context window size", "option", "w", int),
     min_count=("Min count", "option", "m", int),
     negative=("Number of negative samples", "option", "g", int),
-    nr_iter=("Number of iterations", "option", "i", int),)
-def train(in_dir, out_file, negative=5, n_workers=4, window=5, size=128,
-          min_count=10, nr_iter=2):
-    w2v_model = 
Word2Vec(size=size, window=window, min_count=min_count, - workers=workers, sample=1e-5, negative=negative, - iter=epochs) + nr_iter=("Number of iterations", "option", "i", int), +) +def main( + in_dir, + out_file, + negative=5, + n_workers=4, + window=5, + size=128, + min_count=10, + nr_iter=2, +): + w2v_model = Word2Vec( + size=size, + window=window, + min_count=min_count, + workers=n_workers, + sample=1e-5, + negative=negative, + iter=nr_iter, + ) sentences = PathLineSentences(in_dir) print("Building the vocabulary...") w2v_model.build_vocab(sentences) print("Training the model...") - w2v_model.train(sentences, total_examples=w2v_model.corpus_count, - epochs=w2v_model.iter) + w2v_model.train( + sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter + ) print("Creating the sense2vec model...") - vector_map = VectorMap(size) + vectors = [] + all_senses = set() for string in w2v_model.wv.vocab: vocab = w2v_model.wv.vocab[string] freq, idx = vocab.count, vocab.index if freq < min_count: continue vector = w2v_model.wv.vectors[idx] - vector_map.borrow(string, freq, vector) + vectors.append((string, freq, vector)) + _, sense = split_key(string) + all_senses.add(sense) + s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses) + for string, _, vector in vectors: + s2v.add(string, vector) print("Saving the model...") - vector_map.save(out_file) - print("Saved model to file: ", out_file) + s2v.to_disk(out_file) + print(f"Saved model to file: {out_file}") -if __name__ == '__main__': +if __name__ == "__main__": plac.call(main) diff --git a/include/cblas_shim.h b/include/cblas_shim.h deleted file mode 100644 index 96c9b90..0000000 --- a/include/cblas_shim.h +++ /dev/null @@ -1,76 +0,0 @@ -#ifdef __cplusplus -extern "C" -{ -#endif // __cplusplus -#ifdef USE_BLAS -#include - -int _use_blas() -{ - return 1; -} -#else // USE_BLAS -#include - -#if defined(_MSC_VER) -#define ALIGNAS(byte_alignment) __declspec(align(byte_alignment)) -#elif defined(__GNUC__) -#define ALIGNAS(byte_alignment) __attribute__((aligned(byte_alignment))) -#endif - -float cblas_snrm2(const int N, const float *m1, const int incX) -{ - if (N % 4 != 0) { - fprintf(stderr, "cblas_snrm2() expects N to be a multiple of 4.\n"); - exit(EXIT_FAILURE); - } - - float norm = 0; - ALIGNAS(16) float z[4]; - __m128 X; - __m128 Z = _mm_setzero_ps(); - - for (int i=0; i 1000 -#pragma once -#endif - -#if _MSC_VER >= 1600 // [ -#include -#else // ] _MSC_VER >= 1600 [ - -#include - -// For Visual Studio 6 in C++ mode and for many Visual Studio versions when -// compiling for ARM we should wrap include with 'extern "C++" {}' -// or compiler give many errors like this: -// error C2733: second C linkage of overloaded function 'wmemchr' not allowed -#ifdef __cplusplus -extern "C" { -#endif -# include -#ifdef __cplusplus -} -#endif - -// Define _W64 macros to mark types changing their size, like intptr_t. -#ifndef _W64 -# if !defined(__midl) && (defined(_X86_) || defined(_M_IX86)) && _MSC_VER >= 1300 -# define _W64 __w64 -# else -# define _W64 -# endif -#endif - - -// 7.18.1 Integer types - -// 7.18.1.1 Exact-width integer types - -// Visual Studio 6 and Embedded Visual C++ 4 doesn't -// realize that, e.g. char has the same size as __int8 -// so we give up on __intX for them. 
-#if (_MSC_VER < 1300) - typedef signed char int8_t; - typedef signed short int16_t; - typedef signed int int32_t; - typedef unsigned char uint8_t; - typedef unsigned short uint16_t; - typedef unsigned int uint32_t; -#else - typedef signed __int8 int8_t; - typedef signed __int16 int16_t; - typedef signed __int32 int32_t; - typedef unsigned __int8 uint8_t; - typedef unsigned __int16 uint16_t; - typedef unsigned __int32 uint32_t; -#endif -typedef signed __int64 int64_t; -typedef unsigned __int64 uint64_t; - - -// 7.18.1.2 Minimum-width integer types -typedef int8_t int_least8_t; -typedef int16_t int_least16_t; -typedef int32_t int_least32_t; -typedef int64_t int_least64_t; -typedef uint8_t uint_least8_t; -typedef uint16_t uint_least16_t; -typedef uint32_t uint_least32_t; -typedef uint64_t uint_least64_t; - -// 7.18.1.3 Fastest minimum-width integer types -typedef int8_t int_fast8_t; -typedef int16_t int_fast16_t; -typedef int32_t int_fast32_t; -typedef int64_t int_fast64_t; -typedef uint8_t uint_fast8_t; -typedef uint16_t uint_fast16_t; -typedef uint32_t uint_fast32_t; -typedef uint64_t uint_fast64_t; - -// 7.18.1.4 Integer types capable of holding object pointers -#ifdef _WIN64 // [ - typedef signed __int64 intptr_t; - typedef unsigned __int64 uintptr_t; -#else // _WIN64 ][ - typedef _W64 signed int intptr_t; - typedef _W64 unsigned int uintptr_t; -#endif // _WIN64 ] - -// 7.18.1.5 Greatest-width integer types -typedef int64_t intmax_t; -typedef uint64_t uintmax_t; - - -// 7.18.2 Limits of specified-width integer types - -#if !defined(__cplusplus) || defined(__STDC_LIMIT_MACROS) // [ See footnote 220 at page 257 and footnote 221 at page 259 - -// 7.18.2.1 Limits of exact-width integer types -#define INT8_MIN ((int8_t)_I8_MIN) -#define INT8_MAX _I8_MAX -#define INT16_MIN ((int16_t)_I16_MIN) -#define INT16_MAX _I16_MAX -#define INT32_MIN ((int32_t)_I32_MIN) -#define INT32_MAX _I32_MAX -#define INT64_MIN ((int64_t)_I64_MIN) -#define INT64_MAX _I64_MAX -#define UINT8_MAX _UI8_MAX -#define UINT16_MAX _UI16_MAX -#define UINT32_MAX _UI32_MAX -#define UINT64_MAX _UI64_MAX - -// 7.18.2.2 Limits of minimum-width integer types -#define INT_LEAST8_MIN INT8_MIN -#define INT_LEAST8_MAX INT8_MAX -#define INT_LEAST16_MIN INT16_MIN -#define INT_LEAST16_MAX INT16_MAX -#define INT_LEAST32_MIN INT32_MIN -#define INT_LEAST32_MAX INT32_MAX -#define INT_LEAST64_MIN INT64_MIN -#define INT_LEAST64_MAX INT64_MAX -#define UINT_LEAST8_MAX UINT8_MAX -#define UINT_LEAST16_MAX UINT16_MAX -#define UINT_LEAST32_MAX UINT32_MAX -#define UINT_LEAST64_MAX UINT64_MAX - -// 7.18.2.3 Limits of fastest minimum-width integer types -#define INT_FAST8_MIN INT8_MIN -#define INT_FAST8_MAX INT8_MAX -#define INT_FAST16_MIN INT16_MIN -#define INT_FAST16_MAX INT16_MAX -#define INT_FAST32_MIN INT32_MIN -#define INT_FAST32_MAX INT32_MAX -#define INT_FAST64_MIN INT64_MIN -#define INT_FAST64_MAX INT64_MAX -#define UINT_FAST8_MAX UINT8_MAX -#define UINT_FAST16_MAX UINT16_MAX -#define UINT_FAST32_MAX UINT32_MAX -#define UINT_FAST64_MAX UINT64_MAX - -// 7.18.2.4 Limits of integer types capable of holding object pointers -#ifdef _WIN64 // [ -# define INTPTR_MIN INT64_MIN -# define INTPTR_MAX INT64_MAX -# define UINTPTR_MAX UINT64_MAX -#else // _WIN64 ][ -# define INTPTR_MIN INT32_MIN -# define INTPTR_MAX INT32_MAX -# define UINTPTR_MAX UINT32_MAX -#endif // _WIN64 ] - -// 7.18.2.5 Limits of greatest-width integer types -#define INTMAX_MIN INT64_MIN -#define INTMAX_MAX INT64_MAX -#define UINTMAX_MAX UINT64_MAX - -// 7.18.3 Limits of other integer 
types - -#ifdef _WIN64 // [ -# define PTRDIFF_MIN _I64_MIN -# define PTRDIFF_MAX _I64_MAX -#else // _WIN64 ][ -# define PTRDIFF_MIN _I32_MIN -# define PTRDIFF_MAX _I32_MAX -#endif // _WIN64 ] - -#define SIG_ATOMIC_MIN INT_MIN -#define SIG_ATOMIC_MAX INT_MAX - -#ifndef SIZE_MAX // [ -# ifdef _WIN64 // [ -# define SIZE_MAX _UI64_MAX -# else // _WIN64 ][ -# define SIZE_MAX _UI32_MAX -# endif // _WIN64 ] -#endif // SIZE_MAX ] - -// WCHAR_MIN and WCHAR_MAX are also defined in -#ifndef WCHAR_MIN // [ -# define WCHAR_MIN 0 -#endif // WCHAR_MIN ] -#ifndef WCHAR_MAX // [ -# define WCHAR_MAX _UI16_MAX -#endif // WCHAR_MAX ] - -#define WINT_MIN 0 -#define WINT_MAX _UI16_MAX - -#endif // __STDC_LIMIT_MACROS ] - - -// 7.18.4 Limits of other integer types - -#if !defined(__cplusplus) || defined(__STDC_CONSTANT_MACROS) // [ See footnote 224 at page 260 - -// 7.18.4.1 Macros for minimum-width integer constants - -#define INT8_C(val) val##i8 -#define INT16_C(val) val##i16 -#define INT32_C(val) val##i32 -#define INT64_C(val) val##i64 - -#define UINT8_C(val) val##ui8 -#define UINT16_C(val) val##ui16 -#define UINT32_C(val) val##ui32 -#define UINT64_C(val) val##ui64 - -// 7.18.4.2 Macros for greatest-width integer constants -// These #ifndef's are needed to prevent collisions with . -// Check out Issue 9 for the details. -#ifndef INTMAX_C // [ -# define INTMAX_C INT64_C -#endif // INTMAX_C ] -#ifndef UINTMAX_C // [ -# define UINTMAX_C UINT64_C -#endif // UINTMAX_C ] - -#endif // __STDC_CONSTANT_MACROS ] - -#endif // _MSC_VER >= 1600 ] - -#endif // _MSC_STDINT_H_ ] diff --git a/include/murmurhash/MurmurHash2.h b/include/murmurhash/MurmurHash2.h deleted file mode 100644 index 6d7ccf4..0000000 --- a/include/murmurhash/MurmurHash2.h +++ /dev/null @@ -1,22 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash2 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. - -#ifndef _MURMURHASH2_H_ -#define _MURMURHASH2_H_ - -#include - -//----------------------------------------------------------------------------- - -uint32_t MurmurHash2 ( const void * key, int len, uint32_t seed ); -uint64_t MurmurHash64A ( const void * key, int len, uint64_t seed ); -uint64_t MurmurHash64B ( const void * key, int len, uint64_t seed ); -uint32_t MurmurHash2A ( const void * key, int len, uint32_t seed ); -uint32_t MurmurHashNeutral2 ( const void * key, int len, uint32_t seed ); -uint32_t MurmurHashAligned2 ( const void * key, int len, uint32_t seed ); - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH2_H_ - diff --git a/include/murmurhash/MurmurHash3.h b/include/murmurhash/MurmurHash3.h deleted file mode 100644 index 9b4c3c9..0000000 --- a/include/murmurhash/MurmurHash3.h +++ /dev/null @@ -1,28 +0,0 @@ -//----------------------------------------------------------------------------- -// MurmurHash3 was written by Austin Appleby, and is placed in the public -// domain. The author hereby disclaims copyright to this source code. 
- -#ifndef _MURMURHASH3_H_ -#define _MURMURHASH3_H_ - -#include - -//----------------------------------------------------------------------------- -#ifdef __cplusplus -extern "C" { -#endif - - -void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); - -void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); - -#ifdef __cplusplus -} -#endif - -//----------------------------------------------------------------------------- - -#endif // _MURMURHASH3_H_ diff --git a/include/numpy/__multiarray_api.h b/include/numpy/__multiarray_api.h deleted file mode 100644 index c949d73..0000000 --- a/include/numpy/__multiarray_api.h +++ /dev/null @@ -1,1686 +0,0 @@ - -#ifdef _MULTIARRAYMODULE - -typedef struct { - PyObject_HEAD - npy_bool obval; -} PyBoolScalarObject; - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION -extern NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type; -extern NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type; -extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#else -NPY_NO_EXPORT PyTypeObject PyArrayMapIter_Type; -NPY_NO_EXPORT PyTypeObject PyArrayNeighborhoodIter_Type; -NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#endif - -NPY_NO_EXPORT unsigned int PyArray_GetNDArrayCVersion \ - (void); -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyBigArray_Type; -#else - NPY_NO_EXPORT PyTypeObject PyBigArray_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArray_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArray_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayDescr_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayDescr_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayFlags_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayFlags_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayIter_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayIter_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyArrayMultiIter_Type; -#else - NPY_NO_EXPORT PyTypeObject PyArrayMultiIter_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT int NPY_NUMUSERTYPES; -#else - NPY_NO_EXPORT int NPY_NUMUSERTYPES; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyBoolArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyBoolArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION -extern NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#else -NPY_NO_EXPORT PyBoolScalarObject _PyArrayScalar_BoolValues[2]; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyGenericArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyNumberArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyNumberArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyIntegerArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PySignedIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PySignedIntegerArrType_Type; -#endif - -#ifdef 
NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUnsignedIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUnsignedIntegerArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyInexactArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyInexactArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyFloatingArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyFloatingArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyComplexFloatingArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyComplexFloatingArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyFlexibleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyFlexibleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCharacterArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCharacterArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyByteArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyByteArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyShortArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyShortArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyIntArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyIntArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyLongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyLongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyLongLongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyLongLongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUByteArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUByteArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUShortArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUShortArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUIntArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUIntArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyULongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyULongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyULongLongArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyULongLongArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyFloatArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyFloatArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyDoubleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyLongDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyLongDoubleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCFloatArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCFloatArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCDoubleArrType_Type; -#endif - -#ifdef 
NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyCLongDoubleArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyCLongDoubleArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyObjectArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyStringArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyStringArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUnicodeArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUnicodeArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyVoidArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyVoidArrType_Type; -#endif - -NPY_NO_EXPORT int PyArray_SetNumericOps \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_GetNumericOps \ - (void); -NPY_NO_EXPORT int PyArray_INCREF \ - (PyArrayObject *); -NPY_NO_EXPORT int PyArray_XDECREF \ - (PyArrayObject *); -NPY_NO_EXPORT void PyArray_SetStringFunction \ - (PyObject *, int); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromType \ - (int); -NPY_NO_EXPORT PyObject * PyArray_TypeObjectFromType \ - (int); -NPY_NO_EXPORT char * PyArray_Zero \ - (PyArrayObject *); -NPY_NO_EXPORT char * PyArray_One \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_CastToType \ - (PyArrayObject *, PyArray_Descr *, int); -NPY_NO_EXPORT int PyArray_CastTo \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CastAnyTo \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CanCastSafely \ - (int, int); -NPY_NO_EXPORT npy_bool PyArray_CanCastTo \ - (PyArray_Descr *, PyArray_Descr *); -NPY_NO_EXPORT int PyArray_ObjectType \ - (PyObject *, int); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromObject \ - (PyObject *, PyArray_Descr *); -NPY_NO_EXPORT PyArrayObject ** PyArray_ConvertToCommonType \ - (PyObject *, int *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromScalar \ - (PyObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrFromTypeObject \ - (PyObject *); -NPY_NO_EXPORT npy_intp PyArray_Size \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Scalar \ - (void *, PyArray_Descr *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromScalar \ - (PyObject *, PyArray_Descr *); -NPY_NO_EXPORT void PyArray_ScalarAsCtype \ - (PyObject *, void *); -NPY_NO_EXPORT int PyArray_CastScalarToCtype \ - (PyObject *, void *, PyArray_Descr *); -NPY_NO_EXPORT int PyArray_CastScalarDirect \ - (PyObject *, PyArray_Descr *, void *, int); -NPY_NO_EXPORT PyObject * PyArray_ScalarFromObject \ - (PyObject *); -NPY_NO_EXPORT PyArray_VectorUnaryFunc * PyArray_GetCastFunc \ - (PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_FromDims \ - (int, int *, int); -NPY_NO_EXPORT PyObject * PyArray_FromDimsAndDataAndDescr \ - (int, int *, PyArray_Descr *, char *); -NPY_NO_EXPORT PyObject * PyArray_FromAny \ - (PyObject *, PyArray_Descr *, int, int, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_EnsureArray \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_EnsureAnyArray \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromFile \ - (FILE *, PyArray_Descr *, npy_intp, char *); -NPY_NO_EXPORT PyObject * PyArray_FromString \ - (char *, npy_intp, PyArray_Descr *, npy_intp, char *); -NPY_NO_EXPORT PyObject * PyArray_FromBuffer \ - (PyObject *, PyArray_Descr *, npy_intp, npy_intp); -NPY_NO_EXPORT PyObject * PyArray_FromIter \ - (PyObject *, PyArray_Descr *, npy_intp); 
-NPY_NO_EXPORT PyObject * PyArray_Return \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_GetField \ - (PyArrayObject *, PyArray_Descr *, int); -NPY_NO_EXPORT int PyArray_SetField \ - (PyArrayObject *, PyArray_Descr *, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Byteswap \ - (PyArrayObject *, npy_bool); -NPY_NO_EXPORT PyObject * PyArray_Resize \ - (PyArrayObject *, PyArray_Dims *, int, NPY_ORDER); -NPY_NO_EXPORT int PyArray_MoveInto \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CopyInto \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CopyAnyInto \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT int PyArray_CopyObject \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_NewCopy \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT PyObject * PyArray_ToList \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_ToString \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT int PyArray_ToFile \ - (PyArrayObject *, FILE *, char *, char *); -NPY_NO_EXPORT int PyArray_Dump \ - (PyObject *, PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Dumps \ - (PyObject *, int); -NPY_NO_EXPORT int PyArray_ValidType \ - (int); -NPY_NO_EXPORT void PyArray_UpdateFlags \ - (PyArrayObject *, int); -NPY_NO_EXPORT PyObject * PyArray_New \ - (PyTypeObject *, int, npy_intp *, int, npy_intp *, void *, int, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_NewFromDescr \ - (PyTypeObject *, PyArray_Descr *, int, npy_intp *, npy_intp *, void *, int, PyObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrNew \ - (PyArray_Descr *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrNewFromType \ - (int); -NPY_NO_EXPORT double PyArray_GetPriority \ - (PyObject *, double); -NPY_NO_EXPORT PyObject * PyArray_IterNew \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_MultiIterNew \ - (int, ...); -NPY_NO_EXPORT int PyArray_PyIntAsInt \ - (PyObject *); -NPY_NO_EXPORT npy_intp PyArray_PyIntAsIntp \ - (PyObject *); -NPY_NO_EXPORT int PyArray_Broadcast \ - (PyArrayMultiIterObject *); -NPY_NO_EXPORT void PyArray_FillObjectArray \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT int PyArray_FillWithScalar \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT npy_bool PyArray_CheckStrides \ - (int, int, npy_intp, npy_intp, npy_intp *, npy_intp *); -NPY_NO_EXPORT PyArray_Descr * PyArray_DescrNewByteorder \ - (PyArray_Descr *, char); -NPY_NO_EXPORT PyObject * PyArray_IterAllButAxis \ - (PyObject *, int *); -NPY_NO_EXPORT PyObject * PyArray_CheckFromAny \ - (PyObject *, PyArray_Descr *, int, int, int, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromArray \ - (PyArrayObject *, PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_FromInterface \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromStructInterface \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_FromArrayAttr \ - (PyObject *, PyArray_Descr *, PyObject *); -NPY_NO_EXPORT NPY_SCALARKIND PyArray_ScalarKind \ - (int, PyArrayObject **); -NPY_NO_EXPORT int PyArray_CanCoerceScalar \ - (int, int, NPY_SCALARKIND); -NPY_NO_EXPORT PyObject * PyArray_NewFlagsObject \ - (PyObject *); -NPY_NO_EXPORT npy_bool PyArray_CanCastScalar \ - (PyTypeObject *, PyTypeObject *); -NPY_NO_EXPORT int PyArray_CompareUCS4 \ - (npy_ucs4 *, npy_ucs4 *, size_t); -NPY_NO_EXPORT int PyArray_RemoveSmallest \ - (PyArrayMultiIterObject *); -NPY_NO_EXPORT int PyArray_ElementStrides \ - (PyObject *); -NPY_NO_EXPORT void PyArray_Item_INCREF \ - (char *, PyArray_Descr *); -NPY_NO_EXPORT void PyArray_Item_XDECREF \ - (char *, PyArray_Descr *); 
-NPY_NO_EXPORT PyObject * PyArray_FieldNames \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Transpose \ - (PyArrayObject *, PyArray_Dims *); -NPY_NO_EXPORT PyObject * PyArray_TakeFrom \ - (PyArrayObject *, PyObject *, int, PyArrayObject *, NPY_CLIPMODE); -NPY_NO_EXPORT PyObject * PyArray_PutTo \ - (PyArrayObject *, PyObject*, PyObject *, NPY_CLIPMODE); -NPY_NO_EXPORT PyObject * PyArray_PutMask \ - (PyArrayObject *, PyObject*, PyObject*); -NPY_NO_EXPORT PyObject * PyArray_Repeat \ - (PyArrayObject *, PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Choose \ - (PyArrayObject *, PyObject *, PyArrayObject *, NPY_CLIPMODE); -NPY_NO_EXPORT int PyArray_Sort \ - (PyArrayObject *, int, NPY_SORTKIND); -NPY_NO_EXPORT PyObject * PyArray_ArgSort \ - (PyArrayObject *, int, NPY_SORTKIND); -NPY_NO_EXPORT PyObject * PyArray_SearchSorted \ - (PyArrayObject *, PyObject *, NPY_SEARCHSIDE, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_ArgMax \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_ArgMin \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Reshape \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Newshape \ - (PyArrayObject *, PyArray_Dims *, NPY_ORDER); -NPY_NO_EXPORT PyObject * PyArray_Squeeze \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_View \ - (PyArrayObject *, PyArray_Descr *, PyTypeObject *); -NPY_NO_EXPORT PyObject * PyArray_SwapAxes \ - (PyArrayObject *, int, int); -NPY_NO_EXPORT PyObject * PyArray_Max \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Min \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Ptp \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Mean \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Trace \ - (PyArrayObject *, int, int, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Diagonal \ - (PyArrayObject *, int, int, int); -NPY_NO_EXPORT PyObject * PyArray_Clip \ - (PyArrayObject *, PyObject *, PyObject *, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Conjugate \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Nonzero \ - (PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Std \ - (PyArrayObject *, int, int, PyArrayObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Sum \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_CumSum \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Prod \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_CumProd \ - (PyArrayObject *, int, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_All \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Any \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Compress \ - (PyArrayObject *, PyObject *, int, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_Flatten \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT PyObject * PyArray_Ravel \ - (PyArrayObject *, NPY_ORDER); -NPY_NO_EXPORT npy_intp PyArray_MultiplyList \ - (npy_intp *, int); -NPY_NO_EXPORT int PyArray_MultiplyIntList \ - (int *, int); -NPY_NO_EXPORT void * PyArray_GetPtr \ - (PyArrayObject *, npy_intp*); -NPY_NO_EXPORT int PyArray_CompareLists \ - (npy_intp *, npy_intp *, int); -NPY_NO_EXPORT int PyArray_AsCArray \ - (PyObject **, void *, npy_intp *, int, PyArray_Descr*); -NPY_NO_EXPORT int PyArray_As1D \ - (PyObject **, char **, int 
*, int); -NPY_NO_EXPORT int PyArray_As2D \ - (PyObject **, char ***, int *, int *, int); -NPY_NO_EXPORT int PyArray_Free \ - (PyObject *, void *); -NPY_NO_EXPORT int PyArray_Converter \ - (PyObject *, PyObject **); -NPY_NO_EXPORT int PyArray_IntpFromSequence \ - (PyObject *, npy_intp *, int); -NPY_NO_EXPORT PyObject * PyArray_Concatenate \ - (PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_InnerProduct \ - (PyObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_MatrixProduct \ - (PyObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_CopyAndTranspose \ - (PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Correlate \ - (PyObject *, PyObject *, int); -NPY_NO_EXPORT int PyArray_TypestrConvert \ - (int, int); -NPY_NO_EXPORT int PyArray_DescrConverter \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_DescrConverter2 \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_IntpConverter \ - (PyObject *, PyArray_Dims *); -NPY_NO_EXPORT int PyArray_BufferConverter \ - (PyObject *, PyArray_Chunk *); -NPY_NO_EXPORT int PyArray_AxisConverter \ - (PyObject *, int *); -NPY_NO_EXPORT int PyArray_BoolConverter \ - (PyObject *, npy_bool *); -NPY_NO_EXPORT int PyArray_ByteorderConverter \ - (PyObject *, char *); -NPY_NO_EXPORT int PyArray_OrderConverter \ - (PyObject *, NPY_ORDER *); -NPY_NO_EXPORT unsigned char PyArray_EquivTypes \ - (PyArray_Descr *, PyArray_Descr *); -NPY_NO_EXPORT PyObject * PyArray_Zeros \ - (int, npy_intp *, PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_Empty \ - (int, npy_intp *, PyArray_Descr *, int); -NPY_NO_EXPORT PyObject * PyArray_Where \ - (PyObject *, PyObject *, PyObject *); -NPY_NO_EXPORT PyObject * PyArray_Arange \ - (double, double, double, int); -NPY_NO_EXPORT PyObject * PyArray_ArangeObj \ - (PyObject *, PyObject *, PyObject *, PyArray_Descr *); -NPY_NO_EXPORT int PyArray_SortkindConverter \ - (PyObject *, NPY_SORTKIND *); -NPY_NO_EXPORT PyObject * PyArray_LexSort \ - (PyObject *, int); -NPY_NO_EXPORT PyObject * PyArray_Round \ - (PyArrayObject *, int, PyArrayObject *); -NPY_NO_EXPORT unsigned char PyArray_EquivTypenums \ - (int, int); -NPY_NO_EXPORT int PyArray_RegisterDataType \ - (PyArray_Descr *); -NPY_NO_EXPORT int PyArray_RegisterCastFunc \ - (PyArray_Descr *, int, PyArray_VectorUnaryFunc *); -NPY_NO_EXPORT int PyArray_RegisterCanCast \ - (PyArray_Descr *, int, NPY_SCALARKIND); -NPY_NO_EXPORT void PyArray_InitArrFuncs \ - (PyArray_ArrFuncs *); -NPY_NO_EXPORT PyObject * PyArray_IntTupleFromIntp \ - (int, npy_intp *); -NPY_NO_EXPORT int PyArray_TypeNumFromName \ - (char *); -NPY_NO_EXPORT int PyArray_ClipmodeConverter \ - (PyObject *, NPY_CLIPMODE *); -NPY_NO_EXPORT int PyArray_OutputConverter \ - (PyObject *, PyArrayObject **); -NPY_NO_EXPORT PyObject * PyArray_BroadcastToShape \ - (PyObject *, npy_intp *, int); -NPY_NO_EXPORT void _PyArray_SigintHandler \ - (int); -NPY_NO_EXPORT void* _PyArray_GetSigintBuf \ - (void); -NPY_NO_EXPORT int PyArray_DescrAlignConverter \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_DescrAlignConverter2 \ - (PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyArray_SearchsideConverter \ - (PyObject *, void *); -NPY_NO_EXPORT PyObject * PyArray_CheckAxis \ - (PyArrayObject *, int *, int); -NPY_NO_EXPORT npy_intp PyArray_OverflowMultiplyList \ - (npy_intp *, int); -NPY_NO_EXPORT int PyArray_CompareString \ - (char *, char *, size_t); -NPY_NO_EXPORT PyObject * PyArray_MultiIterFromObjects \ - (PyObject **, int, int, ...); -NPY_NO_EXPORT int PyArray_GetEndianness \ - (void); 
-NPY_NO_EXPORT unsigned int PyArray_GetNDArrayCFeatureVersion \ - (void); -NPY_NO_EXPORT PyObject * PyArray_Correlate2 \ - (PyObject *, PyObject *, int); -NPY_NO_EXPORT PyObject* PyArray_NeighborhoodIterNew \ - (PyArrayIterObject *, npy_intp *, int, PyArrayObject*); -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyTimeIntegerArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyDatetimeArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyDatetimeArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyTimedeltaArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyTimedeltaArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyHalfArrType_Type; -#else - NPY_NO_EXPORT PyTypeObject PyHalfArrType_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject NpyIter_Type; -#else - NPY_NO_EXPORT PyTypeObject NpyIter_Type; -#endif - -NPY_NO_EXPORT void PyArray_SetDatetimeParseFunction \ - (PyObject *); -NPY_NO_EXPORT void PyArray_DatetimeToDatetimeStruct \ - (npy_datetime, NPY_DATETIMEUNIT, npy_datetimestruct *); -NPY_NO_EXPORT void PyArray_TimedeltaToTimedeltaStruct \ - (npy_timedelta, NPY_DATETIMEUNIT, npy_timedeltastruct *); -NPY_NO_EXPORT npy_datetime PyArray_DatetimeStructToDatetime \ - (NPY_DATETIMEUNIT, npy_datetimestruct *); -NPY_NO_EXPORT npy_datetime PyArray_TimedeltaStructToTimedelta \ - (NPY_DATETIMEUNIT, npy_timedeltastruct *); -NPY_NO_EXPORT NpyIter * NpyIter_New \ - (PyArrayObject *, npy_uint32, NPY_ORDER, NPY_CASTING, PyArray_Descr*); -NPY_NO_EXPORT NpyIter * NpyIter_MultiNew \ - (int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **); -NPY_NO_EXPORT NpyIter * NpyIter_AdvancedNew \ - (int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **, int, int **, npy_intp *, npy_intp); -NPY_NO_EXPORT NpyIter * NpyIter_Copy \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_Deallocate \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_HasDelayedBufAlloc \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_HasExternalLoop \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_EnableExternalLoop \ - (NpyIter *); -NPY_NO_EXPORT npy_intp * NpyIter_GetInnerStrideArray \ - (NpyIter *); -NPY_NO_EXPORT npy_intp * NpyIter_GetInnerLoopSizePtr \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_Reset \ - (NpyIter *, char **); -NPY_NO_EXPORT int NpyIter_ResetBasePointers \ - (NpyIter *, char **, char **); -NPY_NO_EXPORT int NpyIter_ResetToIterIndexRange \ - (NpyIter *, npy_intp, npy_intp, char **); -NPY_NO_EXPORT int NpyIter_GetNDim \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GetNOp \ - (NpyIter *); -NPY_NO_EXPORT NpyIter_IterNextFunc * NpyIter_GetIterNext \ - (NpyIter *, char **); -NPY_NO_EXPORT npy_intp NpyIter_GetIterSize \ - (NpyIter *); -NPY_NO_EXPORT void NpyIter_GetIterIndexRange \ - (NpyIter *, npy_intp *, npy_intp *); -NPY_NO_EXPORT npy_intp NpyIter_GetIterIndex \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GotoIterIndex \ - (NpyIter *, npy_intp); -NPY_NO_EXPORT npy_bool NpyIter_HasMultiIndex \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GetShape \ - (NpyIter *, npy_intp *); -NPY_NO_EXPORT NpyIter_GetMultiIndexFunc * NpyIter_GetGetMultiIndex \ - (NpyIter *, char **); -NPY_NO_EXPORT int NpyIter_GotoMultiIndex \ - (NpyIter *, npy_intp *); -NPY_NO_EXPORT int NpyIter_RemoveMultiIndex \ - (NpyIter *); 
-NPY_NO_EXPORT npy_bool NpyIter_HasIndex \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_IsBuffered \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_IsGrowInner \ - (NpyIter *); -NPY_NO_EXPORT npy_intp NpyIter_GetBufferSize \ - (NpyIter *); -NPY_NO_EXPORT npy_intp * NpyIter_GetIndexPtr \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_GotoIndex \ - (NpyIter *, npy_intp); -NPY_NO_EXPORT char ** NpyIter_GetDataPtrArray \ - (NpyIter *); -NPY_NO_EXPORT PyArray_Descr ** NpyIter_GetDescrArray \ - (NpyIter *); -NPY_NO_EXPORT PyArrayObject ** NpyIter_GetOperandArray \ - (NpyIter *); -NPY_NO_EXPORT PyArrayObject * NpyIter_GetIterView \ - (NpyIter *, npy_intp); -NPY_NO_EXPORT void NpyIter_GetReadFlags \ - (NpyIter *, char *); -NPY_NO_EXPORT void NpyIter_GetWriteFlags \ - (NpyIter *, char *); -NPY_NO_EXPORT void NpyIter_DebugPrint \ - (NpyIter *); -NPY_NO_EXPORT npy_bool NpyIter_IterationNeedsAPI \ - (NpyIter *); -NPY_NO_EXPORT void NpyIter_GetInnerFixedStrideArray \ - (NpyIter *, npy_intp *); -NPY_NO_EXPORT int NpyIter_RemoveAxis \ - (NpyIter *, int); -NPY_NO_EXPORT npy_intp * NpyIter_GetAxisStrideArray \ - (NpyIter *, int); -NPY_NO_EXPORT npy_bool NpyIter_RequiresBuffering \ - (NpyIter *); -NPY_NO_EXPORT char ** NpyIter_GetInitialDataPtrArray \ - (NpyIter *); -NPY_NO_EXPORT int NpyIter_CreateCompatibleStrides \ - (NpyIter *, npy_intp, npy_intp *); -NPY_NO_EXPORT int PyArray_CastingConverter \ - (PyObject *, NPY_CASTING *); -NPY_NO_EXPORT npy_intp PyArray_CountNonzero \ - (PyArrayObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_PromoteTypes \ - (PyArray_Descr *, PyArray_Descr *); -NPY_NO_EXPORT PyArray_Descr * PyArray_MinScalarType \ - (PyArrayObject *); -NPY_NO_EXPORT PyArray_Descr * PyArray_ResultType \ - (npy_intp, PyArrayObject **, npy_intp, PyArray_Descr **); -NPY_NO_EXPORT npy_bool PyArray_CanCastArrayTo \ - (PyArrayObject *, PyArray_Descr *, NPY_CASTING); -NPY_NO_EXPORT npy_bool PyArray_CanCastTypeTo \ - (PyArray_Descr *, PyArray_Descr *, NPY_CASTING); -NPY_NO_EXPORT PyArrayObject * PyArray_EinsteinSum \ - (char *, npy_intp, PyArrayObject **, PyArray_Descr *, NPY_ORDER, NPY_CASTING, PyArrayObject *); -NPY_NO_EXPORT PyObject * PyArray_NewLikeArray \ - (PyArrayObject *, NPY_ORDER, PyArray_Descr *, int); -NPY_NO_EXPORT int PyArray_GetArrayParamsFromObject \ - (PyObject *, PyArray_Descr *, npy_bool, PyArray_Descr **, int *, npy_intp *, PyArrayObject **, PyObject *); -NPY_NO_EXPORT int PyArray_ConvertClipmodeSequence \ - (PyObject *, NPY_CLIPMODE *, int); -NPY_NO_EXPORT PyObject * PyArray_MatrixProduct2 \ - (PyObject *, PyObject *, PyArrayObject*); -NPY_NO_EXPORT npy_bool NpyIter_IsFirstVisit \ - (NpyIter *, int); -NPY_NO_EXPORT int PyArray_SetBaseObject \ - (PyArrayObject *, PyObject *); -NPY_NO_EXPORT void PyArray_CreateSortedStridePerm \ - (int, npy_intp *, npy_stride_sort_item *); -NPY_NO_EXPORT void PyArray_RemoveAxesInPlace \ - (PyArrayObject *, npy_bool *); -NPY_NO_EXPORT void PyArray_DebugPrint \ - (PyArrayObject *); -NPY_NO_EXPORT int PyArray_FailUnlessWriteable \ - (PyArrayObject *, const char *); -NPY_NO_EXPORT int PyArray_SetUpdateIfCopyBase \ - (PyArrayObject *, PyArrayObject *); -NPY_NO_EXPORT void * PyDataMem_NEW \ - (size_t); -NPY_NO_EXPORT void PyDataMem_FREE \ - (void *); -NPY_NO_EXPORT void * PyDataMem_RENEW \ - (void *, size_t); -NPY_NO_EXPORT PyDataMem_EventHookFunc * PyDataMem_SetEventHook \ - (PyDataMem_EventHookFunc *, void *, void **); -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT NPY_CASTING NPY_DEFAULT_ASSIGN_CASTING; -#else - NPY_NO_EXPORT NPY_CASTING 
NPY_DEFAULT_ASSIGN_CASTING; -#endif - - -#else - -#if defined(PY_ARRAY_UNIQUE_SYMBOL) -#define PyArray_API PY_ARRAY_UNIQUE_SYMBOL -#endif - -#if defined(NO_IMPORT) || defined(NO_IMPORT_ARRAY) -extern void **PyArray_API; -#else -#if defined(PY_ARRAY_UNIQUE_SYMBOL) -void **PyArray_API; -#else -static void **PyArray_API=NULL; -#endif -#endif - -#define PyArray_GetNDArrayCVersion \ - (*(unsigned int (*)(void)) \ - PyArray_API[0]) -#define PyBigArray_Type (*(PyTypeObject *)PyArray_API[1]) -#define PyArray_Type (*(PyTypeObject *)PyArray_API[2]) -#define PyArrayDescr_Type (*(PyTypeObject *)PyArray_API[3]) -#define PyArrayFlags_Type (*(PyTypeObject *)PyArray_API[4]) -#define PyArrayIter_Type (*(PyTypeObject *)PyArray_API[5]) -#define PyArrayMultiIter_Type (*(PyTypeObject *)PyArray_API[6]) -#define NPY_NUMUSERTYPES (*(int *)PyArray_API[7]) -#define PyBoolArrType_Type (*(PyTypeObject *)PyArray_API[8]) -#define _PyArrayScalar_BoolValues ((PyBoolScalarObject *)PyArray_API[9]) -#define PyGenericArrType_Type (*(PyTypeObject *)PyArray_API[10]) -#define PyNumberArrType_Type (*(PyTypeObject *)PyArray_API[11]) -#define PyIntegerArrType_Type (*(PyTypeObject *)PyArray_API[12]) -#define PySignedIntegerArrType_Type (*(PyTypeObject *)PyArray_API[13]) -#define PyUnsignedIntegerArrType_Type (*(PyTypeObject *)PyArray_API[14]) -#define PyInexactArrType_Type (*(PyTypeObject *)PyArray_API[15]) -#define PyFloatingArrType_Type (*(PyTypeObject *)PyArray_API[16]) -#define PyComplexFloatingArrType_Type (*(PyTypeObject *)PyArray_API[17]) -#define PyFlexibleArrType_Type (*(PyTypeObject *)PyArray_API[18]) -#define PyCharacterArrType_Type (*(PyTypeObject *)PyArray_API[19]) -#define PyByteArrType_Type (*(PyTypeObject *)PyArray_API[20]) -#define PyShortArrType_Type (*(PyTypeObject *)PyArray_API[21]) -#define PyIntArrType_Type (*(PyTypeObject *)PyArray_API[22]) -#define PyLongArrType_Type (*(PyTypeObject *)PyArray_API[23]) -#define PyLongLongArrType_Type (*(PyTypeObject *)PyArray_API[24]) -#define PyUByteArrType_Type (*(PyTypeObject *)PyArray_API[25]) -#define PyUShortArrType_Type (*(PyTypeObject *)PyArray_API[26]) -#define PyUIntArrType_Type (*(PyTypeObject *)PyArray_API[27]) -#define PyULongArrType_Type (*(PyTypeObject *)PyArray_API[28]) -#define PyULongLongArrType_Type (*(PyTypeObject *)PyArray_API[29]) -#define PyFloatArrType_Type (*(PyTypeObject *)PyArray_API[30]) -#define PyDoubleArrType_Type (*(PyTypeObject *)PyArray_API[31]) -#define PyLongDoubleArrType_Type (*(PyTypeObject *)PyArray_API[32]) -#define PyCFloatArrType_Type (*(PyTypeObject *)PyArray_API[33]) -#define PyCDoubleArrType_Type (*(PyTypeObject *)PyArray_API[34]) -#define PyCLongDoubleArrType_Type (*(PyTypeObject *)PyArray_API[35]) -#define PyObjectArrType_Type (*(PyTypeObject *)PyArray_API[36]) -#define PyStringArrType_Type (*(PyTypeObject *)PyArray_API[37]) -#define PyUnicodeArrType_Type (*(PyTypeObject *)PyArray_API[38]) -#define PyVoidArrType_Type (*(PyTypeObject *)PyArray_API[39]) -#define PyArray_SetNumericOps \ - (*(int (*)(PyObject *)) \ - PyArray_API[40]) -#define PyArray_GetNumericOps \ - (*(PyObject * (*)(void)) \ - PyArray_API[41]) -#define PyArray_INCREF \ - (*(int (*)(PyArrayObject *)) \ - PyArray_API[42]) -#define PyArray_XDECREF \ - (*(int (*)(PyArrayObject *)) \ - PyArray_API[43]) -#define PyArray_SetStringFunction \ - (*(void (*)(PyObject *, int)) \ - PyArray_API[44]) -#define PyArray_DescrFromType \ - (*(PyArray_Descr * (*)(int)) \ - PyArray_API[45]) -#define PyArray_TypeObjectFromType \ - (*(PyObject * (*)(int)) \ - PyArray_API[46]) -#define 
PyArray_Zero \ - (*(char * (*)(PyArrayObject *)) \ - PyArray_API[47]) -#define PyArray_One \ - (*(char * (*)(PyArrayObject *)) \ - PyArray_API[48]) -#define PyArray_CastToType \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \ - PyArray_API[49]) -#define PyArray_CastTo \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[50]) -#define PyArray_CastAnyTo \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[51]) -#define PyArray_CanCastSafely \ - (*(int (*)(int, int)) \ - PyArray_API[52]) -#define PyArray_CanCastTo \ - (*(npy_bool (*)(PyArray_Descr *, PyArray_Descr *)) \ - PyArray_API[53]) -#define PyArray_ObjectType \ - (*(int (*)(PyObject *, int)) \ - PyArray_API[54]) -#define PyArray_DescrFromObject \ - (*(PyArray_Descr * (*)(PyObject *, PyArray_Descr *)) \ - PyArray_API[55]) -#define PyArray_ConvertToCommonType \ - (*(PyArrayObject ** (*)(PyObject *, int *)) \ - PyArray_API[56]) -#define PyArray_DescrFromScalar \ - (*(PyArray_Descr * (*)(PyObject *)) \ - PyArray_API[57]) -#define PyArray_DescrFromTypeObject \ - (*(PyArray_Descr * (*)(PyObject *)) \ - PyArray_API[58]) -#define PyArray_Size \ - (*(npy_intp (*)(PyObject *)) \ - PyArray_API[59]) -#define PyArray_Scalar \ - (*(PyObject * (*)(void *, PyArray_Descr *, PyObject *)) \ - PyArray_API[60]) -#define PyArray_FromScalar \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *)) \ - PyArray_API[61]) -#define PyArray_ScalarAsCtype \ - (*(void (*)(PyObject *, void *)) \ - PyArray_API[62]) -#define PyArray_CastScalarToCtype \ - (*(int (*)(PyObject *, void *, PyArray_Descr *)) \ - PyArray_API[63]) -#define PyArray_CastScalarDirect \ - (*(int (*)(PyObject *, PyArray_Descr *, void *, int)) \ - PyArray_API[64]) -#define PyArray_ScalarFromObject \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[65]) -#define PyArray_GetCastFunc \ - (*(PyArray_VectorUnaryFunc * (*)(PyArray_Descr *, int)) \ - PyArray_API[66]) -#define PyArray_FromDims \ - (*(PyObject * (*)(int, int *, int)) \ - PyArray_API[67]) -#define PyArray_FromDimsAndDataAndDescr \ - (*(PyObject * (*)(int, int *, PyArray_Descr *, char *)) \ - PyArray_API[68]) -#define PyArray_FromAny \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int, PyObject *)) \ - PyArray_API[69]) -#define PyArray_EnsureArray \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[70]) -#define PyArray_EnsureAnyArray \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[71]) -#define PyArray_FromFile \ - (*(PyObject * (*)(FILE *, PyArray_Descr *, npy_intp, char *)) \ - PyArray_API[72]) -#define PyArray_FromString \ - (*(PyObject * (*)(char *, npy_intp, PyArray_Descr *, npy_intp, char *)) \ - PyArray_API[73]) -#define PyArray_FromBuffer \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, npy_intp, npy_intp)) \ - PyArray_API[74]) -#define PyArray_FromIter \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, npy_intp)) \ - PyArray_API[75]) -#define PyArray_Return \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[76]) -#define PyArray_GetField \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \ - PyArray_API[77]) -#define PyArray_SetField \ - (*(int (*)(PyArrayObject *, PyArray_Descr *, int, PyObject *)) \ - PyArray_API[78]) -#define PyArray_Byteswap \ - (*(PyObject * (*)(PyArrayObject *, npy_bool)) \ - PyArray_API[79]) -#define PyArray_Resize \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *, int, NPY_ORDER)) \ - PyArray_API[80]) -#define PyArray_MoveInto \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[81]) -#define PyArray_CopyInto \ - (*(int 
(*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[82]) -#define PyArray_CopyAnyInto \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[83]) -#define PyArray_CopyObject \ - (*(int (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[84]) -#define PyArray_NewCopy \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[85]) -#define PyArray_ToList \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[86]) -#define PyArray_ToString \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[87]) -#define PyArray_ToFile \ - (*(int (*)(PyArrayObject *, FILE *, char *, char *)) \ - PyArray_API[88]) -#define PyArray_Dump \ - (*(int (*)(PyObject *, PyObject *, int)) \ - PyArray_API[89]) -#define PyArray_Dumps \ - (*(PyObject * (*)(PyObject *, int)) \ - PyArray_API[90]) -#define PyArray_ValidType \ - (*(int (*)(int)) \ - PyArray_API[91]) -#define PyArray_UpdateFlags \ - (*(void (*)(PyArrayObject *, int)) \ - PyArray_API[92]) -#define PyArray_New \ - (*(PyObject * (*)(PyTypeObject *, int, npy_intp *, int, npy_intp *, void *, int, int, PyObject *)) \ - PyArray_API[93]) -#define PyArray_NewFromDescr \ - (*(PyObject * (*)(PyTypeObject *, PyArray_Descr *, int, npy_intp *, npy_intp *, void *, int, PyObject *)) \ - PyArray_API[94]) -#define PyArray_DescrNew \ - (*(PyArray_Descr * (*)(PyArray_Descr *)) \ - PyArray_API[95]) -#define PyArray_DescrNewFromType \ - (*(PyArray_Descr * (*)(int)) \ - PyArray_API[96]) -#define PyArray_GetPriority \ - (*(double (*)(PyObject *, double)) \ - PyArray_API[97]) -#define PyArray_IterNew \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[98]) -#define PyArray_MultiIterNew \ - (*(PyObject * (*)(int, ...)) \ - PyArray_API[99]) -#define PyArray_PyIntAsInt \ - (*(int (*)(PyObject *)) \ - PyArray_API[100]) -#define PyArray_PyIntAsIntp \ - (*(npy_intp (*)(PyObject *)) \ - PyArray_API[101]) -#define PyArray_Broadcast \ - (*(int (*)(PyArrayMultiIterObject *)) \ - PyArray_API[102]) -#define PyArray_FillObjectArray \ - (*(void (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[103]) -#define PyArray_FillWithScalar \ - (*(int (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[104]) -#define PyArray_CheckStrides \ - (*(npy_bool (*)(int, int, npy_intp, npy_intp, npy_intp *, npy_intp *)) \ - PyArray_API[105]) -#define PyArray_DescrNewByteorder \ - (*(PyArray_Descr * (*)(PyArray_Descr *, char)) \ - PyArray_API[106]) -#define PyArray_IterAllButAxis \ - (*(PyObject * (*)(PyObject *, int *)) \ - PyArray_API[107]) -#define PyArray_CheckFromAny \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, int, int, int, PyObject *)) \ - PyArray_API[108]) -#define PyArray_FromArray \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, int)) \ - PyArray_API[109]) -#define PyArray_FromInterface \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[110]) -#define PyArray_FromStructInterface \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[111]) -#define PyArray_FromArrayAttr \ - (*(PyObject * (*)(PyObject *, PyArray_Descr *, PyObject *)) \ - PyArray_API[112]) -#define PyArray_ScalarKind \ - (*(NPY_SCALARKIND (*)(int, PyArrayObject **)) \ - PyArray_API[113]) -#define PyArray_CanCoerceScalar \ - (*(int (*)(int, int, NPY_SCALARKIND)) \ - PyArray_API[114]) -#define PyArray_NewFlagsObject \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[115]) -#define PyArray_CanCastScalar \ - (*(npy_bool (*)(PyTypeObject *, PyTypeObject *)) \ - PyArray_API[116]) -#define PyArray_CompareUCS4 \ - (*(int (*)(npy_ucs4 *, npy_ucs4 *, size_t)) \ - PyArray_API[117]) -#define PyArray_RemoveSmallest 
\ - (*(int (*)(PyArrayMultiIterObject *)) \ - PyArray_API[118]) -#define PyArray_ElementStrides \ - (*(int (*)(PyObject *)) \ - PyArray_API[119]) -#define PyArray_Item_INCREF \ - (*(void (*)(char *, PyArray_Descr *)) \ - PyArray_API[120]) -#define PyArray_Item_XDECREF \ - (*(void (*)(char *, PyArray_Descr *)) \ - PyArray_API[121]) -#define PyArray_FieldNames \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[122]) -#define PyArray_Transpose \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *)) \ - PyArray_API[123]) -#define PyArray_TakeFrom \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *, NPY_CLIPMODE)) \ - PyArray_API[124]) -#define PyArray_PutTo \ - (*(PyObject * (*)(PyArrayObject *, PyObject*, PyObject *, NPY_CLIPMODE)) \ - PyArray_API[125]) -#define PyArray_PutMask \ - (*(PyObject * (*)(PyArrayObject *, PyObject*, PyObject*)) \ - PyArray_API[126]) -#define PyArray_Repeat \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, int)) \ - PyArray_API[127]) -#define PyArray_Choose \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, PyArrayObject *, NPY_CLIPMODE)) \ - PyArray_API[128]) -#define PyArray_Sort \ - (*(int (*)(PyArrayObject *, int, NPY_SORTKIND)) \ - PyArray_API[129]) -#define PyArray_ArgSort \ - (*(PyObject * (*)(PyArrayObject *, int, NPY_SORTKIND)) \ - PyArray_API[130]) -#define PyArray_SearchSorted \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, NPY_SEARCHSIDE, PyObject *)) \ - PyArray_API[131]) -#define PyArray_ArgMax \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[132]) -#define PyArray_ArgMin \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[133]) -#define PyArray_Reshape \ - (*(PyObject * (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[134]) -#define PyArray_Newshape \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Dims *, NPY_ORDER)) \ - PyArray_API[135]) -#define PyArray_Squeeze \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[136]) -#define PyArray_View \ - (*(PyObject * (*)(PyArrayObject *, PyArray_Descr *, PyTypeObject *)) \ - PyArray_API[137]) -#define PyArray_SwapAxes \ - (*(PyObject * (*)(PyArrayObject *, int, int)) \ - PyArray_API[138]) -#define PyArray_Max \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[139]) -#define PyArray_Min \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[140]) -#define PyArray_Ptp \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[141]) -#define PyArray_Mean \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[142]) -#define PyArray_Trace \ - (*(PyObject * (*)(PyArrayObject *, int, int, int, int, PyArrayObject *)) \ - PyArray_API[143]) -#define PyArray_Diagonal \ - (*(PyObject * (*)(PyArrayObject *, int, int, int)) \ - PyArray_API[144]) -#define PyArray_Clip \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, PyObject *, PyArrayObject *)) \ - PyArray_API[145]) -#define PyArray_Conjugate \ - (*(PyObject * (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[146]) -#define PyArray_Nonzero \ - (*(PyObject * (*)(PyArrayObject *)) \ - PyArray_API[147]) -#define PyArray_Std \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *, int)) \ - PyArray_API[148]) -#define PyArray_Sum \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[149]) -#define PyArray_CumSum \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[150]) -#define PyArray_Prod \ - (*(PyObject * (*)(PyArrayObject 
*, int, int, PyArrayObject *)) \ - PyArray_API[151]) -#define PyArray_CumProd \ - (*(PyObject * (*)(PyArrayObject *, int, int, PyArrayObject *)) \ - PyArray_API[152]) -#define PyArray_All \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[153]) -#define PyArray_Any \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[154]) -#define PyArray_Compress \ - (*(PyObject * (*)(PyArrayObject *, PyObject *, int, PyArrayObject *)) \ - PyArray_API[155]) -#define PyArray_Flatten \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[156]) -#define PyArray_Ravel \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER)) \ - PyArray_API[157]) -#define PyArray_MultiplyList \ - (*(npy_intp (*)(npy_intp *, int)) \ - PyArray_API[158]) -#define PyArray_MultiplyIntList \ - (*(int (*)(int *, int)) \ - PyArray_API[159]) -#define PyArray_GetPtr \ - (*(void * (*)(PyArrayObject *, npy_intp*)) \ - PyArray_API[160]) -#define PyArray_CompareLists \ - (*(int (*)(npy_intp *, npy_intp *, int)) \ - PyArray_API[161]) -#define PyArray_AsCArray \ - (*(int (*)(PyObject **, void *, npy_intp *, int, PyArray_Descr*)) \ - PyArray_API[162]) -#define PyArray_As1D \ - (*(int (*)(PyObject **, char **, int *, int)) \ - PyArray_API[163]) -#define PyArray_As2D \ - (*(int (*)(PyObject **, char ***, int *, int *, int)) \ - PyArray_API[164]) -#define PyArray_Free \ - (*(int (*)(PyObject *, void *)) \ - PyArray_API[165]) -#define PyArray_Converter \ - (*(int (*)(PyObject *, PyObject **)) \ - PyArray_API[166]) -#define PyArray_IntpFromSequence \ - (*(int (*)(PyObject *, npy_intp *, int)) \ - PyArray_API[167]) -#define PyArray_Concatenate \ - (*(PyObject * (*)(PyObject *, int)) \ - PyArray_API[168]) -#define PyArray_InnerProduct \ - (*(PyObject * (*)(PyObject *, PyObject *)) \ - PyArray_API[169]) -#define PyArray_MatrixProduct \ - (*(PyObject * (*)(PyObject *, PyObject *)) \ - PyArray_API[170]) -#define PyArray_CopyAndTranspose \ - (*(PyObject * (*)(PyObject *)) \ - PyArray_API[171]) -#define PyArray_Correlate \ - (*(PyObject * (*)(PyObject *, PyObject *, int)) \ - PyArray_API[172]) -#define PyArray_TypestrConvert \ - (*(int (*)(int, int)) \ - PyArray_API[173]) -#define PyArray_DescrConverter \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[174]) -#define PyArray_DescrConverter2 \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[175]) -#define PyArray_IntpConverter \ - (*(int (*)(PyObject *, PyArray_Dims *)) \ - PyArray_API[176]) -#define PyArray_BufferConverter \ - (*(int (*)(PyObject *, PyArray_Chunk *)) \ - PyArray_API[177]) -#define PyArray_AxisConverter \ - (*(int (*)(PyObject *, int *)) \ - PyArray_API[178]) -#define PyArray_BoolConverter \ - (*(int (*)(PyObject *, npy_bool *)) \ - PyArray_API[179]) -#define PyArray_ByteorderConverter \ - (*(int (*)(PyObject *, char *)) \ - PyArray_API[180]) -#define PyArray_OrderConverter \ - (*(int (*)(PyObject *, NPY_ORDER *)) \ - PyArray_API[181]) -#define PyArray_EquivTypes \ - (*(unsigned char (*)(PyArray_Descr *, PyArray_Descr *)) \ - PyArray_API[182]) -#define PyArray_Zeros \ - (*(PyObject * (*)(int, npy_intp *, PyArray_Descr *, int)) \ - PyArray_API[183]) -#define PyArray_Empty \ - (*(PyObject * (*)(int, npy_intp *, PyArray_Descr *, int)) \ - PyArray_API[184]) -#define PyArray_Where \ - (*(PyObject * (*)(PyObject *, PyObject *, PyObject *)) \ - PyArray_API[185]) -#define PyArray_Arange \ - (*(PyObject * (*)(double, double, double, int)) \ - PyArray_API[186]) -#define PyArray_ArangeObj \ - (*(PyObject * (*)(PyObject 
*, PyObject *, PyObject *, PyArray_Descr *)) \ - PyArray_API[187]) -#define PyArray_SortkindConverter \ - (*(int (*)(PyObject *, NPY_SORTKIND *)) \ - PyArray_API[188]) -#define PyArray_LexSort \ - (*(PyObject * (*)(PyObject *, int)) \ - PyArray_API[189]) -#define PyArray_Round \ - (*(PyObject * (*)(PyArrayObject *, int, PyArrayObject *)) \ - PyArray_API[190]) -#define PyArray_EquivTypenums \ - (*(unsigned char (*)(int, int)) \ - PyArray_API[191]) -#define PyArray_RegisterDataType \ - (*(int (*)(PyArray_Descr *)) \ - PyArray_API[192]) -#define PyArray_RegisterCastFunc \ - (*(int (*)(PyArray_Descr *, int, PyArray_VectorUnaryFunc *)) \ - PyArray_API[193]) -#define PyArray_RegisterCanCast \ - (*(int (*)(PyArray_Descr *, int, NPY_SCALARKIND)) \ - PyArray_API[194]) -#define PyArray_InitArrFuncs \ - (*(void (*)(PyArray_ArrFuncs *)) \ - PyArray_API[195]) -#define PyArray_IntTupleFromIntp \ - (*(PyObject * (*)(int, npy_intp *)) \ - PyArray_API[196]) -#define PyArray_TypeNumFromName \ - (*(int (*)(char *)) \ - PyArray_API[197]) -#define PyArray_ClipmodeConverter \ - (*(int (*)(PyObject *, NPY_CLIPMODE *)) \ - PyArray_API[198]) -#define PyArray_OutputConverter \ - (*(int (*)(PyObject *, PyArrayObject **)) \ - PyArray_API[199]) -#define PyArray_BroadcastToShape \ - (*(PyObject * (*)(PyObject *, npy_intp *, int)) \ - PyArray_API[200]) -#define _PyArray_SigintHandler \ - (*(void (*)(int)) \ - PyArray_API[201]) -#define _PyArray_GetSigintBuf \ - (*(void* (*)(void)) \ - PyArray_API[202]) -#define PyArray_DescrAlignConverter \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[203]) -#define PyArray_DescrAlignConverter2 \ - (*(int (*)(PyObject *, PyArray_Descr **)) \ - PyArray_API[204]) -#define PyArray_SearchsideConverter \ - (*(int (*)(PyObject *, void *)) \ - PyArray_API[205]) -#define PyArray_CheckAxis \ - (*(PyObject * (*)(PyArrayObject *, int *, int)) \ - PyArray_API[206]) -#define PyArray_OverflowMultiplyList \ - (*(npy_intp (*)(npy_intp *, int)) \ - PyArray_API[207]) -#define PyArray_CompareString \ - (*(int (*)(char *, char *, size_t)) \ - PyArray_API[208]) -#define PyArray_MultiIterFromObjects \ - (*(PyObject * (*)(PyObject **, int, int, ...)) \ - PyArray_API[209]) -#define PyArray_GetEndianness \ - (*(int (*)(void)) \ - PyArray_API[210]) -#define PyArray_GetNDArrayCFeatureVersion \ - (*(unsigned int (*)(void)) \ - PyArray_API[211]) -#define PyArray_Correlate2 \ - (*(PyObject * (*)(PyObject *, PyObject *, int)) \ - PyArray_API[212]) -#define PyArray_NeighborhoodIterNew \ - (*(PyObject* (*)(PyArrayIterObject *, npy_intp *, int, PyArrayObject*)) \ - PyArray_API[213]) -#define PyTimeIntegerArrType_Type (*(PyTypeObject *)PyArray_API[214]) -#define PyDatetimeArrType_Type (*(PyTypeObject *)PyArray_API[215]) -#define PyTimedeltaArrType_Type (*(PyTypeObject *)PyArray_API[216]) -#define PyHalfArrType_Type (*(PyTypeObject *)PyArray_API[217]) -#define NpyIter_Type (*(PyTypeObject *)PyArray_API[218]) -#define PyArray_SetDatetimeParseFunction \ - (*(void (*)(PyObject *)) \ - PyArray_API[219]) -#define PyArray_DatetimeToDatetimeStruct \ - (*(void (*)(npy_datetime, NPY_DATETIMEUNIT, npy_datetimestruct *)) \ - PyArray_API[220]) -#define PyArray_TimedeltaToTimedeltaStruct \ - (*(void (*)(npy_timedelta, NPY_DATETIMEUNIT, npy_timedeltastruct *)) \ - PyArray_API[221]) -#define PyArray_DatetimeStructToDatetime \ - (*(npy_datetime (*)(NPY_DATETIMEUNIT, npy_datetimestruct *)) \ - PyArray_API[222]) -#define PyArray_TimedeltaStructToTimedelta \ - (*(npy_datetime (*)(NPY_DATETIMEUNIT, npy_timedeltastruct *)) \ - 
PyArray_API[223]) -#define NpyIter_New \ - (*(NpyIter * (*)(PyArrayObject *, npy_uint32, NPY_ORDER, NPY_CASTING, PyArray_Descr*)) \ - PyArray_API[224]) -#define NpyIter_MultiNew \ - (*(NpyIter * (*)(int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **)) \ - PyArray_API[225]) -#define NpyIter_AdvancedNew \ - (*(NpyIter * (*)(int, PyArrayObject **, npy_uint32, NPY_ORDER, NPY_CASTING, npy_uint32 *, PyArray_Descr **, int, int **, npy_intp *, npy_intp)) \ - PyArray_API[226]) -#define NpyIter_Copy \ - (*(NpyIter * (*)(NpyIter *)) \ - PyArray_API[227]) -#define NpyIter_Deallocate \ - (*(int (*)(NpyIter *)) \ - PyArray_API[228]) -#define NpyIter_HasDelayedBufAlloc \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[229]) -#define NpyIter_HasExternalLoop \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[230]) -#define NpyIter_EnableExternalLoop \ - (*(int (*)(NpyIter *)) \ - PyArray_API[231]) -#define NpyIter_GetInnerStrideArray \ - (*(npy_intp * (*)(NpyIter *)) \ - PyArray_API[232]) -#define NpyIter_GetInnerLoopSizePtr \ - (*(npy_intp * (*)(NpyIter *)) \ - PyArray_API[233]) -#define NpyIter_Reset \ - (*(int (*)(NpyIter *, char **)) \ - PyArray_API[234]) -#define NpyIter_ResetBasePointers \ - (*(int (*)(NpyIter *, char **, char **)) \ - PyArray_API[235]) -#define NpyIter_ResetToIterIndexRange \ - (*(int (*)(NpyIter *, npy_intp, npy_intp, char **)) \ - PyArray_API[236]) -#define NpyIter_GetNDim \ - (*(int (*)(NpyIter *)) \ - PyArray_API[237]) -#define NpyIter_GetNOp \ - (*(int (*)(NpyIter *)) \ - PyArray_API[238]) -#define NpyIter_GetIterNext \ - (*(NpyIter_IterNextFunc * (*)(NpyIter *, char **)) \ - PyArray_API[239]) -#define NpyIter_GetIterSize \ - (*(npy_intp (*)(NpyIter *)) \ - PyArray_API[240]) -#define NpyIter_GetIterIndexRange \ - (*(void (*)(NpyIter *, npy_intp *, npy_intp *)) \ - PyArray_API[241]) -#define NpyIter_GetIterIndex \ - (*(npy_intp (*)(NpyIter *)) \ - PyArray_API[242]) -#define NpyIter_GotoIterIndex \ - (*(int (*)(NpyIter *, npy_intp)) \ - PyArray_API[243]) -#define NpyIter_HasMultiIndex \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[244]) -#define NpyIter_GetShape \ - (*(int (*)(NpyIter *, npy_intp *)) \ - PyArray_API[245]) -#define NpyIter_GetGetMultiIndex \ - (*(NpyIter_GetMultiIndexFunc * (*)(NpyIter *, char **)) \ - PyArray_API[246]) -#define NpyIter_GotoMultiIndex \ - (*(int (*)(NpyIter *, npy_intp *)) \ - PyArray_API[247]) -#define NpyIter_RemoveMultiIndex \ - (*(int (*)(NpyIter *)) \ - PyArray_API[248]) -#define NpyIter_HasIndex \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[249]) -#define NpyIter_IsBuffered \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[250]) -#define NpyIter_IsGrowInner \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[251]) -#define NpyIter_GetBufferSize \ - (*(npy_intp (*)(NpyIter *)) \ - PyArray_API[252]) -#define NpyIter_GetIndexPtr \ - (*(npy_intp * (*)(NpyIter *)) \ - PyArray_API[253]) -#define NpyIter_GotoIndex \ - (*(int (*)(NpyIter *, npy_intp)) \ - PyArray_API[254]) -#define NpyIter_GetDataPtrArray \ - (*(char ** (*)(NpyIter *)) \ - PyArray_API[255]) -#define NpyIter_GetDescrArray \ - (*(PyArray_Descr ** (*)(NpyIter *)) \ - PyArray_API[256]) -#define NpyIter_GetOperandArray \ - (*(PyArrayObject ** (*)(NpyIter *)) \ - PyArray_API[257]) -#define NpyIter_GetIterView \ - (*(PyArrayObject * (*)(NpyIter *, npy_intp)) \ - PyArray_API[258]) -#define NpyIter_GetReadFlags \ - (*(void (*)(NpyIter *, char *)) \ - PyArray_API[259]) -#define NpyIter_GetWriteFlags \ - (*(void (*)(NpyIter *, char *)) \ - PyArray_API[260]) 
-#define NpyIter_DebugPrint \ - (*(void (*)(NpyIter *)) \ - PyArray_API[261]) -#define NpyIter_IterationNeedsAPI \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[262]) -#define NpyIter_GetInnerFixedStrideArray \ - (*(void (*)(NpyIter *, npy_intp *)) \ - PyArray_API[263]) -#define NpyIter_RemoveAxis \ - (*(int (*)(NpyIter *, int)) \ - PyArray_API[264]) -#define NpyIter_GetAxisStrideArray \ - (*(npy_intp * (*)(NpyIter *, int)) \ - PyArray_API[265]) -#define NpyIter_RequiresBuffering \ - (*(npy_bool (*)(NpyIter *)) \ - PyArray_API[266]) -#define NpyIter_GetInitialDataPtrArray \ - (*(char ** (*)(NpyIter *)) \ - PyArray_API[267]) -#define NpyIter_CreateCompatibleStrides \ - (*(int (*)(NpyIter *, npy_intp, npy_intp *)) \ - PyArray_API[268]) -#define PyArray_CastingConverter \ - (*(int (*)(PyObject *, NPY_CASTING *)) \ - PyArray_API[269]) -#define PyArray_CountNonzero \ - (*(npy_intp (*)(PyArrayObject *)) \ - PyArray_API[270]) -#define PyArray_PromoteTypes \ - (*(PyArray_Descr * (*)(PyArray_Descr *, PyArray_Descr *)) \ - PyArray_API[271]) -#define PyArray_MinScalarType \ - (*(PyArray_Descr * (*)(PyArrayObject *)) \ - PyArray_API[272]) -#define PyArray_ResultType \ - (*(PyArray_Descr * (*)(npy_intp, PyArrayObject **, npy_intp, PyArray_Descr **)) \ - PyArray_API[273]) -#define PyArray_CanCastArrayTo \ - (*(npy_bool (*)(PyArrayObject *, PyArray_Descr *, NPY_CASTING)) \ - PyArray_API[274]) -#define PyArray_CanCastTypeTo \ - (*(npy_bool (*)(PyArray_Descr *, PyArray_Descr *, NPY_CASTING)) \ - PyArray_API[275]) -#define PyArray_EinsteinSum \ - (*(PyArrayObject * (*)(char *, npy_intp, PyArrayObject **, PyArray_Descr *, NPY_ORDER, NPY_CASTING, PyArrayObject *)) \ - PyArray_API[276]) -#define PyArray_NewLikeArray \ - (*(PyObject * (*)(PyArrayObject *, NPY_ORDER, PyArray_Descr *, int)) \ - PyArray_API[277]) -#define PyArray_GetArrayParamsFromObject \ - (*(int (*)(PyObject *, PyArray_Descr *, npy_bool, PyArray_Descr **, int *, npy_intp *, PyArrayObject **, PyObject *)) \ - PyArray_API[278]) -#define PyArray_ConvertClipmodeSequence \ - (*(int (*)(PyObject *, NPY_CLIPMODE *, int)) \ - PyArray_API[279]) -#define PyArray_MatrixProduct2 \ - (*(PyObject * (*)(PyObject *, PyObject *, PyArrayObject*)) \ - PyArray_API[280]) -#define NpyIter_IsFirstVisit \ - (*(npy_bool (*)(NpyIter *, int)) \ - PyArray_API[281]) -#define PyArray_SetBaseObject \ - (*(int (*)(PyArrayObject *, PyObject *)) \ - PyArray_API[282]) -#define PyArray_CreateSortedStridePerm \ - (*(void (*)(int, npy_intp *, npy_stride_sort_item *)) \ - PyArray_API[283]) -#define PyArray_RemoveAxesInPlace \ - (*(void (*)(PyArrayObject *, npy_bool *)) \ - PyArray_API[284]) -#define PyArray_DebugPrint \ - (*(void (*)(PyArrayObject *)) \ - PyArray_API[285]) -#define PyArray_FailUnlessWriteable \ - (*(int (*)(PyArrayObject *, const char *)) \ - PyArray_API[286]) -#define PyArray_SetUpdateIfCopyBase \ - (*(int (*)(PyArrayObject *, PyArrayObject *)) \ - PyArray_API[287]) -#define PyDataMem_NEW \ - (*(void * (*)(size_t)) \ - PyArray_API[288]) -#define PyDataMem_FREE \ - (*(void (*)(void *)) \ - PyArray_API[289]) -#define PyDataMem_RENEW \ - (*(void * (*)(void *, size_t)) \ - PyArray_API[290]) -#define PyDataMem_SetEventHook \ - (*(PyDataMem_EventHookFunc * (*)(PyDataMem_EventHookFunc *, void *, void **)) \ - PyArray_API[291]) -#define NPY_DEFAULT_ASSIGN_CASTING (*(NPY_CASTING *)PyArray_API[292]) - -#if !defined(NO_IMPORT_ARRAY) && !defined(NO_IMPORT) -static int -_import_array(void) -{ - int st; - PyObject *numpy = PyImport_ImportModule("numpy.core.multiarray"); - 
PyObject *c_api = NULL; - - if (numpy == NULL) { - PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); - return -1; - } - c_api = PyObject_GetAttrString(numpy, "_ARRAY_API"); - Py_DECREF(numpy); - if (c_api == NULL) { - PyErr_SetString(PyExc_AttributeError, "_ARRAY_API not found"); - return -1; - } - -#if PY_VERSION_HEX >= 0x03000000 - if (!PyCapsule_CheckExact(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCapsule object"); - Py_DECREF(c_api); - return -1; - } - PyArray_API = (void **)PyCapsule_GetPointer(c_api, NULL); -#else - if (!PyCObject_Check(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is not PyCObject object"); - Py_DECREF(c_api); - return -1; - } - PyArray_API = (void **)PyCObject_AsVoidPtr(c_api); -#endif - Py_DECREF(c_api); - if (PyArray_API == NULL) { - PyErr_SetString(PyExc_RuntimeError, "_ARRAY_API is NULL pointer"); - return -1; - } - - /* Perform runtime check of C API version */ - if (NPY_VERSION != PyArray_GetNDArrayCVersion()) { - PyErr_Format(PyExc_RuntimeError, "module compiled against "\ - "ABI version %x but this version of numpy is %x", \ - (int) NPY_VERSION, (int) PyArray_GetNDArrayCVersion()); - return -1; - } - if (NPY_FEATURE_VERSION > PyArray_GetNDArrayCFeatureVersion()) { - PyErr_Format(PyExc_RuntimeError, "module compiled against "\ - "API version %x but this version of numpy is %x", \ - (int) NPY_FEATURE_VERSION, (int) PyArray_GetNDArrayCFeatureVersion()); - return -1; - } - - /* - * Perform runtime check of endianness and check it matches the one set by - * the headers (npy_endian.h) as a safeguard - */ - st = PyArray_GetEndianness(); - if (st == NPY_CPU_UNKNOWN_ENDIAN) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as unknown endian"); - return -1; - } -#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN - if (st != NPY_CPU_BIG) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\ - "big endian, but detected different endianness at runtime"); - return -1; - } -#elif NPY_BYTE_ORDER == NPY_LITTLE_ENDIAN - if (st != NPY_CPU_LITTLE) { - PyErr_Format(PyExc_RuntimeError, "FATAL: module compiled as "\ - "little endian, but detected different endianness at runtime"); - return -1; - } -#endif - - return 0; -} - -#if PY_VERSION_HEX >= 0x03000000 -#define NUMPY_IMPORT_ARRAY_RETVAL NULL -#else -#define NUMPY_IMPORT_ARRAY_RETVAL -#endif - -#define import_array() {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return NUMPY_IMPORT_ARRAY_RETVAL; } } - -#define import_array1(ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); return ret; } } - -#define import_array2(msg, ret) {if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, msg); return ret; } } - -#endif - -#endif diff --git a/include/numpy/__ufunc_api.h b/include/numpy/__ufunc_api.h deleted file mode 100644 index fd81d07..0000000 --- a/include/numpy/__ufunc_api.h +++ /dev/null @@ -1,323 +0,0 @@ - -#ifdef _UMATHMODULE - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION -extern NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#else -NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#endif - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - extern NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#else - NPY_NO_EXPORT PyTypeObject PyUFunc_Type; -#endif - -NPY_NO_EXPORT PyObject * PyUFunc_FromFuncAndData \ - (PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int); -NPY_NO_EXPORT int 
PyUFunc_RegisterLoopForType \ - (PyUFuncObject *, int, PyUFuncGenericFunction, int *, void *); -NPY_NO_EXPORT int PyUFunc_GenericFunction \ - (PyUFuncObject *, PyObject *, PyObject *, PyArrayObject **); -NPY_NO_EXPORT void PyUFunc_f_f_As_d_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_d_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_f_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_g_g \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_F_F_As_D_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_F_F \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_D_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_G_G \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_O_O \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ff_f_As_dd_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ff_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_dd_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_gg_g \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_FF_F_As_DD_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_DD_D \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_FF_F \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_GG_G \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_OO_O \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_O_O_method \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_OO_O_method \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_On_Om \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT int PyUFunc_GetPyValues \ - (char *, int *, int *, PyObject **); -NPY_NO_EXPORT int PyUFunc_checkfperr \ - (int, PyObject *, int *); -NPY_NO_EXPORT void PyUFunc_clearfperr \ - (void); -NPY_NO_EXPORT int PyUFunc_getfperr \ - (void); -NPY_NO_EXPORT int PyUFunc_handlefperr \ - (int, PyObject *, int, int *); -NPY_NO_EXPORT int PyUFunc_ReplaceLoopBySignature \ - (PyUFuncObject *, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *); -NPY_NO_EXPORT PyObject * PyUFunc_FromFuncAndDataAndSignature \ - (PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int, const char *); -NPY_NO_EXPORT int PyUFunc_SetUsesArraysAsData \ - (void **, size_t); -NPY_NO_EXPORT void PyUFunc_e_e \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_e_e_As_f_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_e_e_As_d_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ee_e \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ee_e_As_ff_f \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT void PyUFunc_ee_e_As_dd_d \ - (char **, npy_intp *, npy_intp *, void *); -NPY_NO_EXPORT int PyUFunc_DefaultTypeResolver \ - (PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyObject *, PyArray_Descr **); -NPY_NO_EXPORT int PyUFunc_ValidateCasting \ - (PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyArray_Descr **); - -#else - -#if defined(PY_UFUNC_UNIQUE_SYMBOL) -#define PyUFunc_API PY_UFUNC_UNIQUE_SYMBOL -#endif - -#if defined(NO_IMPORT) || defined(NO_IMPORT_UFUNC) -extern void **PyUFunc_API; -#else -#if 
defined(PY_UFUNC_UNIQUE_SYMBOL) -void **PyUFunc_API; -#else -static void **PyUFunc_API=NULL; -#endif -#endif - -#define PyUFunc_Type (*(PyTypeObject *)PyUFunc_API[0]) -#define PyUFunc_FromFuncAndData \ - (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int)) \ - PyUFunc_API[1]) -#define PyUFunc_RegisterLoopForType \ - (*(int (*)(PyUFuncObject *, int, PyUFuncGenericFunction, int *, void *)) \ - PyUFunc_API[2]) -#define PyUFunc_GenericFunction \ - (*(int (*)(PyUFuncObject *, PyObject *, PyObject *, PyArrayObject **)) \ - PyUFunc_API[3]) -#define PyUFunc_f_f_As_d_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[4]) -#define PyUFunc_d_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[5]) -#define PyUFunc_f_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[6]) -#define PyUFunc_g_g \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[7]) -#define PyUFunc_F_F_As_D_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[8]) -#define PyUFunc_F_F \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[9]) -#define PyUFunc_D_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[10]) -#define PyUFunc_G_G \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[11]) -#define PyUFunc_O_O \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[12]) -#define PyUFunc_ff_f_As_dd_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[13]) -#define PyUFunc_ff_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[14]) -#define PyUFunc_dd_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[15]) -#define PyUFunc_gg_g \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[16]) -#define PyUFunc_FF_F_As_DD_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[17]) -#define PyUFunc_DD_D \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[18]) -#define PyUFunc_FF_F \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[19]) -#define PyUFunc_GG_G \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[20]) -#define PyUFunc_OO_O \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[21]) -#define PyUFunc_O_O_method \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[22]) -#define PyUFunc_OO_O_method \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[23]) -#define PyUFunc_On_Om \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[24]) -#define PyUFunc_GetPyValues \ - (*(int (*)(char *, int *, int *, PyObject **)) \ - PyUFunc_API[25]) -#define PyUFunc_checkfperr \ - (*(int (*)(int, PyObject *, int *)) \ - PyUFunc_API[26]) -#define PyUFunc_clearfperr \ - (*(void (*)(void)) \ - PyUFunc_API[27]) -#define PyUFunc_getfperr \ - (*(int (*)(void)) \ - PyUFunc_API[28]) -#define PyUFunc_handlefperr \ - (*(int (*)(int, PyObject *, int, int *)) \ - PyUFunc_API[29]) -#define PyUFunc_ReplaceLoopBySignature \ - (*(int (*)(PyUFuncObject *, PyUFuncGenericFunction, int *, PyUFuncGenericFunction *)) \ - PyUFunc_API[30]) -#define PyUFunc_FromFuncAndDataAndSignature \ - (*(PyObject * (*)(PyUFuncGenericFunction *, void **, char *, int, int, int, int, char *, char *, int, const char *)) \ - PyUFunc_API[31]) -#define PyUFunc_SetUsesArraysAsData \ - (*(int 
(*)(void **, size_t)) \ - PyUFunc_API[32]) -#define PyUFunc_e_e \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[33]) -#define PyUFunc_e_e_As_f_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[34]) -#define PyUFunc_e_e_As_d_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[35]) -#define PyUFunc_ee_e \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[36]) -#define PyUFunc_ee_e_As_ff_f \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[37]) -#define PyUFunc_ee_e_As_dd_d \ - (*(void (*)(char **, npy_intp *, npy_intp *, void *)) \ - PyUFunc_API[38]) -#define PyUFunc_DefaultTypeResolver \ - (*(int (*)(PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyObject *, PyArray_Descr **)) \ - PyUFunc_API[39]) -#define PyUFunc_ValidateCasting \ - (*(int (*)(PyUFuncObject *, NPY_CASTING, PyArrayObject **, PyArray_Descr **)) \ - PyUFunc_API[40]) - -static int -_import_umath(void) -{ - PyObject *numpy = PyImport_ImportModule("numpy.core.umath"); - PyObject *c_api = NULL; - - if (numpy == NULL) { - PyErr_SetString(PyExc_ImportError, "numpy.core.umath failed to import"); - return -1; - } - c_api = PyObject_GetAttrString(numpy, "_UFUNC_API"); - Py_DECREF(numpy); - if (c_api == NULL) { - PyErr_SetString(PyExc_AttributeError, "_UFUNC_API not found"); - return -1; - } - -#if PY_VERSION_HEX >= 0x03000000 - if (!PyCapsule_CheckExact(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCapsule object"); - Py_DECREF(c_api); - return -1; - } - PyUFunc_API = (void **)PyCapsule_GetPointer(c_api, NULL); -#else - if (!PyCObject_Check(c_api)) { - PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is not PyCObject object"); - Py_DECREF(c_api); - return -1; - } - PyUFunc_API = (void **)PyCObject_AsVoidPtr(c_api); -#endif - Py_DECREF(c_api); - if (PyUFunc_API == NULL) { - PyErr_SetString(PyExc_RuntimeError, "_UFUNC_API is NULL pointer"); - return -1; - } - return 0; -} - -#if PY_VERSION_HEX >= 0x03000000 -#define NUMPY_IMPORT_UMATH_RETVAL NULL -#else -#define NUMPY_IMPORT_UMATH_RETVAL -#endif - -#define import_umath() \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError,\ - "numpy.core.umath failed to import");\ - return NUMPY_IMPORT_UMATH_RETVAL;\ - }\ - } while(0) - -#define import_umath1(ret) \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError,\ - "numpy.core.umath failed to import");\ - return ret;\ - }\ - } while(0) - -#define import_umath2(ret, msg) \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError, msg);\ - return ret;\ - }\ - } while(0) - -#define import_ufunc() \ - do {\ - UFUNC_NOFPE\ - if (_import_umath() < 0) {\ - PyErr_Print();\ - PyErr_SetString(PyExc_ImportError,\ - "numpy.core.umath failed to import");\ - }\ - } while(0) - -#endif diff --git a/include/numpy/_neighborhood_iterator_imp.h b/include/numpy/_neighborhood_iterator_imp.h deleted file mode 100644 index e8860cb..0000000 --- a/include/numpy/_neighborhood_iterator_imp.h +++ /dev/null @@ -1,90 +0,0 @@ -#ifndef _NPY_INCLUDE_NEIGHBORHOOD_IMP -#error You should not include this header directly -#endif -/* - * Private API (here for inline) - */ -static NPY_INLINE int -_PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter); - -/* - * Update to next item of the iterator - * - * Note: this simply increment the coordinates vector, last dimension 
- * incremented first , i.e, for dimension 3 - * ... - * -1, -1, -1 - * -1, -1, 0 - * -1, -1, 1 - * .... - * -1, 0, -1 - * -1, 0, 0 - * .... - * 0, -1, -1 - * 0, -1, 0 - * .... - */ -#define _UPDATE_COORD_ITER(c) \ - wb = iter->coordinates[c] < iter->bounds[c][1]; \ - if (wb) { \ - iter->coordinates[c] += 1; \ - return 0; \ - } \ - else { \ - iter->coordinates[c] = iter->bounds[c][0]; \ - } - -static NPY_INLINE int -_PyArrayNeighborhoodIter_IncrCoord(PyArrayNeighborhoodIterObject* iter) -{ - npy_intp i, wb; - - for (i = iter->nd - 1; i >= 0; --i) { - _UPDATE_COORD_ITER(i) - } - - return 0; -} - -/* - * Version optimized for 2d arrays, manual loop unrolling - */ -static NPY_INLINE int -_PyArrayNeighborhoodIter_IncrCoord2D(PyArrayNeighborhoodIterObject* iter) -{ - npy_intp wb; - - _UPDATE_COORD_ITER(1) - _UPDATE_COORD_ITER(0) - - return 0; -} -#undef _UPDATE_COORD_ITER - -/* - * Advance to the next neighbour - */ -static NPY_INLINE int -PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter) -{ - _PyArrayNeighborhoodIter_IncrCoord (iter); - iter->dataptr = iter->translate((PyArrayIterObject*)iter, iter->coordinates); - - return 0; -} - -/* - * Reset functions - */ -static NPY_INLINE int -PyArrayNeighborhoodIter_Reset(PyArrayNeighborhoodIterObject* iter) -{ - npy_intp i; - - for (i = 0; i < iter->nd; ++i) { - iter->coordinates[i] = iter->bounds[i][0]; - } - iter->dataptr = iter->translate((PyArrayIterObject*)iter, iter->coordinates); - - return 0; -} diff --git a/include/numpy/_numpyconfig.h b/include/numpy/_numpyconfig.h deleted file mode 100644 index d55ffc3..0000000 --- a/include/numpy/_numpyconfig.h +++ /dev/null @@ -1,29 +0,0 @@ -#define NPY_SIZEOF_SHORT SIZEOF_SHORT -#define NPY_SIZEOF_INT SIZEOF_INT -#define NPY_SIZEOF_LONG SIZEOF_LONG -#define NPY_SIZEOF_FLOAT 4 -#define NPY_SIZEOF_COMPLEX_FLOAT 8 -#define NPY_SIZEOF_DOUBLE 8 -#define NPY_SIZEOF_COMPLEX_DOUBLE 16 -#define NPY_SIZEOF_LONGDOUBLE 16 -#define NPY_SIZEOF_COMPLEX_LONGDOUBLE 32 -#define NPY_SIZEOF_PY_INTPTR_T 8 -#define NPY_SIZEOF_PY_LONG_LONG 8 -#define NPY_SIZEOF_LONGLONG 8 -#define NPY_NO_SMP 0 -#define NPY_HAVE_DECL_ISNAN -#define NPY_HAVE_DECL_ISINF -#define NPY_HAVE_DECL_ISFINITE -#define NPY_HAVE_DECL_SIGNBIT -#define NPY_USE_C99_COMPLEX 1 -#define NPY_HAVE_COMPLEX_DOUBLE 1 -#define NPY_HAVE_COMPLEX_FLOAT 1 -#define NPY_HAVE_COMPLEX_LONG_DOUBLE 1 -#define NPY_USE_C99_FORMATS 1 -#define NPY_VISIBILITY_HIDDEN __attribute__((visibility("hidden"))) -#define NPY_ABI_VERSION 0x01000009 -#define NPY_API_VERSION 0x00000007 - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS 1 -#endif diff --git a/include/numpy/arrayobject.h b/include/numpy/arrayobject.h deleted file mode 100644 index a84766f..0000000 --- a/include/numpy/arrayobject.h +++ /dev/null @@ -1,22 +0,0 @@ - -/* This expects the following variables to be defined (besides - the usual ones from pyconfig.h - - SIZEOF_LONG_DOUBLE -- sizeof(long double) or sizeof(double) if no - long double is present on platform. 
- CHAR_BIT -- number of bits in a char (usually 8) - (should be in limits.h) - -*/ - -#ifndef Py_ARRAYOBJECT_H -#define Py_ARRAYOBJECT_H - -#include "ndarrayobject.h" -#include "npy_interrupt.h" - -#ifdef NPY_NO_PREFIX -#include "noprefix.h" -#endif - -#endif diff --git a/include/numpy/arrayscalars.h b/include/numpy/arrayscalars.h deleted file mode 100644 index 64450e7..0000000 --- a/include/numpy/arrayscalars.h +++ /dev/null @@ -1,175 +0,0 @@ -#ifndef _NPY_ARRAYSCALARS_H_ -#define _NPY_ARRAYSCALARS_H_ - -#ifndef _MULTIARRAYMODULE -typedef struct { - PyObject_HEAD - npy_bool obval; -} PyBoolScalarObject; -#endif - - -typedef struct { - PyObject_HEAD - signed char obval; -} PyByteScalarObject; - - -typedef struct { - PyObject_HEAD - short obval; -} PyShortScalarObject; - - -typedef struct { - PyObject_HEAD - int obval; -} PyIntScalarObject; - - -typedef struct { - PyObject_HEAD - long obval; -} PyLongScalarObject; - - -typedef struct { - PyObject_HEAD - npy_longlong obval; -} PyLongLongScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned char obval; -} PyUByteScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned short obval; -} PyUShortScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned int obval; -} PyUIntScalarObject; - - -typedef struct { - PyObject_HEAD - unsigned long obval; -} PyULongScalarObject; - - -typedef struct { - PyObject_HEAD - npy_ulonglong obval; -} PyULongLongScalarObject; - - -typedef struct { - PyObject_HEAD - npy_half obval; -} PyHalfScalarObject; - - -typedef struct { - PyObject_HEAD - float obval; -} PyFloatScalarObject; - - -typedef struct { - PyObject_HEAD - double obval; -} PyDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - npy_longdouble obval; -} PyLongDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - npy_cfloat obval; -} PyCFloatScalarObject; - - -typedef struct { - PyObject_HEAD - npy_cdouble obval; -} PyCDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - npy_clongdouble obval; -} PyCLongDoubleScalarObject; - - -typedef struct { - PyObject_HEAD - PyObject * obval; -} PyObjectScalarObject; - -typedef struct { - PyObject_HEAD - npy_datetime obval; - PyArray_DatetimeMetaData obmeta; -} PyDatetimeScalarObject; - -typedef struct { - PyObject_HEAD - npy_timedelta obval; - PyArray_DatetimeMetaData obmeta; -} PyTimedeltaScalarObject; - - -typedef struct { - PyObject_HEAD - char obval; -} PyScalarObject; - -#define PyStringScalarObject PyStringObject -#define PyUnicodeScalarObject PyUnicodeObject - -typedef struct { - PyObject_VAR_HEAD - char *obval; - PyArray_Descr *descr; - int flags; - PyObject *base; -} PyVoidScalarObject; - -/* Macros - PyScalarObject - PyArrType_Type - are defined in ndarrayobject.h -*/ - -#define PyArrayScalar_False ((PyObject *)(&(_PyArrayScalar_BoolValues[0]))) -#define PyArrayScalar_True ((PyObject *)(&(_PyArrayScalar_BoolValues[1]))) -#define PyArrayScalar_FromLong(i) \ - ((PyObject *)(&(_PyArrayScalar_BoolValues[((i)!=0)]))) -#define PyArrayScalar_RETURN_BOOL_FROM_LONG(i) \ - return Py_INCREF(PyArrayScalar_FromLong(i)), \ - PyArrayScalar_FromLong(i) -#define PyArrayScalar_RETURN_FALSE \ - return Py_INCREF(PyArrayScalar_False), \ - PyArrayScalar_False -#define PyArrayScalar_RETURN_TRUE \ - return Py_INCREF(PyArrayScalar_True), \ - PyArrayScalar_True - -#define PyArrayScalar_New(cls) \ - Py##cls##ArrType_Type.tp_alloc(&Py##cls##ArrType_Type, 0) -#define PyArrayScalar_VAL(obj, cls) \ - ((Py##cls##ScalarObject *)obj)->obval -#define PyArrayScalar_ASSIGN(obj, cls, val) \ - 
PyArrayScalar_VAL(obj, cls) = val - -#endif diff --git a/include/numpy/halffloat.h b/include/numpy/halffloat.h deleted file mode 100644 index 944f0ea..0000000 --- a/include/numpy/halffloat.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __NPY_HALFFLOAT_H__ -#define __NPY_HALFFLOAT_H__ - -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * Half-precision routines - */ - -/* Conversions */ -float npy_half_to_float(npy_half h); -double npy_half_to_double(npy_half h); -npy_half npy_float_to_half(float f); -npy_half npy_double_to_half(double d); -/* Comparisons */ -int npy_half_eq(npy_half h1, npy_half h2); -int npy_half_ne(npy_half h1, npy_half h2); -int npy_half_le(npy_half h1, npy_half h2); -int npy_half_lt(npy_half h1, npy_half h2); -int npy_half_ge(npy_half h1, npy_half h2); -int npy_half_gt(npy_half h1, npy_half h2); -/* faster *_nonan variants for when you know h1 and h2 are not NaN */ -int npy_half_eq_nonan(npy_half h1, npy_half h2); -int npy_half_lt_nonan(npy_half h1, npy_half h2); -int npy_half_le_nonan(npy_half h1, npy_half h2); -/* Miscellaneous functions */ -int npy_half_iszero(npy_half h); -int npy_half_isnan(npy_half h); -int npy_half_isinf(npy_half h); -int npy_half_isfinite(npy_half h); -int npy_half_signbit(npy_half h); -npy_half npy_half_copysign(npy_half x, npy_half y); -npy_half npy_half_spacing(npy_half h); -npy_half npy_half_nextafter(npy_half x, npy_half y); - -/* - * Half-precision constants - */ - -#define NPY_HALF_ZERO (0x0000u) -#define NPY_HALF_PZERO (0x0000u) -#define NPY_HALF_NZERO (0x8000u) -#define NPY_HALF_ONE (0x3c00u) -#define NPY_HALF_NEGONE (0xbc00u) -#define NPY_HALF_PINF (0x7c00u) -#define NPY_HALF_NINF (0xfc00u) -#define NPY_HALF_NAN (0x7e00u) - -#define NPY_MAX_HALF (0x7bffu) - -/* - * Bit-level conversions - */ - -npy_uint16 npy_floatbits_to_halfbits(npy_uint32 f); -npy_uint16 npy_doublebits_to_halfbits(npy_uint64 d); -npy_uint32 npy_halfbits_to_floatbits(npy_uint16 h); -npy_uint64 npy_halfbits_to_doublebits(npy_uint16 h); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/include/numpy/multiarray_api.txt b/include/numpy/multiarray_api.txt deleted file mode 100644 index 7e588f0..0000000 --- a/include/numpy/multiarray_api.txt +++ /dev/null @@ -1,2375 +0,0 @@ - -=========== -Numpy C-API -=========== -:: - - unsigned int - PyArray_GetNDArrayCVersion(void ) - - -Included at the very first so not auto-grabbed and thus not labeled. - -:: - - int - PyArray_SetNumericOps(PyObject *dict) - -Set internal structure with number functions that all arrays will use - -:: - - PyObject * - PyArray_GetNumericOps(void ) - -Get dictionary showing number functions that all arrays will use - -:: - - int - PyArray_INCREF(PyArrayObject *mp) - -For object arrays, increment all internal references. - -:: - - int - PyArray_XDECREF(PyArrayObject *mp) - -Decrement all internal references for object arrays. -(or arrays with object fields) - -:: - - void - PyArray_SetStringFunction(PyObject *op, int repr) - -Set the array print function to be a Python function. - -:: - - PyArray_Descr * - PyArray_DescrFromType(int type) - -Get the PyArray_Descr structure for a type. - -:: - - PyObject * - PyArray_TypeObjectFromType(int type) - -Get a typeobject from a type-number -- can return NULL. - -New reference - -:: - - char * - PyArray_Zero(PyArrayObject *arr) - -Get pointer to zero of correct type for array. 
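A minimal sketch of how the import_array()/import_umath() machinery above is typically driven from a Python 3 extension's init function; the module name "example" is purely hypothetical::

    #include <Python.h>
    #include <numpy/arrayobject.h>
    #include <numpy/ufuncobject.h>

    static struct PyModuleDef moduledef = {
        PyModuleDef_HEAD_INIT, "example", NULL, -1, NULL
    };

    PyMODINIT_FUNC
    PyInit_example(void)
    {
        import_array();   /* fills the PyArray_API table; returns NULL on failure */
        import_umath();   /* fills the PyUFunc_API table the same way */
        return PyModule_Create(&moduledef);
    }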
- -:: - - char * - PyArray_One(PyArrayObject *arr) - -Get pointer to one of correct type for array - -:: - - PyObject * - PyArray_CastToType(PyArrayObject *arr, PyArray_Descr *dtype, int - is_f_order) - -For backward compatibility - -Cast an array using typecode structure. -steals reference to at --- cannot be NULL - -This function always makes a copy of arr, even if the dtype -doesn't change. - -:: - - int - PyArray_CastTo(PyArrayObject *out, PyArrayObject *mp) - -Cast to an already created array. - -:: - - int - PyArray_CastAnyTo(PyArrayObject *out, PyArrayObject *mp) - -Cast to an already created array. Arrays don't have to be "broadcastable" -Only requirement is they have the same number of elements. - -:: - - int - PyArray_CanCastSafely(int fromtype, int totype) - -Check the type coercion rules. - -:: - - npy_bool - PyArray_CanCastTo(PyArray_Descr *from, PyArray_Descr *to) - -leaves reference count alone --- cannot be NULL - -PyArray_CanCastTypeTo is equivalent to this, but adds a 'casting' -parameter. - -:: - - int - PyArray_ObjectType(PyObject *op, int minimum_type) - -Return the typecode of the array a Python object would be converted to - -Returns the type number the result should have, or NPY_NOTYPE on error. - -:: - - PyArray_Descr * - PyArray_DescrFromObject(PyObject *op, PyArray_Descr *mintype) - -new reference -- accepts NULL for mintype - -:: - - PyArrayObject ** - PyArray_ConvertToCommonType(PyObject *op, int *retn) - - -:: - - PyArray_Descr * - PyArray_DescrFromScalar(PyObject *sc) - -Return descr object from array scalar. - -New reference - -:: - - PyArray_Descr * - PyArray_DescrFromTypeObject(PyObject *type) - - -:: - - npy_intp - PyArray_Size(PyObject *op) - -Compute the size of an array (in number of items) - -:: - - PyObject * - PyArray_Scalar(void *data, PyArray_Descr *descr, PyObject *base) - -Get scalar-equivalent to a region of memory described by a descriptor. - -:: - - PyObject * - PyArray_FromScalar(PyObject *scalar, PyArray_Descr *outcode) - -Get 0-dim array from scalar - -0-dim array from array-scalar object -always contains a copy of the data -unless outcode is NULL, it is of void type and the referrer does -not own it either. - -steals reference to outcode - -:: - - void - PyArray_ScalarAsCtype(PyObject *scalar, void *ctypeptr) - -Convert to c-type - -no error checking is performed -- ctypeptr must be same type as scalar -in case of flexible type, the data is not copied -into ctypeptr which is expected to be a pointer to pointer - -:: - - int - PyArray_CastScalarToCtype(PyObject *scalar, void - *ctypeptr, PyArray_Descr *outcode) - -Cast Scalar to c-type - -The output buffer must be large-enough to receive the value -Even for flexible types which is different from ScalarAsCtype -where only a reference for flexible types is returned - -This may not work right on narrow builds for NumPy unicode scalars. - -:: - - int - PyArray_CastScalarDirect(PyObject *scalar, PyArray_Descr - *indescr, void *ctypeptr, int outtype) - -Cast Scalar to c-type - -:: - - PyObject * - PyArray_ScalarFromObject(PyObject *object) - -Get an Array Scalar From a Python Object - -Returns NULL if unsuccessful but error is only set if another error occurred. -Currently only Numeric-like object supported. - -:: - - PyArray_VectorUnaryFunc * - PyArray_GetCastFunc(PyArray_Descr *descr, int type_num) - -Get a cast function to cast from the input descriptor to the -output type_number (must be a registered data-type). -Returns NULL if un-successful. 
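A brief sketch of the casting queries above, assuming import_array() has already run in the calling extension; the helper name is hypothetical::

    #include <Python.h>
    #include <numpy/arrayobject.h>

    static int
    int32_casts_to_double(void)
    {
        /* Typenum-based check: int32 -> float64 is a safe cast. */
        int ok = PyArray_CanCastSafely(NPY_INT32, NPY_DOUBLE);

        /* The same question asked through descriptors. */
        PyArray_Descr *from = PyArray_DescrFromType(NPY_INT32);
        PyArray_Descr *to = PyArray_DescrFromType(NPY_DOUBLE);
        ok = ok && PyArray_CanCastTo(from, to);
        Py_DECREF(from);
        Py_DECREF(to);
        return ok;
    }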
- -:: - - PyObject * - PyArray_FromDims(int nd, int *d, int type) - -Construct an empty array from dimensions and typenum - -:: - - PyObject * - PyArray_FromDimsAndDataAndDescr(int nd, int *d, PyArray_Descr - *descr, char *data) - -Like FromDimsAndData but uses the Descr structure instead of typecode -as input. - -:: - - PyObject * - PyArray_FromAny(PyObject *op, PyArray_Descr *newtype, int - min_depth, int max_depth, int flags, PyObject - *context) - -Does not check for NPY_ARRAY_ENSURECOPY and NPY_ARRAY_NOTSWAPPED in flags -Steals a reference to newtype --- which can be NULL - -:: - - PyObject * - PyArray_EnsureArray(PyObject *op) - -This is a quick wrapper around PyArray_FromAny(op, NULL, 0, 0, ENSUREARRAY) -that special cases Arrays and PyArray_Scalars up front -It *steals a reference* to the object -It also guarantees that the result is PyArray_Type -Because it decrefs op if any conversion needs to take place -so it can be used like PyArray_EnsureArray(some_function(...)) - -:: - - PyObject * - PyArray_EnsureAnyArray(PyObject *op) - - -:: - - PyObject * - PyArray_FromFile(FILE *fp, PyArray_Descr *dtype, npy_intp num, char - *sep) - - -Given a ``FILE *`` pointer ``fp``, and a ``PyArray_Descr``, return an -array corresponding to the data encoded in that file. - -If the dtype is NULL, the default array type is used (double). -If non-null, the reference is stolen. - -The number of elements to read is given as ``num``; if it is < 0, then -then as many as possible are read. - -If ``sep`` is NULL or empty, then binary data is assumed, else -text data, with ``sep`` as the separator between elements. Whitespace in -the separator matches any length of whitespace in the text, and a match -for whitespace around the separator is added. - -For memory-mapped files, use the buffer interface. No more data than -necessary is read by this routine. - -:: - - PyObject * - PyArray_FromString(char *data, npy_intp slen, PyArray_Descr - *dtype, npy_intp num, char *sep) - - -Given a pointer to a string ``data``, a string length ``slen``, and -a ``PyArray_Descr``, return an array corresponding to the data -encoded in that string. - -If the dtype is NULL, the default array type is used (double). -If non-null, the reference is stolen. - -If ``slen`` is < 0, then the end of string is used for text data. -It is an error for ``slen`` to be < 0 for binary data (since embedded NULLs -would be the norm). - -The number of elements to read is given as ``num``; if it is < 0, then -then as many as possible are read. - -If ``sep`` is NULL or empty, then binary data is assumed, else -text data, with ``sep`` as the separator between elements. Whitespace in -the separator matches any length of whitespace in the text, and a match -for whitespace around the separator is added. - -:: - - PyObject * - PyArray_FromBuffer(PyObject *buf, PyArray_Descr *type, npy_intp - count, npy_intp offset) - - -:: - - PyObject * - PyArray_FromIter(PyObject *obj, PyArray_Descr *dtype, npy_intp count) - - -steals a reference to dtype (which cannot be NULL) - -:: - - PyObject * - PyArray_Return(PyArrayObject *mp) - - -Return either an array or the appropriate Python object if the array -is 0d and matches a Python type. 
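A short sketch of the PyArray_FromAny call documented above, coercing an arbitrary object to a contiguous float64 array; the descriptor reference is stolen, so no DECREF of dtype is needed, and the helper name is hypothetical::

    #include <Python.h>
    #include <numpy/arrayobject.h>

    static PyArrayObject *
    as_double_array(PyObject *obj)
    {
        PyArray_Descr *dtype = PyArray_DescrFromType(NPY_DOUBLE);
        /* min_depth/max_depth of 0 mean "no constraint"; context unused here */
        return (PyArrayObject *)PyArray_FromAny(obj, dtype, 0, 0,
                                                NPY_ARRAY_CARRAY, NULL);
    }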
- -:: - - PyObject * - PyArray_GetField(PyArrayObject *self, PyArray_Descr *typed, int - offset) - -Get a subset of bytes from each element of the array - -:: - - int - PyArray_SetField(PyArrayObject *self, PyArray_Descr *dtype, int - offset, PyObject *val) - -Set a subset of bytes from each element of the array - -:: - - PyObject * - PyArray_Byteswap(PyArrayObject *self, npy_bool inplace) - - -:: - - PyObject * - PyArray_Resize(PyArrayObject *self, PyArray_Dims *newshape, int - refcheck, NPY_ORDER order) - -Resize (reallocate data). Only works if nothing else is referencing this -array and it is contiguous. If refcheck is 0, then the reference count is -not checked and assumed to be 1. You still must own this data and have no -weak-references and no base object. - -:: - - int - PyArray_MoveInto(PyArrayObject *dst, PyArrayObject *src) - -Move the memory of one array into another, allowing for overlapping data. - -Returns 0 on success, negative on failure. - -:: - - int - PyArray_CopyInto(PyArrayObject *dst, PyArrayObject *src) - -Copy an Array into another array. -Broadcast to the destination shape if necessary. - -Returns 0 on success, -1 on failure. - -:: - - int - PyArray_CopyAnyInto(PyArrayObject *dst, PyArrayObject *src) - -Copy an Array into another array -- memory must not overlap -Does not require src and dest to have "broadcastable" shapes -(only the same number of elements). - -TODO: For NumPy 2.0, this could accept an order parameter which -only allows NPY_CORDER and NPY_FORDER. Could also rename -this to CopyAsFlat to make the name more intuitive. - -Returns 0 on success, -1 on error. - -:: - - int - PyArray_CopyObject(PyArrayObject *dest, PyObject *src_object) - - -:: - - PyObject * - PyArray_NewCopy(PyArrayObject *obj, NPY_ORDER order) - -Copy an array. - -:: - - PyObject * - PyArray_ToList(PyArrayObject *self) - -To List - -:: - - PyObject * - PyArray_ToString(PyArrayObject *self, NPY_ORDER order) - - -:: - - int - PyArray_ToFile(PyArrayObject *self, FILE *fp, char *sep, char *format) - -To File - -:: - - int - PyArray_Dump(PyObject *self, PyObject *file, int protocol) - - -:: - - PyObject * - PyArray_Dumps(PyObject *self, int protocol) - - -:: - - int - PyArray_ValidType(int type) - -Is the typenum valid? - -:: - - void - PyArray_UpdateFlags(PyArrayObject *ret, int flagmask) - -Update Several Flags at once. - -:: - - PyObject * - PyArray_New(PyTypeObject *subtype, int nd, npy_intp *dims, int - type_num, npy_intp *strides, void *data, int itemsize, int - flags, PyObject *obj) - -Generic new array creation routine. - -:: - - PyObject * - PyArray_NewFromDescr(PyTypeObject *subtype, PyArray_Descr *descr, int - nd, npy_intp *dims, npy_intp *strides, void - *data, int flags, PyObject *obj) - -Generic new array creation routine. - -steals a reference to descr (even on failure) - -:: - - PyArray_Descr * - PyArray_DescrNew(PyArray_Descr *base) - -base cannot be NULL - -:: - - PyArray_Descr * - PyArray_DescrNewFromType(int type_num) - - -:: - - double - PyArray_GetPriority(PyObject *obj, double default_) - -Get Priority from object - -:: - - PyObject * - PyArray_IterNew(PyObject *obj) - -Get Iterator. - -:: - - PyObject * - PyArray_MultiIterNew(int n, ... 
) - -Get MultiIterator, - -:: - - int - PyArray_PyIntAsInt(PyObject *o) - - -:: - - npy_intp - PyArray_PyIntAsIntp(PyObject *o) - - -:: - - int - PyArray_Broadcast(PyArrayMultiIterObject *mit) - - -:: - - void - PyArray_FillObjectArray(PyArrayObject *arr, PyObject *obj) - -Assumes contiguous - -:: - - int - PyArray_FillWithScalar(PyArrayObject *arr, PyObject *obj) - - -:: - - npy_bool - PyArray_CheckStrides(int elsize, int nd, npy_intp numbytes, npy_intp - offset, npy_intp *dims, npy_intp *newstrides) - - -:: - - PyArray_Descr * - PyArray_DescrNewByteorder(PyArray_Descr *self, char newendian) - - -returns a copy of the PyArray_Descr structure with the byteorder -altered: -no arguments: The byteorder is swapped (in all subfields as well) -single argument: The byteorder is forced to the given state -(in all subfields as well) - -Valid states: ('big', '>') or ('little' or '<') -('native', or '=') - -If a descr structure with | is encountered it's own -byte-order is not changed but any fields are: - - -Deep bytorder change of a data-type descriptor -Leaves reference count of self unchanged --- does not DECREF self *** - -:: - - PyObject * - PyArray_IterAllButAxis(PyObject *obj, int *inaxis) - -Get Iterator that iterates over all but one axis (don't use this with -PyArray_ITER_GOTO1D). The axis will be over-written if negative -with the axis having the smallest stride. - -:: - - PyObject * - PyArray_CheckFromAny(PyObject *op, PyArray_Descr *descr, int - min_depth, int max_depth, int requires, PyObject - *context) - -steals a reference to descr -- accepts NULL - -:: - - PyObject * - PyArray_FromArray(PyArrayObject *arr, PyArray_Descr *newtype, int - flags) - -steals reference to newtype --- acc. NULL - -:: - - PyObject * - PyArray_FromInterface(PyObject *origin) - - -:: - - PyObject * - PyArray_FromStructInterface(PyObject *input) - - -:: - - PyObject * - PyArray_FromArrayAttr(PyObject *op, PyArray_Descr *typecode, PyObject - *context) - - -:: - - NPY_SCALARKIND - PyArray_ScalarKind(int typenum, PyArrayObject **arr) - -ScalarKind - -Returns the scalar kind of a type number, with an -optional tweak based on the scalar value itself. -If no scalar is provided, it returns INTPOS_SCALAR -for both signed and unsigned integers, otherwise -it checks the sign of any signed integer to choose -INTNEG_SCALAR when appropriate. - -:: - - int - PyArray_CanCoerceScalar(int thistype, int neededtype, NPY_SCALARKIND - scalar) - - -Determines whether the data type 'thistype', with -scalar kind 'scalar', can be coerced into 'neededtype'. - -:: - - PyObject * - PyArray_NewFlagsObject(PyObject *obj) - - -Get New ArrayFlagsObject - -:: - - npy_bool - PyArray_CanCastScalar(PyTypeObject *from, PyTypeObject *to) - -See if array scalars can be cast. - -TODO: For NumPy 2.0, add a NPY_CASTING parameter. - -:: - - int - PyArray_CompareUCS4(npy_ucs4 *s1, npy_ucs4 *s2, size_t len) - - -:: - - int - PyArray_RemoveSmallest(PyArrayMultiIterObject *multi) - -Adjusts previously broadcasted iterators so that the axis with -the smallest sum of iterator strides is not iterated over. -Returns dimension which is smallest in the range [0,multi->nd). -A -1 is returned if multi->nd == 0. 
- -don't use with PyArray_ITER_GOTO1D because factors are not adjusted - -:: - - int - PyArray_ElementStrides(PyObject *obj) - - -:: - - void - PyArray_Item_INCREF(char *data, PyArray_Descr *descr) - - -:: - - void - PyArray_Item_XDECREF(char *data, PyArray_Descr *descr) - - -:: - - PyObject * - PyArray_FieldNames(PyObject *fields) - -Return the tuple of ordered field names from a dictionary. - -:: - - PyObject * - PyArray_Transpose(PyArrayObject *ap, PyArray_Dims *permute) - -Return Transpose. - -:: - - PyObject * - PyArray_TakeFrom(PyArrayObject *self0, PyObject *indices0, int - axis, PyArrayObject *out, NPY_CLIPMODE clipmode) - -Take - -:: - - PyObject * - PyArray_PutTo(PyArrayObject *self, PyObject*values0, PyObject - *indices0, NPY_CLIPMODE clipmode) - -Put values into an array - -:: - - PyObject * - PyArray_PutMask(PyArrayObject *self, PyObject*values0, PyObject*mask0) - -Put values into an array according to a mask. - -:: - - PyObject * - PyArray_Repeat(PyArrayObject *aop, PyObject *op, int axis) - -Repeat the array. - -:: - - PyObject * - PyArray_Choose(PyArrayObject *ip, PyObject *op, PyArrayObject - *out, NPY_CLIPMODE clipmode) - - -:: - - int - PyArray_Sort(PyArrayObject *op, int axis, NPY_SORTKIND which) - -Sort an array in-place - -:: - - PyObject * - PyArray_ArgSort(PyArrayObject *op, int axis, NPY_SORTKIND which) - -ArgSort an array - -:: - - PyObject * - PyArray_SearchSorted(PyArrayObject *op1, PyObject *op2, NPY_SEARCHSIDE - side, PyObject *perm) - - -Search the sorted array op1 for the location of the items in op2. The -result is an array of indexes, one for each element in op2, such that if -the item were to be inserted in op1 just before that index the array -would still be in sorted order. - -Parameters ----------- -op1 : PyArrayObject * -Array to be searched, must be 1-D. -op2 : PyObject * -Array of items whose insertion indexes in op1 are wanted -side : {NPY_SEARCHLEFT, NPY_SEARCHRIGHT} -If NPY_SEARCHLEFT, return first valid insertion indexes -If NPY_SEARCHRIGHT, return last valid insertion indexes -perm : PyObject * -Permutation array that sorts op1 (optional) - -Returns -------- -ret : PyObject * -New reference to npy_intp array containing indexes where items in op2 -could be validly inserted into op1. NULL on error. - -Notes ------ -Binary search is used to find the indexes. - -:: - - PyObject * - PyArray_ArgMax(PyArrayObject *op, int axis, PyArrayObject *out) - -ArgMax - -:: - - PyObject * - PyArray_ArgMin(PyArrayObject *op, int axis, PyArrayObject *out) - -ArgMin - -:: - - PyObject * - PyArray_Reshape(PyArrayObject *self, PyObject *shape) - -Reshape - -:: - - PyObject * - PyArray_Newshape(PyArrayObject *self, PyArray_Dims *newdims, NPY_ORDER - order) - -New shape for an array - -:: - - PyObject * - PyArray_Squeeze(PyArrayObject *self) - - -return a new view of the array object with all of its unit-length -dimensions squeezed out if needed, otherwise -return the same array. 
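A small sketch chaining the in-place sort and searchsorted calls above; the input array is assumed to be writeable and 1-D, and error handling is abbreviated::

    #include <Python.h>
    #include <numpy/arrayobject.h>

    static PyObject *
    sorted_insert_points(PyArrayObject *arr, PyObject *values)
    {
        /* op1 must be 1-D and sorted before searching, per the notes above */
        if (PyArray_Sort(arr, -1, NPY_QUICKSORT) < 0) {
            return NULL;
        }
        return PyArray_SearchSorted(arr, values, NPY_SEARCHLEFT, NULL);
    }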
- -:: - - PyObject * - PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject - *pytype) - -View -steals a reference to type -- accepts NULL - -:: - - PyObject * - PyArray_SwapAxes(PyArrayObject *ap, int a1, int a2) - -SwapAxes - -:: - - PyObject * - PyArray_Max(PyArrayObject *ap, int axis, PyArrayObject *out) - -Max - -:: - - PyObject * - PyArray_Min(PyArrayObject *ap, int axis, PyArrayObject *out) - -Min - -:: - - PyObject * - PyArray_Ptp(PyArrayObject *ap, int axis, PyArrayObject *out) - -Ptp - -:: - - PyObject * - PyArray_Mean(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -Mean - -:: - - PyObject * - PyArray_Trace(PyArrayObject *self, int offset, int axis1, int - axis2, int rtype, PyArrayObject *out) - -Trace - -:: - - PyObject * - PyArray_Diagonal(PyArrayObject *self, int offset, int axis1, int - axis2) - -Diagonal - -In NumPy versions prior to 1.7, this function always returned a copy of -the diagonal array. In 1.7, the code has been updated to compute a view -onto 'self', but it still copies this array before returning, as well as -setting the internal WARN_ON_WRITE flag. In a future version, it will -simply return a view onto self. - -:: - - PyObject * - PyArray_Clip(PyArrayObject *self, PyObject *min, PyObject - *max, PyArrayObject *out) - -Clip - -:: - - PyObject * - PyArray_Conjugate(PyArrayObject *self, PyArrayObject *out) - -Conjugate - -:: - - PyObject * - PyArray_Nonzero(PyArrayObject *self) - -Nonzero - -TODO: In NumPy 2.0, should make the iteration order a parameter. - -:: - - PyObject * - PyArray_Std(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out, int variance) - -Set variance to 1 to by-pass square-root calculation and return variance -Std - -:: - - PyObject * - PyArray_Sum(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -Sum - -:: - - PyObject * - PyArray_CumSum(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -CumSum - -:: - - PyObject * - PyArray_Prod(PyArrayObject *self, int axis, int rtype, PyArrayObject - *out) - -Prod - -:: - - PyObject * - PyArray_CumProd(PyArrayObject *self, int axis, int - rtype, PyArrayObject *out) - -CumProd - -:: - - PyObject * - PyArray_All(PyArrayObject *self, int axis, PyArrayObject *out) - -All - -:: - - PyObject * - PyArray_Any(PyArrayObject *self, int axis, PyArrayObject *out) - -Any - -:: - - PyObject * - PyArray_Compress(PyArrayObject *self, PyObject *condition, int - axis, PyArrayObject *out) - -Compress - -:: - - PyObject * - PyArray_Flatten(PyArrayObject *a, NPY_ORDER order) - -Flatten - -:: - - PyObject * - PyArray_Ravel(PyArrayObject *arr, NPY_ORDER order) - -Ravel -Returns a contiguous array - -:: - - npy_intp - PyArray_MultiplyList(npy_intp *l1, int n) - -Multiply a List - -:: - - int - PyArray_MultiplyIntList(int *l1, int n) - -Multiply a List of ints - -:: - - void * - PyArray_GetPtr(PyArrayObject *obj, npy_intp*ind) - -Produce a pointer into array - -:: - - int - PyArray_CompareLists(npy_intp *l1, npy_intp *l2, int n) - -Compare Lists - -:: - - int - PyArray_AsCArray(PyObject **op, void *ptr, npy_intp *dims, int - nd, PyArray_Descr*typedescr) - -Simulate a C-array -steals a reference to typedescr -- can be NULL - -:: - - int - PyArray_As1D(PyObject **op, char **ptr, int *d1, int typecode) - -Convert to a 1D C-array - -:: - - int - PyArray_As2D(PyObject **op, char ***ptr, int *d1, int *d2, int - typecode) - -Convert to a 2D C-array - -:: - - int - PyArray_Free(PyObject *op, void *ptr) - -Free pointers created if As2D is called - -:: - - int - 
PyArray_Converter(PyObject *object, PyObject **address) - - -Useful to pass as converter function for O& processing in PyArgs_ParseTuple. - -This conversion function can be used with the "O&" argument for -PyArg_ParseTuple. It will immediately return an object of array type -or will convert to a NPY_ARRAY_CARRAY any other object. - -If you use PyArray_Converter, you must DECREF the array when finished -as you get a new reference to it. - -:: - - int - PyArray_IntpFromSequence(PyObject *seq, npy_intp *vals, int maxvals) - -PyArray_IntpFromSequence -Returns the number of dimensions or -1 if an error occurred. -vals must be large enough to hold maxvals - -:: - - PyObject * - PyArray_Concatenate(PyObject *op, int axis) - -Concatenate - -Concatenate an arbitrary Python sequence into an array. -op is a python object supporting the sequence interface. -Its elements will be concatenated together to form a single -multidimensional array. If axis is NPY_MAXDIMS or bigger, then -each sequence object will be flattened before concatenation - -:: - - PyObject * - PyArray_InnerProduct(PyObject *op1, PyObject *op2) - -Numeric.innerproduct(a,v) - -:: - - PyObject * - PyArray_MatrixProduct(PyObject *op1, PyObject *op2) - -Numeric.matrixproduct(a,v) -just like inner product but does the swapaxes stuff on the fly - -:: - - PyObject * - PyArray_CopyAndTranspose(PyObject *op) - -Copy and Transpose - -Could deprecate this function, as there isn't a speed benefit over -calling Transpose and then Copy. - -:: - - PyObject * - PyArray_Correlate(PyObject *op1, PyObject *op2, int mode) - -Numeric.correlate(a1,a2,mode) - -:: - - int - PyArray_TypestrConvert(int itemsize, int gentype) - -Typestr converter - -:: - - int - PyArray_DescrConverter(PyObject *obj, PyArray_Descr **at) - -Get typenum from an object -- None goes to NPY_DEFAULT_TYPE -This function takes a Python object representing a type and converts it -to a the correct PyArray_Descr * structure to describe the type. - -Many objects can be used to represent a data-type which in NumPy is -quite a flexible concept. - -This is the central code that converts Python objects to -Type-descriptor objects that are used throughout numpy. - -Returns a new reference in *at, but the returned should not be -modified as it may be one of the canonical immutable objects or -a reference to the input obj. - -:: - - int - PyArray_DescrConverter2(PyObject *obj, PyArray_Descr **at) - -Get typenum from an object -- None goes to NULL - -:: - - int - PyArray_IntpConverter(PyObject *obj, PyArray_Dims *seq) - -Get intp chunk from sequence - -This function takes a Python sequence object and allocates and -fills in an intp array with the converted values. - -Remember to free the pointer seq.ptr when done using -PyDimMem_FREE(seq.ptr)** - -:: - - int - PyArray_BufferConverter(PyObject *obj, PyArray_Chunk *buf) - -Get buffer chunk from object - -this function takes a Python object which exposes the (single-segment) -buffer interface and returns a pointer to the data segment - -You should increment the reference count by one of buf->base -if you will hang on to a reference - -You only get a borrowed reference to the object. Do not free the -memory... - -:: - - int - PyArray_AxisConverter(PyObject *obj, int *axis) - -Get axis from an object (possibly None) -- a converter function, - -See also PyArray_ConvertMultiAxis, which also handles a tuple of axes. 
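A sketch of the "O&" usage suggested above for PyArray_Converter; the wrapping function is hypothetical::

    #include <Python.h>
    #include <numpy/arrayobject.h>

    static PyObject *
    total_size(PyObject *self, PyObject *args)
    {
        PyObject *arr = NULL;
        if (!PyArg_ParseTuple(args, "O&", PyArray_Converter, &arr)) {
            return NULL;
        }
        npy_intp n = PyArray_SIZE((PyArrayObject *)arr);
        Py_DECREF(arr);   /* the converter handed back a new reference */
        return PyLong_FromSsize_t((Py_ssize_t)n);
    }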
- -:: - - int - PyArray_BoolConverter(PyObject *object, npy_bool *val) - -Convert an object to true / false - -:: - - int - PyArray_ByteorderConverter(PyObject *obj, char *endian) - -Convert object to endian - -:: - - int - PyArray_OrderConverter(PyObject *object, NPY_ORDER *val) - -Convert an object to FORTRAN / C / ANY / KEEP - -:: - - unsigned char - PyArray_EquivTypes(PyArray_Descr *type1, PyArray_Descr *type2) - - -This function returns true if the two typecodes are -equivalent (same basic kind and same itemsize). - -:: - - PyObject * - PyArray_Zeros(int nd, npy_intp *dims, PyArray_Descr *type, int - is_f_order) - -Zeros - -steal a reference -accepts NULL type - -:: - - PyObject * - PyArray_Empty(int nd, npy_intp *dims, PyArray_Descr *type, int - is_f_order) - -Empty - -accepts NULL type -steals referenct to type - -:: - - PyObject * - PyArray_Where(PyObject *condition, PyObject *x, PyObject *y) - -Where - -:: - - PyObject * - PyArray_Arange(double start, double stop, double step, int type_num) - -Arange, - -:: - - PyObject * - PyArray_ArangeObj(PyObject *start, PyObject *stop, PyObject - *step, PyArray_Descr *dtype) - - -ArangeObj, - -this doesn't change the references - -:: - - int - PyArray_SortkindConverter(PyObject *obj, NPY_SORTKIND *sortkind) - -Convert object to sort kind - -:: - - PyObject * - PyArray_LexSort(PyObject *sort_keys, int axis) - -LexSort an array providing indices that will sort a collection of arrays -lexicographically. The first key is sorted on first, followed by the second key --- requires that arg"merge"sort is available for each sort_key - -Returns an index array that shows the indexes for the lexicographic sort along -the given axis. - -:: - - PyObject * - PyArray_Round(PyArrayObject *a, int decimals, PyArrayObject *out) - -Round - -:: - - unsigned char - PyArray_EquivTypenums(int typenum1, int typenum2) - - -:: - - int - PyArray_RegisterDataType(PyArray_Descr *descr) - -Register Data type -Does not change the reference count of descr - -:: - - int - PyArray_RegisterCastFunc(PyArray_Descr *descr, int - totype, PyArray_VectorUnaryFunc *castfunc) - -Register Casting Function -Replaces any function currently stored. - -:: - - int - PyArray_RegisterCanCast(PyArray_Descr *descr, int - totype, NPY_SCALARKIND scalar) - -Register a type number indicating that a descriptor can be cast -to it safely - -:: - - void - PyArray_InitArrFuncs(PyArray_ArrFuncs *f) - -Initialize arrfuncs to NULL - -:: - - PyObject * - PyArray_IntTupleFromIntp(int len, npy_intp *vals) - -PyArray_IntTupleFromIntp - -:: - - int - PyArray_TypeNumFromName(char *str) - - -:: - - int - PyArray_ClipmodeConverter(PyObject *object, NPY_CLIPMODE *val) - -Convert an object to NPY_RAISE / NPY_CLIP / NPY_WRAP - -:: - - int - PyArray_OutputConverter(PyObject *object, PyArrayObject **address) - -Useful to pass as converter function for O& processing in -PyArgs_ParseTuple for output arrays - -:: - - PyObject * - PyArray_BroadcastToShape(PyObject *obj, npy_intp *dims, int nd) - -Get Iterator broadcast to a particular shape - -:: - - void - _PyArray_SigintHandler(int signum) - - -:: - - void* - _PyArray_GetSigintBuf(void ) - - -:: - - int - PyArray_DescrAlignConverter(PyObject *obj, PyArray_Descr **at) - - -Get type-descriptor from an object forcing alignment if possible -None goes to DEFAULT type. - -any object with the .fields attribute and/or .itemsize attribute (if the -.fields attribute does not give the total size -- i.e. a partial record -naming). 
If itemsize is given it must be >= size computed from fields - -The .fields attribute must return a convertible dictionary if present. -Result inherits from NPY_VOID. - -:: - - int - PyArray_DescrAlignConverter2(PyObject *obj, PyArray_Descr **at) - - -Get type-descriptor from an object forcing alignment if possible -None goes to NULL. - -:: - - int - PyArray_SearchsideConverter(PyObject *obj, void *addr) - -Convert object to searchsorted side - -:: - - PyObject * - PyArray_CheckAxis(PyArrayObject *arr, int *axis, int flags) - -PyArray_CheckAxis - -check that axis is valid -convert 0-d arrays to 1-d arrays - -:: - - npy_intp - PyArray_OverflowMultiplyList(npy_intp *l1, int n) - -Multiply a List of Non-negative numbers with over-flow detection. - -:: - - int - PyArray_CompareString(char *s1, char *s2, size_t len) - - -:: - - PyObject * - PyArray_MultiIterFromObjects(PyObject **mps, int n, int nadd, ... ) - -Get MultiIterator from array of Python objects and any additional - -PyObject **mps -- array of PyObjects -int n - number of PyObjects in the array -int nadd - number of additional arrays to include in the iterator. - -Returns a multi-iterator object. - -:: - - int - PyArray_GetEndianness(void ) - - -:: - - unsigned int - PyArray_GetNDArrayCFeatureVersion(void ) - -Returns the built-in (at compilation time) C API version - -:: - - PyObject * - PyArray_Correlate2(PyObject *op1, PyObject *op2, int mode) - -correlate(a1,a2,mode) - -This function computes the usual correlation (correlate(a1, a2) != -correlate(a2, a1), and conjugate the second argument for complex inputs - -:: - - PyObject* - PyArray_NeighborhoodIterNew(PyArrayIterObject *x, npy_intp - *bounds, int mode, PyArrayObject*fill) - -A Neighborhood Iterator object. - -:: - - void - PyArray_SetDatetimeParseFunction(PyObject *op) - -This function is scheduled to be removed - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - void - PyArray_DatetimeToDatetimeStruct(npy_datetime val, NPY_DATETIMEUNIT - fr, npy_datetimestruct *result) - -Fill the datetime struct from the value and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - void - PyArray_TimedeltaToTimedeltaStruct(npy_timedelta val, NPY_DATETIMEUNIT - fr, npy_timedeltastruct *result) - -Fill the timedelta struct from the timedelta value and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - npy_datetime - PyArray_DatetimeStructToDatetime(NPY_DATETIMEUNIT - fr, npy_datetimestruct *d) - -Create a datetime value from a filled datetime struct and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - npy_datetime - PyArray_TimedeltaStructToTimedelta(NPY_DATETIMEUNIT - fr, npy_timedeltastruct *d) - -Create a timdelta value from a filled timedelta struct and resolution unit. - -TO BE REMOVED - NOT USED INTERNALLY. - -:: - - NpyIter * - NpyIter_New(PyArrayObject *op, npy_uint32 flags, NPY_ORDER - order, NPY_CASTING casting, PyArray_Descr*dtype) - -Allocate a new iterator for one array object. - -:: - - NpyIter * - NpyIter_MultiNew(int nop, PyArrayObject **op_in, npy_uint32 - flags, NPY_ORDER order, NPY_CASTING - casting, npy_uint32 *op_flags, PyArray_Descr - **op_request_dtypes) - -Allocate a new iterator for more than one array object, using -standard NumPy broadcasting rules and the default buffer size. 
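A one-function sketch of PyArray_Zeros as described above; the descriptor reference is stolen, so it is created inline, and the helper name is hypothetical::

    #include <Python.h>
    #include <numpy/arrayobject.h>

    static PyObject *
    zeros_2d(npy_intp rows, npy_intp cols)
    {
        npy_intp dims[2] = {rows, cols};
        /* last argument 0 selects C order; 1 would select Fortran order */
        return PyArray_Zeros(2, dims, PyArray_DescrFromType(NPY_DOUBLE), 0);
    }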
- -:: - - NpyIter * - NpyIter_AdvancedNew(int nop, PyArrayObject **op_in, npy_uint32 - flags, NPY_ORDER order, NPY_CASTING - casting, npy_uint32 *op_flags, PyArray_Descr - **op_request_dtypes, int oa_ndim, int - **op_axes, npy_intp *itershape, npy_intp - buffersize) - -Allocate a new iterator for multiple array objects, and advanced -options for controlling the broadcasting, shape, and buffer size. - -:: - - NpyIter * - NpyIter_Copy(NpyIter *iter) - -Makes a copy of the iterator - -:: - - int - NpyIter_Deallocate(NpyIter *iter) - -Deallocate an iterator - -:: - - npy_bool - NpyIter_HasDelayedBufAlloc(NpyIter *iter) - -Whether the buffer allocation is being delayed - -:: - - npy_bool - NpyIter_HasExternalLoop(NpyIter *iter) - -Whether the iterator handles the inner loop - -:: - - int - NpyIter_EnableExternalLoop(NpyIter *iter) - -Removes the inner loop handling (so HasExternalLoop returns true) - -:: - - npy_intp * - NpyIter_GetInnerStrideArray(NpyIter *iter) - -Get the array of strides for the inner loop (when HasExternalLoop is true) - -This function may be safely called without holding the Python GIL. - -:: - - npy_intp * - NpyIter_GetInnerLoopSizePtr(NpyIter *iter) - -Get a pointer to the size of the inner loop (when HasExternalLoop is true) - -This function may be safely called without holding the Python GIL. - -:: - - int - NpyIter_Reset(NpyIter *iter, char **errmsg) - -Resets the iterator to its initial state - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_ResetBasePointers(NpyIter *iter, char **baseptrs, char - **errmsg) - -Resets the iterator to its initial state, with new base data pointers. -This function requires great caution. - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_ResetToIterIndexRange(NpyIter *iter, npy_intp istart, npy_intp - iend, char **errmsg) - -Resets the iterator to a new iterator index range - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_GetNDim(NpyIter *iter) - -Gets the number of dimensions being iterated - -:: - - int - NpyIter_GetNOp(NpyIter *iter) - -Gets the number of operands being iterated - -:: - - NpyIter_IterNextFunc * - NpyIter_GetIterNext(NpyIter *iter, char **errmsg) - -Compute the specialized iteration function for an iterator - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - npy_intp - NpyIter_GetIterSize(NpyIter *iter) - -Gets the number of elements being iterated - -:: - - void - NpyIter_GetIterIndexRange(NpyIter *iter, npy_intp *istart, npy_intp - *iend) - -Gets the range of iteration indices being iterated - -:: - - npy_intp - NpyIter_GetIterIndex(NpyIter *iter) - -Gets the current iteration index - -:: - - int - NpyIter_GotoIterIndex(NpyIter *iter, npy_intp iterindex) - -Sets the iterator position to the specified iterindex, -which matches the iteration order of the iterator. - -Returns NPY_SUCCEED on success, NPY_FAIL on failure. 
- -:: - - npy_bool - NpyIter_HasMultiIndex(NpyIter *iter) - -Whether the iterator is tracking a multi-index - -:: - - int - NpyIter_GetShape(NpyIter *iter, npy_intp *outshape) - -Gets the broadcast shape if a multi-index is being tracked by the iterator, -otherwise gets the shape of the iteration as Fortran-order -(fastest-changing index first). - -The reason Fortran-order is returned when a multi-index -is not enabled is that this is providing a direct view into how -the iterator traverses the n-dimensional space. The iterator organizes -its memory from fastest index to slowest index, and when -a multi-index is enabled, it uses a permutation to recover the original -order. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - NpyIter_GetMultiIndexFunc * - NpyIter_GetGetMultiIndex(NpyIter *iter, char **errmsg) - -Compute a specialized get_multi_index function for the iterator - -If errmsg is non-NULL, it should point to a variable which will -receive the error message, and no Python exception will be set. -This is so that the function can be called from code not holding -the GIL. - -:: - - int - NpyIter_GotoMultiIndex(NpyIter *iter, npy_intp *multi_index) - -Sets the iterator to the specified multi-index, which must have the -correct number of entries for 'ndim'. It is only valid -when NPY_ITER_MULTI_INDEX was passed to the constructor. This operation -fails if the multi-index is out of bounds. - -Returns NPY_SUCCEED on success, NPY_FAIL on failure. - -:: - - int - NpyIter_RemoveMultiIndex(NpyIter *iter) - -Removes multi-index support from an iterator. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - npy_bool - NpyIter_HasIndex(NpyIter *iter) - -Whether the iterator is tracking an index - -:: - - npy_bool - NpyIter_IsBuffered(NpyIter *iter) - -Whether the iterator is buffered - -:: - - npy_bool - NpyIter_IsGrowInner(NpyIter *iter) - -Whether the inner loop can grow if buffering is unneeded - -:: - - npy_intp - NpyIter_GetBufferSize(NpyIter *iter) - -Gets the size of the buffer, or 0 if buffering is not enabled - -:: - - npy_intp * - NpyIter_GetIndexPtr(NpyIter *iter) - -Get a pointer to the index, if it is being tracked - -:: - - int - NpyIter_GotoIndex(NpyIter *iter, npy_intp flat_index) - -If the iterator is tracking an index, sets the iterator -to the specified index. - -Returns NPY_SUCCEED on success, NPY_FAIL on failure. - -:: - - char ** - NpyIter_GetDataPtrArray(NpyIter *iter) - -Get the array of data pointers (1 per object being iterated) - -This function may be safely called without holding the Python GIL. - -:: - - PyArray_Descr ** - NpyIter_GetDescrArray(NpyIter *iter) - -Get the array of data type pointers (1 per object being iterated) - -:: - - PyArrayObject ** - NpyIter_GetOperandArray(NpyIter *iter) - -Get the array of objects being iterated - -:: - - PyArrayObject * - NpyIter_GetIterView(NpyIter *iter, npy_intp i) - -Returns a view to the i-th object with the iterator's internal axes - -:: - - void - NpyIter_GetReadFlags(NpyIter *iter, char *outreadflags) - -Gets an array of read flags (1 per object being iterated) - -:: - - void - NpyIter_GetWriteFlags(NpyIter *iter, char *outwriteflags) - -Gets an array of write flags (1 per object being iterated) - -:: - - void - NpyIter_DebugPrint(NpyIter *iter) - -For debugging - -:: - - npy_bool - NpyIter_IterationNeedsAPI(NpyIter *iter) - -Whether the iteration loop, and in particular the iternext() -function, needs API access. If this is true, the GIL must -be retained while iterating. 
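A sketch of the standard single-operand iteration pattern built from the NpyIter_* calls above (read-only, external inner loop); error handling is abbreviated and the operand is assumed to already be float64::

    #include <Python.h>
    #include <numpy/arrayobject.h>

    static double
    sum_float64(PyArrayObject *arr)
    {
        NpyIter *iter = NpyIter_New(arr,
                                    NPY_ITER_READONLY | NPY_ITER_EXTERNAL_LOOP,
                                    NPY_KEEPORDER, NPY_NO_CASTING, NULL);
        if (iter == NULL) {
            return -1.0;
        }
        NpyIter_IterNextFunc *iternext = NpyIter_GetIterNext(iter, NULL);
        char **dataptr = NpyIter_GetDataPtrArray(iter);
        npy_intp *strideptr = NpyIter_GetInnerStrideArray(iter);
        npy_intp *sizeptr = NpyIter_GetInnerLoopSizePtr(iter);
        double total = 0.0;

        if (NpyIter_GetIterSize(iter) != 0) {
            do {
                char *data = dataptr[0];
                npy_intp stride = strideptr[0], count = *sizeptr;
                while (count--) {
                    total += *(double *)data;   /* operand assumed float64 */
                    data += stride;
                }
            } while (iternext(iter));
        }

        NpyIter_Deallocate(iter);
        return total;
    }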
- -:: - - void - NpyIter_GetInnerFixedStrideArray(NpyIter *iter, npy_intp *out_strides) - -Get an array of strides which are fixed. Any strides which may -change during iteration receive the value NPY_MAX_INTP. Once -the iterator is ready to iterate, call this to get the strides -which will always be fixed in the inner loop, then choose optimized -inner loop functions which take advantage of those fixed strides. - -This function may be safely called without holding the Python GIL. - -:: - - int - NpyIter_RemoveAxis(NpyIter *iter, int axis) - -Removes an axis from iteration. This requires that NPY_ITER_MULTI_INDEX -was set for iterator creation, and does not work if buffering is -enabled. This function also resets the iterator to its initial state. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - npy_intp * - NpyIter_GetAxisStrideArray(NpyIter *iter, int axis) - -Gets the array of strides for the specified axis. -If the iterator is tracking a multi-index, gets the strides -for the axis specified, otherwise gets the strides for -the iteration axis as Fortran order (fastest-changing axis first). - -Returns NULL if an error occurs. - -:: - - npy_bool - NpyIter_RequiresBuffering(NpyIter *iter) - -Whether the iteration could be done with no buffering. - -:: - - char ** - NpyIter_GetInitialDataPtrArray(NpyIter *iter) - -Get the array of data pointers (1 per object being iterated), -directly into the arrays (never pointing to a buffer), for starting -unbuffered iteration. This always returns the addresses for the -iterator position as reset to iterator index 0. - -These pointers are different from the pointers accepted by -NpyIter_ResetBasePointers, because the direction along some -axes may have been reversed, requiring base offsets. - -This function may be safely called without holding the Python GIL. - -:: - - int - NpyIter_CreateCompatibleStrides(NpyIter *iter, npy_intp - itemsize, npy_intp *outstrides) - -Builds a set of strides which are the same as the strides of an -output array created using the NPY_ITER_ALLOCATE flag, where NULL -was passed for op_axes. This is for data packed contiguously, -but not necessarily in C or Fortran order. This should be used -together with NpyIter_GetShape and NpyIter_GetNDim. - -A use case for this function is to match the shape and layout of -the iterator and tack on one or more dimensions. For example, -in order to generate a vector per input value for a numerical gradient, -you pass in ndim*itemsize for itemsize, then add another dimension to -the end with size ndim and stride itemsize. To do the Hessian matrix, -you do the same thing but add two dimensions, or take advantage of -the symmetry and pack it into 1 dimension with a particular encoding. - -This function may only be called if the iterator is tracking a multi-index -and if NPY_ITER_DONT_NEGATE_STRIDES was used to prevent an axis from -being iterated in reverse order. - -If an array is created with this method, simply adding 'itemsize' -for each iteration will traverse the new array matching the -iterator. - -Returns NPY_SUCCEED or NPY_FAIL. - -:: - - int - PyArray_CastingConverter(PyObject *obj, NPY_CASTING *casting) - -Convert any Python object, *obj*, to an NPY_CASTING enum. - -:: - - npy_intp - PyArray_CountNonzero(PyArrayObject *self) - -Counts the number of non-zero elements in the array. - -Returns -1 on error. - -:: - - PyArray_Descr * - PyArray_PromoteTypes(PyArray_Descr *type1, PyArray_Descr *type2) - -Produces the smallest size and lowest kind type to which both -input types can be cast. 
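As a hedged example of the two array-level helpers just listed (not part of the original reference), the sketch below promotes two dtypes to their common type and counts the nonzero elements of one operand. `report_types` is a hypothetical helper; `<stdio.h>` and the NumPy headers are assumed to be available::

    /* Hedged sketch: PyArray_PromoteTypes returns a new reference to the
     * smallest common descriptor; PyArray_CountNonzero returns -1 on error. */
    static int
    report_types(PyArrayObject *a, PyArrayObject *b)
    {
        PyArray_Descr *common = PyArray_PromoteTypes(PyArray_DESCR(a),
                                                     PyArray_DESCR(b));
        if (common == NULL) {
            return -1;
        }
        npy_intp nonzero = PyArray_CountNonzero(a);
        if (nonzero < 0) {
            Py_DECREF(common);
            return -1;
        }
        printf("common type num %d, %" NPY_INTP_FMT " nonzero elements in a\n",
               common->type_num, nonzero);
        Py_DECREF(common);
        return 0;
    }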
- -:: - - PyArray_Descr * - PyArray_MinScalarType(PyArrayObject *arr) - -If arr is a scalar (has 0 dimensions) with a built-in number data type, -finds the smallest type size/kind which can still represent its data. -Otherwise, returns the array's data type. - - -:: - - PyArray_Descr * - PyArray_ResultType(npy_intp narrs, PyArrayObject **arr, npy_intp - ndtypes, PyArray_Descr **dtypes) - -Produces the result type of a bunch of inputs, using the UFunc -type promotion rules. Use this function when you have a set of -input arrays, and need to determine an output array dtype. - -If all the inputs are scalars (have 0 dimensions) or the maximum "kind" -of the scalars is greater than the maximum "kind" of the arrays, does -a regular type promotion. - -Otherwise, does a type promotion on the MinScalarType -of all the inputs. Data types passed directly are treated as array -types. - - -:: - - npy_bool - PyArray_CanCastArrayTo(PyArrayObject *arr, PyArray_Descr - *to, NPY_CASTING casting) - -Returns 1 if the array object may be cast to the given data type using -the casting rule, 0 otherwise. This differs from PyArray_CanCastTo in -that it handles scalar arrays (0 dimensions) specially, by checking -their value. - -:: - - npy_bool - PyArray_CanCastTypeTo(PyArray_Descr *from, PyArray_Descr - *to, NPY_CASTING casting) - -Returns true if data of type 'from' may be cast to data of type -'to' according to the rule 'casting'. - -:: - - PyArrayObject * - PyArray_EinsteinSum(char *subscripts, npy_intp nop, PyArrayObject - **op_in, PyArray_Descr *dtype, NPY_ORDER - order, NPY_CASTING casting, PyArrayObject *out) - -This function provides summation of array elements according to -the Einstein summation convention. For example: -- trace(a) -> einsum("ii", a) -- transpose(a) -> einsum("ji", a) -- multiply(a,b) -> einsum(",", a, b) -- inner(a,b) -> einsum("i,i", a, b) -- outer(a,b) -> einsum("i,j", a, b) -- matvec(a,b) -> einsum("ij,j", a, b) -- matmat(a,b) -> einsum("ij,jk", a, b) - -subscripts: The string of subscripts for einstein summation. -nop: The number of operands -op_in: The array of operands -dtype: Either NULL, or the data type to force the calculation as. -order: The order for the calculation/the output axes. -casting: What kind of casts should be permitted. -out: Either NULL, or an array into which the output should be placed. - -By default, the labels get placed in alphabetical order -at the end of the output. So, if c = einsum("i,j", a, b) -then c[i,j] == a[i]*b[j], but if c = einsum("j,i", a, b) -then c[i,j] = a[j]*b[i]. - -Alternatively, you can control the output order or prevent -an axis from being summed/force an axis to be summed by providing -indices for the output. This allows us to turn 'trace' into -'diag', for example. -- diag(a) -> einsum("ii->i", a) -- sum(a, axis=0) -> einsum("i...->", a) - -Subscripts at the beginning and end may be specified by -putting an ellipsis "..." in the middle. For example, -the function einsum("i...i", a) takes the diagonal of -the first and last dimensions of the operand, and -einsum("ij...,jk...->ik...") takes the matrix product using -the first two indices of each operand instead of the last two. - -When there is only one operand, no axes being summed, and -no output parameter, this function returns a view -into the operand instead of making a copy. 
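The casting predicates and PyArray_EinsteinSum above can be combined as in the hedged sketch below (not part of the original reference). `matmul_via_einsum` is a hypothetical helper that uses the "ij,jk->ik" recipe from the list above; the NumPy headers and `<stdio.h>` are assumed to be included::

    /* Hedged sketch: query a cast with PyArray_CanCastTypeTo, then compute
     * a matrix product through the einsum machinery.  Returns a new
     * reference, or NULL with a Python error set. */
    static PyArrayObject *
    matmul_via_einsum(PyArrayObject *a, PyArrayObject *b)
    {
        PyArray_Descr *f64 = PyArray_DescrFromType(NPY_DOUBLE);
        PyArray_Descr *f32 = PyArray_DescrFromType(NPY_FLOAT);
        if (PyArray_CanCastTypeTo(f64, f32, NPY_SAME_KIND_CASTING)) {
            /* double -> float drops precision but stays within the 'f' kind */
            printf("float64 -> float32 is allowed under same_kind casting\n");
        }
        Py_DECREF(f64);
        Py_DECREF(f32);

        char subscripts[] = "ij,jk->ik";
        PyArrayObject *ops[2] = {a, b};
        return PyArray_EinsteinSum(subscripts, 2, ops, NULL,
                                   NPY_KEEPORDER, NPY_SAFE_CASTING, NULL);
    }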
- -:: - - PyObject * - PyArray_NewLikeArray(PyArrayObject *prototype, NPY_ORDER - order, PyArray_Descr *dtype, int subok) - -Creates a new array with the same shape as the provided one, -with possible memory layout order and data type changes. - -prototype - The array the new one should be like. -order - NPY_CORDER - C-contiguous result. -NPY_FORTRANORDER - Fortran-contiguous result. -NPY_ANYORDER - Fortran if prototype is Fortran, C otherwise. -NPY_KEEPORDER - Keeps the axis ordering of prototype. -dtype - If not NULL, overrides the data type of the result. -subok - If 1, use the prototype's array subtype, otherwise -always create a base-class array. - -NOTE: If dtype is not NULL, steals the dtype reference. - -:: - - int - PyArray_GetArrayParamsFromObject(PyObject *op, PyArray_Descr - *requested_dtype, npy_bool - writeable, PyArray_Descr - **out_dtype, int *out_ndim, npy_intp - *out_dims, PyArrayObject - **out_arr, PyObject *context) - -Retrieves the array parameters for viewing/converting an arbitrary -PyObject* to a NumPy array. This allows the "innate type and shape" -of Python list-of-lists to be discovered without -actually converting to an array. - -In some cases, such as structured arrays and the __array__ interface, -a data type needs to be used to make sense of the object. When -this is needed, provide a Descr for 'requested_dtype', otherwise -provide NULL. This reference is not stolen. Also, if the requested -dtype doesn't modify the interpretation of the input, out_dtype will -still get the "innate" dtype of the object, not the dtype passed -in 'requested_dtype'. - -If writing to the value in 'op' is desired, set the boolean -'writeable' to 1. This raises an error when 'op' is a scalar, list -of lists, or other non-writeable 'op'. - -Result: When success (0 return value) is returned, either out_arr -is filled with a non-NULL PyArrayObject and -the rest of the parameters are untouched, or out_arr is -filled with NULL, and the rest of the parameters are -filled. - -Typical usage: - -PyArrayObject *arr = NULL; -PyArray_Descr *dtype = NULL; -int ndim = 0; -npy_intp dims[NPY_MAXDIMS]; - -if (PyArray_GetArrayParamsFromObject(op, NULL, 1, &dtype, -&ndim, &dims, &arr, NULL) < 0) { -return NULL; -} -if (arr == NULL) { -... validate/change dtype, validate flags, ndim, etc ... -// Could make custom strides here too -arr = PyArray_NewFromDescr(&PyArray_Type, dtype, ndim, -dims, NULL, -is_f_order ? NPY_ARRAY_F_CONTIGUOUS : 0, -NULL); -if (arr == NULL) { -return NULL; -} -if (PyArray_CopyObject(arr, op) < 0) { -Py_DECREF(arr); -return NULL; -} -} -else { -... in this case the other parameters weren't filled, just -validate and possibly copy arr itself ... -} -... use arr ... - -:: - - int - PyArray_ConvertClipmodeSequence(PyObject *object, NPY_CLIPMODE - *modes, int n) - -Convert an object to an array of n NPY_CLIPMODE values. -This is intended to be used in functions where a different mode -could be applied to each axis, like in ravel_multi_index. - -:: - - PyObject * - PyArray_MatrixProduct2(PyObject *op1, PyObject - *op2, PyArrayObject*out) - -Numeric.matrixproduct(a,v,out) -just like inner product but does the swapaxes stuff on the fly - -:: - - npy_bool - NpyIter_IsFirstVisit(NpyIter *iter, int iop) - -Checks to see whether this is the first time the elements -of the specified reduction operand which the iterator points at are -being seen for the first time. The function returns -a reasonable answer for reduction operands and when buffering is -disabled. 
The answer may be incorrect for buffered non-reduction -operands. - -This function is intended to be used in EXTERNAL_LOOP mode only, -and will produce some wrong answers when that mode is not enabled. - -If this function returns true, the caller should also -check the inner loop stride of the operand, because if -that stride is 0, then only the first element of the innermost -external loop is being visited for the first time. - -WARNING: For performance reasons, 'iop' is not bounds-checked, -it is not confirmed that 'iop' is actually a reduction -operand, and it is not confirmed that EXTERNAL_LOOP -mode is enabled. These checks are the responsibility of -the caller, and should be done outside of any inner loops. - -:: - - int - PyArray_SetBaseObject(PyArrayObject *arr, PyObject *obj) - -Sets the 'base' attribute of the array. This steals a reference -to 'obj'. - -Returns 0 on success, -1 on failure. - -:: - - void - PyArray_CreateSortedStridePerm(int ndim, npy_intp - *strides, npy_stride_sort_item - *out_strideperm) - - -This function populates the first ndim elements -of strideperm with sorted descending by their absolute values. -For example, the stride array (4, -2, 12) becomes -[(2, 12), (0, 4), (1, -2)]. - -:: - - void - PyArray_RemoveAxesInPlace(PyArrayObject *arr, npy_bool *flags) - - -Removes the axes flagged as True from the array, -modifying it in place. If an axis flagged for removal -has a shape entry bigger than one, this effectively selects -index zero for that axis. - -WARNING: If an axis flagged for removal has a shape equal to zero, -the array will point to invalid memory. The caller must -validate this! - -For example, this can be used to remove the reduction axes -from a reduction result once its computation is complete. - -:: - - void - PyArray_DebugPrint(PyArrayObject *obj) - -Prints the raw data of the ndarray in a form useful for debugging -low-level C issues. - -:: - - int - PyArray_FailUnlessWriteable(PyArrayObject *obj, const char *name) - - -This function does nothing if obj is writeable, and raises an exception -(and returns -1) if obj is not writeable. It may also do other -house-keeping, such as issuing warnings on arrays which are transitioning -to become views. Always call this function at some point before writing to -an array. - -'name' is a name for the array, used to give better error -messages. Something like "assignment destination", "output array", or even -just "array". - -:: - - int - PyArray_SetUpdateIfCopyBase(PyArrayObject *arr, PyArrayObject *base) - - -Precondition: 'arr' is a copy of 'base' (though possibly with different -strides, ordering, etc.). This function sets the UPDATEIFCOPY flag and the -->base pointer on 'arr', so that when 'arr' is destructed, it will copy any -changes back to 'base'. - -Steals a reference to 'base'. - -Returns 0 on success, -1 on failure. - -:: - - void * - PyDataMem_NEW(size_t size) - -Allocates memory for array data. - -:: - - void - PyDataMem_FREE(void *ptr) - -Free memory for array data. - -:: - - void * - PyDataMem_RENEW(void *ptr, size_t size) - -Reallocate/resize memory for array data. - -:: - - PyDataMem_EventHookFunc * - PyDataMem_SetEventHook(PyDataMem_EventHookFunc *newhook, void - *user_data, void **old_data) - -Sets the allocation event hook for numpy array data. -Takes a PyDataMem_EventHookFunc *, which has the signature: -void hook(void *old, void *new, size_t size, void *user_data). -Also takes a void *user_data, and void **old_data. - -Returns a pointer to the previous hook or NULL. 
If old_data is -non-NULL, the previous user_data pointer will be copied to it. - -If not NULL, hook will be called at the end of each PyDataMem_NEW/FREE/RENEW: -result = PyDataMem_NEW(size) -> (*hook)(NULL, result, size, user_data) -PyDataMem_FREE(ptr) -> (*hook)(ptr, NULL, 0, user_data) -result = PyDataMem_RENEW(ptr, size) -> (*hook)(ptr, result, size, user_data) - -When the hook is called, the GIL will be held by the calling -thread. The hook should be written to be reentrant, if it performs -operations that might cause new allocation events (such as the -creation/descruction numpy objects, or creating/destroying Python -objects which might cause a gc) - diff --git a/include/numpy/ndarrayobject.h b/include/numpy/ndarrayobject.h deleted file mode 100644 index f00dd77..0000000 --- a/include/numpy/ndarrayobject.h +++ /dev/null @@ -1,244 +0,0 @@ -/* - * DON'T INCLUDE THIS DIRECTLY. - */ - -#ifndef NPY_NDARRAYOBJECT_H -#define NPY_NDARRAYOBJECT_H -#ifdef __cplusplus -#define CONFUSE_EMACS { -#define CONFUSE_EMACS2 } -extern "C" CONFUSE_EMACS -#undef CONFUSE_EMACS -#undef CONFUSE_EMACS2 -/* ... otherwise a semi-smart identer (like emacs) tries to indent - everything when you're typing */ -#endif - -#include "ndarraytypes.h" - -/* Includes the "function" C-API -- these are all stored in a - list of pointers --- one for each file - The two lists are concatenated into one in multiarray. - - They are available as import_array() -*/ - -#include "__multiarray_api.h" - - -/* C-API that requries previous API to be defined */ - -#define PyArray_DescrCheck(op) (((PyObject*)(op))->ob_type==&PyArrayDescr_Type) - -#define PyArray_Check(op) PyObject_TypeCheck(op, &PyArray_Type) -#define PyArray_CheckExact(op) (((PyObject*)(op))->ob_type == &PyArray_Type) - -#define PyArray_HasArrayInterfaceType(op, type, context, out) \ - ((((out)=PyArray_FromStructInterface(op)) != Py_NotImplemented) || \ - (((out)=PyArray_FromInterface(op)) != Py_NotImplemented) || \ - (((out)=PyArray_FromArrayAttr(op, type, context)) != \ - Py_NotImplemented)) - -#define PyArray_HasArrayInterface(op, out) \ - PyArray_HasArrayInterfaceType(op, NULL, NULL, out) - -#define PyArray_IsZeroDim(op) (PyArray_Check(op) && \ - (PyArray_NDIM((PyArrayObject *)op) == 0)) - -#define PyArray_IsScalar(obj, cls) \ - (PyObject_TypeCheck(obj, &Py##cls##ArrType_Type)) - -#define PyArray_CheckScalar(m) (PyArray_IsScalar(m, Generic) || \ - PyArray_IsZeroDim(m)) - -#define PyArray_IsPythonNumber(obj) \ - (PyInt_Check(obj) || PyFloat_Check(obj) || PyComplex_Check(obj) || \ - PyLong_Check(obj) || PyBool_Check(obj)) - -#define PyArray_IsPythonScalar(obj) \ - (PyArray_IsPythonNumber(obj) || PyString_Check(obj) || \ - PyUnicode_Check(obj)) - -#define PyArray_IsAnyScalar(obj) \ - (PyArray_IsScalar(obj, Generic) || PyArray_IsPythonScalar(obj)) - -#define PyArray_CheckAnyScalar(obj) (PyArray_IsPythonScalar(obj) || \ - PyArray_CheckScalar(obj)) - -#define PyArray_IsIntegerScalar(obj) (PyInt_Check(obj) \ - || PyLong_Check(obj) \ - || PyArray_IsScalar((obj), Integer)) - - -#define PyArray_GETCONTIGUOUS(m) (PyArray_ISCONTIGUOUS(m) ? 
\ - Py_INCREF(m), (m) : \ - (PyArrayObject *)(PyArray_Copy(m))) - -#define PyArray_SAMESHAPE(a1,a2) ((PyArray_NDIM(a1) == PyArray_NDIM(a2)) && \ - PyArray_CompareLists(PyArray_DIMS(a1), \ - PyArray_DIMS(a2), \ - PyArray_NDIM(a1))) - -#define PyArray_SIZE(m) PyArray_MultiplyList(PyArray_DIMS(m), PyArray_NDIM(m)) -#define PyArray_NBYTES(m) (PyArray_ITEMSIZE(m) * PyArray_SIZE(m)) -#define PyArray_FROM_O(m) PyArray_FromAny(m, NULL, 0, 0, 0, NULL) - -#define PyArray_FROM_OF(m,flags) PyArray_CheckFromAny(m, NULL, 0, 0, flags, \ - NULL) - -#define PyArray_FROM_OT(m,type) PyArray_FromAny(m, \ - PyArray_DescrFromType(type), 0, 0, 0, NULL); - -#define PyArray_FROM_OTF(m, type, flags) \ - PyArray_FromAny(m, PyArray_DescrFromType(type), 0, 0, \ - (((flags) & NPY_ARRAY_ENSURECOPY) ? \ - ((flags) | NPY_ARRAY_DEFAULT) : (flags)), NULL) - -#define PyArray_FROMANY(m, type, min, max, flags) \ - PyArray_FromAny(m, PyArray_DescrFromType(type), min, max, \ - (((flags) & NPY_ARRAY_ENSURECOPY) ? \ - (flags) | NPY_ARRAY_DEFAULT : (flags)), NULL) - -#define PyArray_ZEROS(m, dims, type, is_f_order) \ - PyArray_Zeros(m, dims, PyArray_DescrFromType(type), is_f_order) - -#define PyArray_EMPTY(m, dims, type, is_f_order) \ - PyArray_Empty(m, dims, PyArray_DescrFromType(type), is_f_order) - -#define PyArray_FILLWBYTE(obj, val) memset(PyArray_DATA(obj), val, \ - PyArray_NBYTES(obj)) - -#define PyArray_REFCOUNT(obj) (((PyObject *)(obj))->ob_refcnt) -#define NPY_REFCOUNT PyArray_REFCOUNT -#define NPY_MAX_ELSIZE (2 * NPY_SIZEOF_LONGDOUBLE) - -#define PyArray_ContiguousFromAny(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_DEFAULT, NULL) - -#define PyArray_EquivArrTypes(a1, a2) \ - PyArray_EquivTypes(PyArray_DESCR(a1), PyArray_DESCR(a2)) - -#define PyArray_EquivByteorders(b1, b2) \ - (((b1) == (b2)) || (PyArray_ISNBO(b1) == PyArray_ISNBO(b2))) - -#define PyArray_SimpleNew(nd, dims, typenum) \ - PyArray_New(&PyArray_Type, nd, dims, typenum, NULL, NULL, 0, 0, NULL) - -#define PyArray_SimpleNewFromData(nd, dims, typenum, data) \ - PyArray_New(&PyArray_Type, nd, dims, typenum, NULL, \ - data, 0, NPY_ARRAY_CARRAY, NULL) - -#define PyArray_SimpleNewFromDescr(nd, dims, descr) \ - PyArray_NewFromDescr(&PyArray_Type, descr, nd, dims, \ - NULL, NULL, 0, NULL) - -#define PyArray_ToScalar(data, arr) \ - PyArray_Scalar(data, PyArray_DESCR(arr), (PyObject *)arr) - - -/* These might be faster without the dereferencing of obj - going on inside -- of course an optimizing compiler should - inline the constants inside a for loop making it a moot point -*/ - -#define PyArray_GETPTR1(obj, i) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0])) - -#define PyArray_GETPTR2(obj, i, j) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0] + \ - (j)*PyArray_STRIDES(obj)[1])) - -#define PyArray_GETPTR3(obj, i, j, k) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0] + \ - (j)*PyArray_STRIDES(obj)[1] + \ - (k)*PyArray_STRIDES(obj)[2])) - -#define PyArray_GETPTR4(obj, i, j, k, l) ((void *)(PyArray_BYTES(obj) + \ - (i)*PyArray_STRIDES(obj)[0] + \ - (j)*PyArray_STRIDES(obj)[1] + \ - (k)*PyArray_STRIDES(obj)[2] + \ - (l)*PyArray_STRIDES(obj)[3])) - -static NPY_INLINE void -PyArray_XDECREF_ERR(PyArrayObject *arr) -{ - if (arr != NULL) { - if (PyArray_FLAGS(arr) & NPY_ARRAY_UPDATEIFCOPY) { - PyArrayObject *base = (PyArrayObject *)PyArray_BASE(arr); - PyArray_ENABLEFLAGS(base, NPY_ARRAY_WRITEABLE); - PyArray_CLEARFLAGS(arr, NPY_ARRAY_UPDATEIFCOPY); - } - 
Py_DECREF(arr); - } -} - -#define PyArray_DESCR_REPLACE(descr) do { \ - PyArray_Descr *_new_; \ - _new_ = PyArray_DescrNew(descr); \ - Py_XDECREF(descr); \ - descr = _new_; \ - } while(0) - -/* Copy should always return contiguous array */ -#define PyArray_Copy(obj) PyArray_NewCopy(obj, NPY_CORDER) - -#define PyArray_FromObject(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_BEHAVED | \ - NPY_ARRAY_ENSUREARRAY, NULL) - -#define PyArray_ContiguousFromObject(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_DEFAULT | \ - NPY_ARRAY_ENSUREARRAY, NULL) - -#define PyArray_CopyFromObject(op, type, min_depth, max_depth) \ - PyArray_FromAny(op, PyArray_DescrFromType(type), min_depth, \ - max_depth, NPY_ARRAY_ENSURECOPY | \ - NPY_ARRAY_DEFAULT | \ - NPY_ARRAY_ENSUREARRAY, NULL) - -#define PyArray_Cast(mp, type_num) \ - PyArray_CastToType(mp, PyArray_DescrFromType(type_num), 0) - -#define PyArray_Take(ap, items, axis) \ - PyArray_TakeFrom(ap, items, axis, NULL, NPY_RAISE) - -#define PyArray_Put(ap, items, values) \ - PyArray_PutTo(ap, items, values, NPY_RAISE) - -/* Compatibility with old Numeric stuff -- don't use in new code */ - -#define PyArray_FromDimsAndData(nd, d, type, data) \ - PyArray_FromDimsAndDataAndDescr(nd, d, PyArray_DescrFromType(type), \ - data) - - -/* - Check to see if this key in the dictionary is the "title" - entry of the tuple (i.e. a duplicate dictionary entry in the fields - dict. -*/ - -#define NPY_TITLE_KEY(key, value) ((PyTuple_GET_SIZE((value))==3) && \ - (PyTuple_GET_ITEM((value), 2) == (key))) - - -/* Define python version independent deprecation macro */ - -#if PY_VERSION_HEX >= 0x02050000 -#define DEPRECATE(msg) PyErr_WarnEx(PyExc_DeprecationWarning,msg,1) -#define DEPRECATE_FUTUREWARNING(msg) PyErr_WarnEx(PyExc_FutureWarning,msg,1) -#else -#define DEPRECATE(msg) PyErr_Warn(PyExc_DeprecationWarning,msg) -#define DEPRECATE_FUTUREWARNING(msg) PyErr_Warn(PyExc_FutureWarning,msg) -#endif - - -#ifdef __cplusplus -} -#endif - - -#endif /* NPY_NDARRAYOBJECT_H */ diff --git a/include/numpy/ndarraytypes.h b/include/numpy/ndarraytypes.h deleted file mode 100644 index 04d037e..0000000 --- a/include/numpy/ndarraytypes.h +++ /dev/null @@ -1,1731 +0,0 @@ -#ifndef NDARRAYTYPES_H -#define NDARRAYTYPES_H - -/* numpyconfig.h is auto-generated by the installer */ -#include "numpyconfig.h" - -#include "npy_common.h" -#include "npy_endian.h" -#include "npy_cpu.h" -#include "utils.h" - -#ifdef NPY_ENABLE_SEPARATE_COMPILATION - #define NPY_NO_EXPORT NPY_VISIBILITY_HIDDEN -#else - #define NPY_NO_EXPORT static -#endif - -/* Only use thread if configured in config and python supports it */ -#if defined WITH_THREAD && !NPY_NO_SMP - #define NPY_ALLOW_THREADS 1 -#else - #define NPY_ALLOW_THREADS 0 -#endif - - - -/* - * There are several places in the code where an array of dimensions - * is allocated statically. This is the size of that static - * allocation. - * - * The array creation itself could have arbitrary dimensions but all - * the places where static allocation is used would need to be changed - * to dynamic (including inside of several structures) - */ - -#define NPY_MAXDIMS 32 -#define NPY_MAXARGS 32 - -/* Used for Converter Functions "O&" code in ParseTuple */ -#define NPY_FAIL 0 -#define NPY_SUCCEED 1 - -/* - * Binary compatibility version number. 
This number is increased - * whenever the C-API is changed such that binary compatibility is - * broken, i.e. whenever a recompile of extension modules is needed. - */ -#define NPY_VERSION NPY_ABI_VERSION - -/* - * Minor API version. This number is increased whenever a change is - * made to the C-API -- whether it breaks binary compatibility or not. - * Some changes, such as adding a function pointer to the end of the - * function table, can be made without breaking binary compatibility. - * In this case, only the NPY_FEATURE_VERSION (*not* NPY_VERSION) - * would be increased. Whenever binary compatibility is broken, both - * NPY_VERSION and NPY_FEATURE_VERSION should be increased. - */ -#define NPY_FEATURE_VERSION NPY_API_VERSION - -enum NPY_TYPES { NPY_BOOL=0, - NPY_BYTE, NPY_UBYTE, - NPY_SHORT, NPY_USHORT, - NPY_INT, NPY_UINT, - NPY_LONG, NPY_ULONG, - NPY_LONGLONG, NPY_ULONGLONG, - NPY_FLOAT, NPY_DOUBLE, NPY_LONGDOUBLE, - NPY_CFLOAT, NPY_CDOUBLE, NPY_CLONGDOUBLE, - NPY_OBJECT=17, - NPY_STRING, NPY_UNICODE, - NPY_VOID, - /* - * New 1.6 types appended, may be integrated - * into the above in 2.0. - */ - NPY_DATETIME, NPY_TIMEDELTA, NPY_HALF, - - NPY_NTYPES, - NPY_NOTYPE, - NPY_CHAR, /* special flag */ - NPY_USERDEF=256, /* leave room for characters */ - - /* The number of types not including the new 1.6 types */ - NPY_NTYPES_ABI_COMPATIBLE=21 -}; - -/* basetype array priority */ -#define NPY_PRIORITY 0.0 - -/* default subtype priority */ -#define NPY_SUBTYPE_PRIORITY 1.0 - -/* default scalar priority */ -#define NPY_SCALAR_PRIORITY -1000000.0 - -/* How many floating point types are there (excluding half) */ -#define NPY_NUM_FLOATTYPE 3 - -/* - * These characters correspond to the array type and the struct - * module - */ - -enum NPY_TYPECHAR { - NPY_BOOLLTR = '?', - NPY_BYTELTR = 'b', - NPY_UBYTELTR = 'B', - NPY_SHORTLTR = 'h', - NPY_USHORTLTR = 'H', - NPY_INTLTR = 'i', - NPY_UINTLTR = 'I', - NPY_LONGLTR = 'l', - NPY_ULONGLTR = 'L', - NPY_LONGLONGLTR = 'q', - NPY_ULONGLONGLTR = 'Q', - NPY_HALFLTR = 'e', - NPY_FLOATLTR = 'f', - NPY_DOUBLELTR = 'd', - NPY_LONGDOUBLELTR = 'g', - NPY_CFLOATLTR = 'F', - NPY_CDOUBLELTR = 'D', - NPY_CLONGDOUBLELTR = 'G', - NPY_OBJECTLTR = 'O', - NPY_STRINGLTR = 'S', - NPY_STRINGLTR2 = 'a', - NPY_UNICODELTR = 'U', - NPY_VOIDLTR = 'V', - NPY_DATETIMELTR = 'M', - NPY_TIMEDELTALTR = 'm', - NPY_CHARLTR = 'c', - - /* - * No Descriptor, just a define -- this let's - * Python users specify an array of integers - * large enough to hold a pointer on the - * platform - */ - NPY_INTPLTR = 'p', - NPY_UINTPLTR = 'P', - - /* - * These are for dtype 'kinds', not dtype 'typecodes' - * as the above are for. 
- */ - NPY_GENBOOLLTR ='b', - NPY_SIGNEDLTR = 'i', - NPY_UNSIGNEDLTR = 'u', - NPY_FLOATINGLTR = 'f', - NPY_COMPLEXLTR = 'c' -}; - -typedef enum { - NPY_QUICKSORT=0, - NPY_HEAPSORT=1, - NPY_MERGESORT=2 -} NPY_SORTKIND; -#define NPY_NSORTS (NPY_MERGESORT + 1) - - -typedef enum { - NPY_SEARCHLEFT=0, - NPY_SEARCHRIGHT=1 -} NPY_SEARCHSIDE; -#define NPY_NSEARCHSIDES (NPY_SEARCHRIGHT + 1) - - -typedef enum { - NPY_NOSCALAR=-1, - NPY_BOOL_SCALAR, - NPY_INTPOS_SCALAR, - NPY_INTNEG_SCALAR, - NPY_FLOAT_SCALAR, - NPY_COMPLEX_SCALAR, - NPY_OBJECT_SCALAR -} NPY_SCALARKIND; -#define NPY_NSCALARKINDS (NPY_OBJECT_SCALAR + 1) - -/* For specifying array memory layout or iteration order */ -typedef enum { - /* Fortran order if inputs are all Fortran, C otherwise */ - NPY_ANYORDER=-1, - /* C order */ - NPY_CORDER=0, - /* Fortran order */ - NPY_FORTRANORDER=1, - /* An order as close to the inputs as possible */ - NPY_KEEPORDER=2 -} NPY_ORDER; - -/* For specifying allowed casting in operations which support it */ -typedef enum { - /* Only allow identical types */ - NPY_NO_CASTING=0, - /* Allow identical and byte swapped types */ - NPY_EQUIV_CASTING=1, - /* Only allow safe casts */ - NPY_SAFE_CASTING=2, - /* Allow safe casts or casts within the same kind */ - NPY_SAME_KIND_CASTING=3, - /* Allow any casts */ - NPY_UNSAFE_CASTING=4, - - /* - * Temporary internal definition only, will be removed in upcoming - * release, see below - * */ - NPY_INTERNAL_UNSAFE_CASTING_BUT_WARN_UNLESS_SAME_KIND = 100, -} NPY_CASTING; - -typedef enum { - NPY_CLIP=0, - NPY_WRAP=1, - NPY_RAISE=2 -} NPY_CLIPMODE; - -/* The special not-a-time (NaT) value */ -#define NPY_DATETIME_NAT NPY_MIN_INT64 - -/* - * Upper bound on the length of a DATETIME ISO 8601 string - * YEAR: 21 (64-bit year) - * MONTH: 3 - * DAY: 3 - * HOURS: 3 - * MINUTES: 3 - * SECONDS: 3 - * ATTOSECONDS: 1 + 3*6 - * TIMEZONE: 5 - * NULL TERMINATOR: 1 - */ -#define NPY_DATETIME_MAX_ISO8601_STRLEN (21+3*5+1+3*6+6+1) - -typedef enum { - NPY_FR_Y = 0, /* Years */ - NPY_FR_M = 1, /* Months */ - NPY_FR_W = 2, /* Weeks */ - /* Gap where 1.6 NPY_FR_B (value 3) was */ - NPY_FR_D = 4, /* Days */ - NPY_FR_h = 5, /* hours */ - NPY_FR_m = 6, /* minutes */ - NPY_FR_s = 7, /* seconds */ - NPY_FR_ms = 8, /* milliseconds */ - NPY_FR_us = 9, /* microseconds */ - NPY_FR_ns = 10,/* nanoseconds */ - NPY_FR_ps = 11,/* picoseconds */ - NPY_FR_fs = 12,/* femtoseconds */ - NPY_FR_as = 13,/* attoseconds */ - NPY_FR_GENERIC = 14 /* Generic, unbound units, can convert to anything */ -} NPY_DATETIMEUNIT; - -/* - * NOTE: With the NPY_FR_B gap for 1.6 ABI compatibility, NPY_DATETIME_NUMUNITS - * is technically one more than the actual number of units. - */ -#define NPY_DATETIME_NUMUNITS (NPY_FR_GENERIC + 1) -#define NPY_DATETIME_DEFAULTUNIT NPY_FR_GENERIC - -/* - * Business day conventions for mapping invalid business - * days to valid business days. - */ -typedef enum { - /* Go forward in time to the following business day. */ - NPY_BUSDAY_FORWARD, - NPY_BUSDAY_FOLLOWING = NPY_BUSDAY_FORWARD, - /* Go backward in time to the preceding business day. */ - NPY_BUSDAY_BACKWARD, - NPY_BUSDAY_PRECEDING = NPY_BUSDAY_BACKWARD, - /* - * Go forward in time to the following business day, unless it - * crosses a month boundary, in which case go backward - */ - NPY_BUSDAY_MODIFIEDFOLLOWING, - /* - * Go backward in time to the preceding business day, unless it - * crosses a month boundary, in which case go forward. - */ - NPY_BUSDAY_MODIFIEDPRECEDING, - /* Produce a NaT for non-business days. 
*/ - NPY_BUSDAY_NAT, - /* Raise an exception for non-business days. */ - NPY_BUSDAY_RAISE -} NPY_BUSDAY_ROLL; - -/************************************************************ - * NumPy Auxiliary Data for inner loops, sort functions, etc. - ************************************************************/ - -/* - * When creating an auxiliary data struct, this should always appear - * as the first member, like this: - * - * typedef struct { - * NpyAuxData base; - * double constant; - * } constant_multiplier_aux_data; - */ -typedef struct NpyAuxData_tag NpyAuxData; - -/* Function pointers for freeing or cloning auxiliary data */ -typedef void (NpyAuxData_FreeFunc) (NpyAuxData *); -typedef NpyAuxData *(NpyAuxData_CloneFunc) (NpyAuxData *); - -struct NpyAuxData_tag { - NpyAuxData_FreeFunc *free; - NpyAuxData_CloneFunc *clone; - /* To allow for a bit of expansion without breaking the ABI */ - void *reserved[2]; -}; - -/* Macros to use for freeing and cloning auxiliary data */ -#define NPY_AUXDATA_FREE(auxdata) \ - do { \ - if ((auxdata) != NULL) { \ - (auxdata)->free(auxdata); \ - } \ - } while(0) -#define NPY_AUXDATA_CLONE(auxdata) \ - ((auxdata)->clone(auxdata)) - -#define NPY_ERR(str) fprintf(stderr, #str); fflush(stderr); -#define NPY_ERR2(str) fprintf(stderr, str); fflush(stderr); - -#define NPY_STRINGIFY(x) #x -#define NPY_TOSTRING(x) NPY_STRINGIFY(x) - - /* - * Macros to define how array, and dimension/strides data is - * allocated. - */ - - /* Data buffer - PyDataMem_NEW/FREE/RENEW are in multiarraymodule.c */ - -#define NPY_USE_PYMEM 1 - -#if NPY_USE_PYMEM == 1 -#define PyArray_malloc PyMem_Malloc -#define PyArray_free PyMem_Free -#define PyArray_realloc PyMem_Realloc -#else -#define PyArray_malloc malloc -#define PyArray_free free -#define PyArray_realloc realloc -#endif - -/* Dimensions and strides */ -#define PyDimMem_NEW(size) \ - ((npy_intp *)PyArray_malloc(size*sizeof(npy_intp))) - -#define PyDimMem_FREE(ptr) PyArray_free(ptr) - -#define PyDimMem_RENEW(ptr,size) \ - ((npy_intp *)PyArray_realloc(ptr,size*sizeof(npy_intp))) - -/* forward declaration */ -struct _PyArray_Descr; - -/* These must deal with unaligned and swapped data if necessary */ -typedef PyObject * (PyArray_GetItemFunc) (void *, void *); -typedef int (PyArray_SetItemFunc)(PyObject *, void *, void *); - -typedef void (PyArray_CopySwapNFunc)(void *, npy_intp, void *, npy_intp, - npy_intp, int, void *); - -typedef void (PyArray_CopySwapFunc)(void *, void *, int, void *); -typedef npy_bool (PyArray_NonzeroFunc)(void *, void *); - - -/* - * These assume aligned and notswapped data -- a buffer will be used - * before or contiguous data will be obtained - */ - -typedef int (PyArray_CompareFunc)(const void *, const void *, void *); -typedef int (PyArray_ArgFunc)(void*, npy_intp, npy_intp*, void *); - -typedef void (PyArray_DotFunc)(void *, npy_intp, void *, npy_intp, void *, - npy_intp, void *); - -typedef void (PyArray_VectorUnaryFunc)(void *, void *, npy_intp, void *, - void *); - -/* - * XXX the ignore argument should be removed next time the API version - * is bumped. It used to be the separator. 
- */ -typedef int (PyArray_ScanFunc)(FILE *fp, void *dptr, - char *ignore, struct _PyArray_Descr *); -typedef int (PyArray_FromStrFunc)(char *s, void *dptr, char **endptr, - struct _PyArray_Descr *); - -typedef int (PyArray_FillFunc)(void *, npy_intp, void *); - -typedef int (PyArray_SortFunc)(void *, npy_intp, void *); -typedef int (PyArray_ArgSortFunc)(void *, npy_intp *, npy_intp, void *); - -typedef int (PyArray_FillWithScalarFunc)(void *, npy_intp, void *, void *); - -typedef int (PyArray_ScalarKindFunc)(void *); - -typedef void (PyArray_FastClipFunc)(void *in, npy_intp n_in, void *min, - void *max, void *out); -typedef void (PyArray_FastPutmaskFunc)(void *in, void *mask, npy_intp n_in, - void *values, npy_intp nv); -typedef int (PyArray_FastTakeFunc)(void *dest, void *src, npy_intp *indarray, - npy_intp nindarray, npy_intp n_outer, - npy_intp m_middle, npy_intp nelem, - NPY_CLIPMODE clipmode); - -typedef struct { - npy_intp *ptr; - int len; -} PyArray_Dims; - -typedef struct { - /* - * Functions to cast to most other standard types - * Can have some NULL entries. The types - * DATETIME, TIMEDELTA, and HALF go into the castdict - * even though they are built-in. - */ - PyArray_VectorUnaryFunc *cast[NPY_NTYPES_ABI_COMPATIBLE]; - - /* The next four functions *cannot* be NULL */ - - /* - * Functions to get and set items with standard Python types - * -- not array scalars - */ - PyArray_GetItemFunc *getitem; - PyArray_SetItemFunc *setitem; - - /* - * Copy and/or swap data. Memory areas may not overlap - * Use memmove first if they might - */ - PyArray_CopySwapNFunc *copyswapn; - PyArray_CopySwapFunc *copyswap; - - /* - * Function to compare items - * Can be NULL - */ - PyArray_CompareFunc *compare; - - /* - * Function to select largest - * Can be NULL - */ - PyArray_ArgFunc *argmax; - - /* - * Function to compute dot product - * Can be NULL - */ - PyArray_DotFunc *dotfunc; - - /* - * Function to scan an ASCII file and - * place a single value plus possible separator - * Can be NULL - */ - PyArray_ScanFunc *scanfunc; - - /* - * Function to read a single value from a string - * and adjust the pointer; Can be NULL - */ - PyArray_FromStrFunc *fromstr; - - /* - * Function to determine if data is zero or not - * If NULL a default version is - * used at Registration time. - */ - PyArray_NonzeroFunc *nonzero; - - /* - * Used for arange. - * Can be NULL. - */ - PyArray_FillFunc *fill; - - /* - * Function to fill arrays with scalar values - * Can be NULL - */ - PyArray_FillWithScalarFunc *fillwithscalar; - - /* - * Sorting functions - * Can be NULL - */ - PyArray_SortFunc *sort[NPY_NSORTS]; - PyArray_ArgSortFunc *argsort[NPY_NSORTS]; - - /* - * Dictionary of additional casting functions - * PyArray_VectorUnaryFuncs - * which can be populated to support casting - * to other registered types. Can be NULL - */ - PyObject *castdict; - - /* - * Functions useful for generalizing - * the casting rules. - * Can be NULL; - */ - PyArray_ScalarKindFunc *scalarkind; - int **cancastscalarkindto; - int *cancastto; - - PyArray_FastClipFunc *fastclip; - PyArray_FastPutmaskFunc *fastputmask; - PyArray_FastTakeFunc *fasttake; - - /* - * Function to select smallest - * Can be NULL - */ - PyArray_ArgFunc *argmin; - -} PyArray_ArrFuncs; - -/* The item must be reference counted when it is inserted or extracted. 
*/ -#define NPY_ITEM_REFCOUNT 0x01 -/* Same as needing REFCOUNT */ -#define NPY_ITEM_HASOBJECT 0x01 -/* Convert to list for pickling */ -#define NPY_LIST_PICKLE 0x02 -/* The item is a POINTER */ -#define NPY_ITEM_IS_POINTER 0x04 -/* memory needs to be initialized for this data-type */ -#define NPY_NEEDS_INIT 0x08 -/* operations need Python C-API so don't give-up thread. */ -#define NPY_NEEDS_PYAPI 0x10 -/* Use f.getitem when extracting elements of this data-type */ -#define NPY_USE_GETITEM 0x20 -/* Use f.setitem when setting creating 0-d array from this data-type.*/ -#define NPY_USE_SETITEM 0x40 -/* A sticky flag specifically for structured arrays */ -#define NPY_ALIGNED_STRUCT 0x80 - -/* - *These are inherited for global data-type if any data-types in the - * field have them - */ -#define NPY_FROM_FIELDS (NPY_NEEDS_INIT | NPY_LIST_PICKLE | \ - NPY_ITEM_REFCOUNT | NPY_NEEDS_PYAPI) - -#define NPY_OBJECT_DTYPE_FLAGS (NPY_LIST_PICKLE | NPY_USE_GETITEM | \ - NPY_ITEM_IS_POINTER | NPY_ITEM_REFCOUNT | \ - NPY_NEEDS_INIT | NPY_NEEDS_PYAPI) - -#define PyDataType_FLAGCHK(dtype, flag) \ - (((dtype)->flags & (flag)) == (flag)) - -#define PyDataType_REFCHK(dtype) \ - PyDataType_FLAGCHK(dtype, NPY_ITEM_REFCOUNT) - -typedef struct _PyArray_Descr { - PyObject_HEAD - /* - * the type object representing an - * instance of this type -- should not - * be two type_numbers with the same type - * object. - */ - PyTypeObject *typeobj; - /* kind for this type */ - char kind; - /* unique-character representing this type */ - char type; - /* - * '>' (big), '<' (little), '|' - * (not-applicable), or '=' (native). - */ - char byteorder; - /* flags describing data type */ - char flags; - /* number representing this type */ - int type_num; - /* element size (itemsize) for this type */ - int elsize; - /* alignment needed for this type */ - int alignment; - /* - * Non-NULL if this type is - * is an array (C-contiguous) - * of some other type - */ - struct _arr_descr *subarray; - /* - * The fields dictionary for this type - * For statically defined descr this - * is always Py_None - */ - PyObject *fields; - /* - * An ordered tuple of field names or NULL - * if no fields are defined - */ - PyObject *names; - /* - * a table of functions specific for each - * basic data descriptor - */ - PyArray_ArrFuncs *f; - /* Metadata about this dtype */ - PyObject *metadata; - /* - * Metadata specific to the C implementation - * of the particular dtype. This was added - * for NumPy 1.7.0. - */ - NpyAuxData *c_metadata; -} PyArray_Descr; - -typedef struct _arr_descr { - PyArray_Descr *base; - PyObject *shape; /* a tuple */ -} PyArray_ArrayDescr; - -/* - * The main array object structure. - * - * It has been recommended to use the inline functions defined below - * (PyArray_DATA and friends) to access fields here for a number of - * releases. Direct access to the members themselves is deprecated. - * To ensure that your code does not use deprecated access, - * #define NPY_NO_DEPRECATED_API NPY_1_7_VERSION - * (or NPY_1_8_VERSION or higher as required). 
- */ -/* This struct will be moved to a private header in a future release */ -typedef struct tagPyArrayObject_fields { - PyObject_HEAD - /* Pointer to the raw data buffer */ - char *data; - /* The number of dimensions, also called 'ndim' */ - int nd; - /* The size in each dimension, also called 'shape' */ - npy_intp *dimensions; - /* - * Number of bytes to jump to get to the - * next element in each dimension - */ - npy_intp *strides; - /* - * This object is decref'd upon - * deletion of array. Except in the - * case of UPDATEIFCOPY which has - * special handling. - * - * For views it points to the original - * array, collapsed so no chains of - * views occur. - * - * For creation from buffer object it - * points to an object that shold be - * decref'd on deletion - * - * For UPDATEIFCOPY flag this is an - * array to-be-updated upon deletion - * of this one - */ - PyObject *base; - /* Pointer to type structure */ - PyArray_Descr *descr; - /* Flags describing array -- see below */ - int flags; - /* For weak references */ - PyObject *weakreflist; -} PyArrayObject_fields; - -/* - * To hide the implementation details, we only expose - * the Python struct HEAD. - */ -#if !(defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) -/* - * Can't put this in npy_deprecated_api.h like the others. - * PyArrayObject field access is deprecated as of NumPy 1.7. - */ -typedef PyArrayObject_fields PyArrayObject; -#else -typedef struct tagPyArrayObject { - PyObject_HEAD -} PyArrayObject; -#endif - -#define NPY_SIZEOF_PYARRAYOBJECT (sizeof(PyArrayObject_fields)) - -/* Array Flags Object */ -typedef struct PyArrayFlagsObject { - PyObject_HEAD - PyObject *arr; - int flags; -} PyArrayFlagsObject; - -/* Mirrors buffer object to ptr */ - -typedef struct { - PyObject_HEAD - PyObject *base; - void *ptr; - npy_intp len; - int flags; -} PyArray_Chunk; - -typedef struct { - NPY_DATETIMEUNIT base; - int num; -} PyArray_DatetimeMetaData; - -typedef struct { - NpyAuxData base; - PyArray_DatetimeMetaData meta; -} PyArray_DatetimeDTypeMetaData; - -/* - * This structure contains an exploded view of a date-time value. - * NaT is represented by year == NPY_DATETIME_NAT. - */ -typedef struct { - npy_int64 year; - npy_int32 month, day, hour, min, sec, us, ps, as; -} npy_datetimestruct; - -/* This is not used internally. */ -typedef struct { - npy_int64 day; - npy_int32 sec, us, ps, as; -} npy_timedeltastruct; - -typedef int (PyArray_FinalizeFunc)(PyArrayObject *, PyObject *); - -/* - * Means c-style contiguous (last index varies the fastest). The data - * elements right after each other. - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_C_CONTIGUOUS 0x0001 - -/* - * Set if array is a contiguous Fortran array: the first index varies - * the fastest in memory (strides array is reverse of C-contiguous - * array) - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_F_CONTIGUOUS 0x0002 - -/* - * Note: all 0-d arrays are C_CONTIGUOUS and F_CONTIGUOUS. If a - * 1-d array is C_CONTIGUOUS it is also F_CONTIGUOUS - */ - -/* - * If set, the array owns the data: it will be free'd when the array - * is deleted. - * - * This flag may be tested for in PyArray_FLAGS(arr). 
- */ -#define NPY_ARRAY_OWNDATA 0x0004 - -/* - * An array never has the next four set; they're only used as parameter - * flags to the the various FromAny functions - * - * This flag may be requested in constructor functions. - */ - -/* Cause a cast to occur regardless of whether or not it is safe. */ -#define NPY_ARRAY_FORCECAST 0x0010 - -/* - * Always copy the array. Returned arrays are always CONTIGUOUS, - * ALIGNED, and WRITEABLE. - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_ENSURECOPY 0x0020 - -/* - * Make sure the returned array is a base-class ndarray - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_ENSUREARRAY 0x0040 - -/* - * Make sure that the strides are in units of the element size Needed - * for some operations with record-arrays. - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_ELEMENTSTRIDES 0x0080 - -/* - * Array data is aligned on the appropiate memory address for the type - * stored according to how the compiler would align things (e.g., an - * array of integers (4 bytes each) starts on a memory address that's - * a multiple of 4) - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_ALIGNED 0x0100 - -/* - * Array data has the native endianness - * - * This flag may be requested in constructor functions. - */ -#define NPY_ARRAY_NOTSWAPPED 0x0200 - -/* - * Array data is writeable - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_WRITEABLE 0x0400 - -/* - * If this flag is set, then base contains a pointer to an array of - * the same size that should be updated with the current contents of - * this array when this array is deallocated - * - * This flag may be requested in constructor functions. - * This flag may be tested for in PyArray_FLAGS(arr). - */ -#define NPY_ARRAY_UPDATEIFCOPY 0x1000 - -/* - * NOTE: there are also internal flags defined in multiarray/arrayobject.h, - * which start at bit 31 and work down. 
- */ - -#define NPY_ARRAY_BEHAVED (NPY_ARRAY_ALIGNED | \ - NPY_ARRAY_WRITEABLE) -#define NPY_ARRAY_BEHAVED_NS (NPY_ARRAY_ALIGNED | \ - NPY_ARRAY_WRITEABLE | \ - NPY_ARRAY_NOTSWAPPED) -#define NPY_ARRAY_CARRAY (NPY_ARRAY_C_CONTIGUOUS | \ - NPY_ARRAY_BEHAVED) -#define NPY_ARRAY_CARRAY_RO (NPY_ARRAY_C_CONTIGUOUS | \ - NPY_ARRAY_ALIGNED) -#define NPY_ARRAY_FARRAY (NPY_ARRAY_F_CONTIGUOUS | \ - NPY_ARRAY_BEHAVED) -#define NPY_ARRAY_FARRAY_RO (NPY_ARRAY_F_CONTIGUOUS | \ - NPY_ARRAY_ALIGNED) -#define NPY_ARRAY_DEFAULT (NPY_ARRAY_CARRAY) -#define NPY_ARRAY_IN_ARRAY (NPY_ARRAY_CARRAY_RO) -#define NPY_ARRAY_OUT_ARRAY (NPY_ARRAY_CARRAY) -#define NPY_ARRAY_INOUT_ARRAY (NPY_ARRAY_CARRAY | \ - NPY_ARRAY_UPDATEIFCOPY) -#define NPY_ARRAY_IN_FARRAY (NPY_ARRAY_FARRAY_RO) -#define NPY_ARRAY_OUT_FARRAY (NPY_ARRAY_FARRAY) -#define NPY_ARRAY_INOUT_FARRAY (NPY_ARRAY_FARRAY | \ - NPY_ARRAY_UPDATEIFCOPY) - -#define NPY_ARRAY_UPDATE_ALL (NPY_ARRAY_C_CONTIGUOUS | \ - NPY_ARRAY_F_CONTIGUOUS | \ - NPY_ARRAY_ALIGNED) - -/* This flag is for the array interface, not PyArrayObject */ -#define NPY_ARR_HAS_DESCR 0x0800 - - - - -/* - * Size of internal buffers used for alignment Make BUFSIZE a multiple - * of sizeof(npy_cdouble) -- usually 16 so that ufunc buffers are aligned - */ -#define NPY_MIN_BUFSIZE ((int)sizeof(npy_cdouble)) -#define NPY_MAX_BUFSIZE (((int)sizeof(npy_cdouble))*1000000) -#define NPY_BUFSIZE 8192 -/* buffer stress test size: */ -/*#define NPY_BUFSIZE 17*/ - -#define PyArray_MAX(a,b) (((a)>(b))?(a):(b)) -#define PyArray_MIN(a,b) (((a)<(b))?(a):(b)) -#define PyArray_CLT(p,q) ((((p).real==(q).real) ? ((p).imag < (q).imag) : \ - ((p).real < (q).real))) -#define PyArray_CGT(p,q) ((((p).real==(q).real) ? ((p).imag > (q).imag) : \ - ((p).real > (q).real))) -#define PyArray_CLE(p,q) ((((p).real==(q).real) ? ((p).imag <= (q).imag) : \ - ((p).real <= (q).real))) -#define PyArray_CGE(p,q) ((((p).real==(q).real) ? ((p).imag >= (q).imag) : \ - ((p).real >= (q).real))) -#define PyArray_CEQ(p,q) (((p).real==(q).real) && ((p).imag == (q).imag)) -#define PyArray_CNE(p,q) (((p).real!=(q).real) || ((p).imag != (q).imag)) - -/* - * C API: consists of Macros and functions. The MACROS are defined - * here. 
- */ - - -#define PyArray_ISCONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS) -#define PyArray_ISWRITEABLE(m) PyArray_CHKFLAGS(m, NPY_ARRAY_WRITEABLE) -#define PyArray_ISALIGNED(m) PyArray_CHKFLAGS(m, NPY_ARRAY_ALIGNED) - -#define PyArray_IS_C_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS) -#define PyArray_IS_F_CONTIGUOUS(m) PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) - -#if NPY_ALLOW_THREADS -#define NPY_BEGIN_ALLOW_THREADS Py_BEGIN_ALLOW_THREADS -#define NPY_END_ALLOW_THREADS Py_END_ALLOW_THREADS -#define NPY_BEGIN_THREADS_DEF PyThreadState *_save=NULL; -#define NPY_BEGIN_THREADS do {_save = PyEval_SaveThread();} while (0); -#define NPY_END_THREADS do {if (_save) PyEval_RestoreThread(_save);} while (0); - -#define NPY_BEGIN_THREADS_DESCR(dtype) \ - do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \ - NPY_BEGIN_THREADS;} while (0); - -#define NPY_END_THREADS_DESCR(dtype) \ - do {if (!(PyDataType_FLAGCHK(dtype, NPY_NEEDS_PYAPI))) \ - NPY_END_THREADS; } while (0); - -#define NPY_ALLOW_C_API_DEF PyGILState_STATE __save__; -#define NPY_ALLOW_C_API do {__save__ = PyGILState_Ensure();} while (0); -#define NPY_DISABLE_C_API do {PyGILState_Release(__save__);} while (0); -#else -#define NPY_BEGIN_ALLOW_THREADS -#define NPY_END_ALLOW_THREADS -#define NPY_BEGIN_THREADS_DEF -#define NPY_BEGIN_THREADS -#define NPY_END_THREADS -#define NPY_BEGIN_THREADS_DESCR(dtype) -#define NPY_END_THREADS_DESCR(dtype) -#define NPY_ALLOW_C_API_DEF -#define NPY_ALLOW_C_API -#define NPY_DISABLE_C_API -#endif - -/********************************** - * The nditer object, added in 1.6 - **********************************/ - -/* The actual structure of the iterator is an internal detail */ -typedef struct NpyIter_InternalOnly NpyIter; - -/* Iterator function pointers that may be specialized */ -typedef int (NpyIter_IterNextFunc)(NpyIter *iter); -typedef void (NpyIter_GetMultiIndexFunc)(NpyIter *iter, - npy_intp *outcoords); - -/*** Global flags that may be passed to the iterator constructors ***/ - -/* Track an index representing C order */ -#define NPY_ITER_C_INDEX 0x00000001 -/* Track an index representing Fortran order */ -#define NPY_ITER_F_INDEX 0x00000002 -/* Track a multi-index */ -#define NPY_ITER_MULTI_INDEX 0x00000004 -/* User code external to the iterator does the 1-dimensional innermost loop */ -#define NPY_ITER_EXTERNAL_LOOP 0x00000008 -/* Convert all the operands to a common data type */ -#define NPY_ITER_COMMON_DTYPE 0x00000010 -/* Operands may hold references, requiring API access during iteration */ -#define NPY_ITER_REFS_OK 0x00000020 -/* Zero-sized operands should be permitted, iteration checks IterSize for 0 */ -#define NPY_ITER_ZEROSIZE_OK 0x00000040 -/* Permits reductions (size-0 stride with dimension size > 1) */ -#define NPY_ITER_REDUCE_OK 0x00000080 -/* Enables sub-range iteration */ -#define NPY_ITER_RANGED 0x00000100 -/* Enables buffering */ -#define NPY_ITER_BUFFERED 0x00000200 -/* When buffering is enabled, grows the inner loop if possible */ -#define NPY_ITER_GROWINNER 0x00000400 -/* Delay allocation of buffers until first Reset* call */ -#define NPY_ITER_DELAY_BUFALLOC 0x00000800 -/* When NPY_KEEPORDER is specified, disable reversing negative-stride axes */ -#define NPY_ITER_DONT_NEGATE_STRIDES 0x00001000 - -/*** Per-operand flags that may be passed to the iterator constructors ***/ - -/* The operand will be read from and written to */ -#define NPY_ITER_READWRITE 0x00010000 -/* The operand will only be read from */ -#define NPY_ITER_READONLY 0x00020000 -/* The operand 
will only be written to */ -#define NPY_ITER_WRITEONLY 0x00040000 -/* The operand's data must be in native byte order */ -#define NPY_ITER_NBO 0x00080000 -/* The operand's data must be aligned */ -#define NPY_ITER_ALIGNED 0x00100000 -/* The operand's data must be contiguous (within the inner loop) */ -#define NPY_ITER_CONTIG 0x00200000 -/* The operand may be copied to satisfy requirements */ -#define NPY_ITER_COPY 0x00400000 -/* The operand may be copied with UPDATEIFCOPY to satisfy requirements */ -#define NPY_ITER_UPDATEIFCOPY 0x00800000 -/* Allocate the operand if it is NULL */ -#define NPY_ITER_ALLOCATE 0x01000000 -/* If an operand is allocated, don't use any subtype */ -#define NPY_ITER_NO_SUBTYPE 0x02000000 -/* This is a virtual array slot, operand is NULL but temporary data is there */ -#define NPY_ITER_VIRTUAL 0x04000000 -/* Require that the dimension match the iterator dimensions exactly */ -#define NPY_ITER_NO_BROADCAST 0x08000000 -/* A mask is being used on this array, affects buffer -> array copy */ -#define NPY_ITER_WRITEMASKED 0x10000000 -/* This array is the mask for all WRITEMASKED operands */ -#define NPY_ITER_ARRAYMASK 0x20000000 - -#define NPY_ITER_GLOBAL_FLAGS 0x0000ffff -#define NPY_ITER_PER_OP_FLAGS 0xffff0000 - - -/***************************** - * Basic iterator object - *****************************/ - -/* FWD declaration */ -typedef struct PyArrayIterObject_tag PyArrayIterObject; - -/* - * type of the function which translates a set of coordinates to a - * pointer to the data - */ -typedef char* (*npy_iter_get_dataptr_t)(PyArrayIterObject* iter, npy_intp*); - -struct PyArrayIterObject_tag { - PyObject_HEAD - int nd_m1; /* number of dimensions - 1 */ - npy_intp index, size; - npy_intp coordinates[NPY_MAXDIMS];/* N-dimensional loop */ - npy_intp dims_m1[NPY_MAXDIMS]; /* ao->dimensions - 1 */ - npy_intp strides[NPY_MAXDIMS]; /* ao->strides or fake */ - npy_intp backstrides[NPY_MAXDIMS];/* how far to jump back */ - npy_intp factors[NPY_MAXDIMS]; /* shape factors */ - PyArrayObject *ao; - char *dataptr; /* pointer to current item*/ - npy_bool contiguous; - - npy_intp bounds[NPY_MAXDIMS][2]; - npy_intp limits[NPY_MAXDIMS][2]; - npy_intp limits_sizes[NPY_MAXDIMS]; - npy_iter_get_dataptr_t translate; -} ; - - -/* Iterator API */ -#define PyArrayIter_Check(op) PyObject_TypeCheck(op, &PyArrayIter_Type) - -#define _PyAIT(it) ((PyArrayIterObject *)(it)) -#define PyArray_ITER_RESET(it) do { \ - _PyAIT(it)->index = 0; \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao); \ - memset(_PyAIT(it)->coordinates, 0, \ - (_PyAIT(it)->nd_m1+1)*sizeof(npy_intp)); \ -} while (0) - -#define _PyArray_ITER_NEXT1(it) do { \ - (it)->dataptr += _PyAIT(it)->strides[0]; \ - (it)->coordinates[0]++; \ -} while (0) - -#define _PyArray_ITER_NEXT2(it) do { \ - if ((it)->coordinates[1] < (it)->dims_m1[1]) { \ - (it)->coordinates[1]++; \ - (it)->dataptr += (it)->strides[1]; \ - } \ - else { \ - (it)->coordinates[1] = 0; \ - (it)->coordinates[0]++; \ - (it)->dataptr += (it)->strides[0] - \ - (it)->backstrides[1]; \ - } \ -} while (0) - -#define _PyArray_ITER_NEXT3(it) do { \ - if ((it)->coordinates[2] < (it)->dims_m1[2]) { \ - (it)->coordinates[2]++; \ - (it)->dataptr += (it)->strides[2]; \ - } \ - else { \ - (it)->coordinates[2] = 0; \ - (it)->dataptr -= (it)->backstrides[2]; \ - if ((it)->coordinates[1] < (it)->dims_m1[1]) { \ - (it)->coordinates[1]++; \ - (it)->dataptr += (it)->strides[1]; \ - } \ - else { \ - (it)->coordinates[1] = 0; \ - (it)->coordinates[0]++; \ - (it)->dataptr += 
(it)->strides[0] \ - (it)->backstrides[1]; \ - } \ - } \ -} while (0) - -#define PyArray_ITER_NEXT(it) do { \ - _PyAIT(it)->index++; \ - if (_PyAIT(it)->nd_m1 == 0) { \ - _PyArray_ITER_NEXT1(_PyAIT(it)); \ - } \ - else if (_PyAIT(it)->contiguous) \ - _PyAIT(it)->dataptr += PyArray_DESCR(_PyAIT(it)->ao)->elsize; \ - else if (_PyAIT(it)->nd_m1 == 1) { \ - _PyArray_ITER_NEXT2(_PyAIT(it)); \ - } \ - else { \ - int __npy_i; \ - for (__npy_i=_PyAIT(it)->nd_m1; __npy_i >= 0; __npy_i--) { \ - if (_PyAIT(it)->coordinates[__npy_i] < \ - _PyAIT(it)->dims_m1[__npy_i]) { \ - _PyAIT(it)->coordinates[__npy_i]++; \ - _PyAIT(it)->dataptr += \ - _PyAIT(it)->strides[__npy_i]; \ - break; \ - } \ - else { \ - _PyAIT(it)->coordinates[__npy_i] = 0; \ - _PyAIT(it)->dataptr -= \ - _PyAIT(it)->backstrides[__npy_i]; \ - } \ - } \ - } \ -} while (0) - -#define PyArray_ITER_GOTO(it, destination) do { \ - int __npy_i; \ - _PyAIT(it)->index = 0; \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao); \ - for (__npy_i = _PyAIT(it)->nd_m1; __npy_i>=0; __npy_i--) { \ - if (destination[__npy_i] < 0) { \ - destination[__npy_i] += \ - _PyAIT(it)->dims_m1[__npy_i]+1; \ - } \ - _PyAIT(it)->dataptr += destination[__npy_i] * \ - _PyAIT(it)->strides[__npy_i]; \ - _PyAIT(it)->coordinates[__npy_i] = \ - destination[__npy_i]; \ - _PyAIT(it)->index += destination[__npy_i] * \ - ( __npy_i==_PyAIT(it)->nd_m1 ? 1 : \ - _PyAIT(it)->dims_m1[__npy_i+1]+1) ; \ - } \ -} while (0) - -#define PyArray_ITER_GOTO1D(it, ind) do { \ - int __npy_i; \ - npy_intp __npy_ind = (npy_intp) (ind); \ - if (__npy_ind < 0) __npy_ind += _PyAIT(it)->size; \ - _PyAIT(it)->index = __npy_ind; \ - if (_PyAIT(it)->nd_m1 == 0) { \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao) + \ - __npy_ind * _PyAIT(it)->strides[0]; \ - } \ - else if (_PyAIT(it)->contiguous) \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao) + \ - __npy_ind * PyArray_DESCR(_PyAIT(it)->ao)->elsize; \ - else { \ - _PyAIT(it)->dataptr = PyArray_BYTES(_PyAIT(it)->ao); \ - for (__npy_i = 0; __npy_i<=_PyAIT(it)->nd_m1; \ - __npy_i++) { \ - _PyAIT(it)->dataptr += \ - (__npy_ind / _PyAIT(it)->factors[__npy_i]) \ - * _PyAIT(it)->strides[__npy_i]; \ - __npy_ind %= _PyAIT(it)->factors[__npy_i]; \ - } \ - } \ -} while (0) - -#define PyArray_ITER_DATA(it) ((void *)(_PyAIT(it)->dataptr)) - -#define PyArray_ITER_NOTDONE(it) (_PyAIT(it)->index < _PyAIT(it)->size) - - -/* - * Any object passed to PyArray_Broadcast must be binary compatible - * with this structure. 
- */ - -typedef struct { - PyObject_HEAD - int numiter; /* number of iters */ - npy_intp size; /* broadcasted size */ - npy_intp index; /* current index */ - int nd; /* number of dims */ - npy_intp dimensions[NPY_MAXDIMS]; /* dimensions */ - PyArrayIterObject *iters[NPY_MAXARGS]; /* iterators */ -} PyArrayMultiIterObject; - -#define _PyMIT(m) ((PyArrayMultiIterObject *)(m)) -#define PyArray_MultiIter_RESET(multi) do { \ - int __npy_mi; \ - _PyMIT(multi)->index = 0; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_RESET(_PyMIT(multi)->iters[__npy_mi]); \ - } \ -} while (0) - -#define PyArray_MultiIter_NEXT(multi) do { \ - int __npy_mi; \ - _PyMIT(multi)->index++; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_NEXT(_PyMIT(multi)->iters[__npy_mi]); \ - } \ -} while (0) - -#define PyArray_MultiIter_GOTO(multi, dest) do { \ - int __npy_mi; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_GOTO(_PyMIT(multi)->iters[__npy_mi], dest); \ - } \ - _PyMIT(multi)->index = _PyMIT(multi)->iters[0]->index; \ -} while (0) - -#define PyArray_MultiIter_GOTO1D(multi, ind) do { \ - int __npy_mi; \ - for (__npy_mi=0; __npy_mi < _PyMIT(multi)->numiter; __npy_mi++) { \ - PyArray_ITER_GOTO1D(_PyMIT(multi)->iters[__npy_mi], ind); \ - } \ - _PyMIT(multi)->index = _PyMIT(multi)->iters[0]->index; \ -} while (0) - -#define PyArray_MultiIter_DATA(multi, i) \ - ((void *)(_PyMIT(multi)->iters[i]->dataptr)) - -#define PyArray_MultiIter_NEXTi(multi, i) \ - PyArray_ITER_NEXT(_PyMIT(multi)->iters[i]) - -#define PyArray_MultiIter_NOTDONE(multi) \ - (_PyMIT(multi)->index < _PyMIT(multi)->size) - -/* Store the information needed for fancy-indexing over an array */ - -typedef struct { - PyObject_HEAD - /* - * Multi-iterator portion --- needs to be present in this - * order to work with PyArray_Broadcast - */ - - int numiter; /* number of index-array - iterators */ - npy_intp size; /* size of broadcasted - result */ - npy_intp index; /* current index */ - int nd; /* number of dims */ - npy_intp dimensions[NPY_MAXDIMS]; /* dimensions */ - PyArrayIterObject *iters[NPY_MAXDIMS]; /* index object - iterators */ - PyArrayIterObject *ait; /* flat Iterator for - underlying array */ - - /* flat iterator for subspace (when numiter < nd) */ - PyArrayIterObject *subspace; - - /* - * if subspace iteration, then this is the array of axes in - * the underlying array represented by the index objects - */ - int iteraxes[NPY_MAXDIMS]; - /* - * if subspace iteration, the these are the coordinates to the - * start of the subspace. 
- */ - npy_intp bscoord[NPY_MAXDIMS]; - - PyObject *indexobj; /* creating obj */ - int consec; - char *dataptr; - -} PyArrayMapIterObject; - -enum { - NPY_NEIGHBORHOOD_ITER_ZERO_PADDING, - NPY_NEIGHBORHOOD_ITER_ONE_PADDING, - NPY_NEIGHBORHOOD_ITER_CONSTANT_PADDING, - NPY_NEIGHBORHOOD_ITER_CIRCULAR_PADDING, - NPY_NEIGHBORHOOD_ITER_MIRROR_PADDING -}; - -typedef struct { - PyObject_HEAD - - /* - * PyArrayIterObject part: keep this in this exact order - */ - int nd_m1; /* number of dimensions - 1 */ - npy_intp index, size; - npy_intp coordinates[NPY_MAXDIMS];/* N-dimensional loop */ - npy_intp dims_m1[NPY_MAXDIMS]; /* ao->dimensions - 1 */ - npy_intp strides[NPY_MAXDIMS]; /* ao->strides or fake */ - npy_intp backstrides[NPY_MAXDIMS];/* how far to jump back */ - npy_intp factors[NPY_MAXDIMS]; /* shape factors */ - PyArrayObject *ao; - char *dataptr; /* pointer to current item*/ - npy_bool contiguous; - - npy_intp bounds[NPY_MAXDIMS][2]; - npy_intp limits[NPY_MAXDIMS][2]; - npy_intp limits_sizes[NPY_MAXDIMS]; - npy_iter_get_dataptr_t translate; - - /* - * New members - */ - npy_intp nd; - - /* Dimensions is the dimension of the array */ - npy_intp dimensions[NPY_MAXDIMS]; - - /* - * Neighborhood points coordinates are computed relatively to the - * point pointed by _internal_iter - */ - PyArrayIterObject* _internal_iter; - /* - * To keep a reference to the representation of the constant value - * for constant padding - */ - char* constant; - - int mode; -} PyArrayNeighborhoodIterObject; - -/* - * Neighborhood iterator API - */ - -/* General: those work for any mode */ -static NPY_INLINE int -PyArrayNeighborhoodIter_Reset(PyArrayNeighborhoodIterObject* iter); -static NPY_INLINE int -PyArrayNeighborhoodIter_Next(PyArrayNeighborhoodIterObject* iter); -#if 0 -static NPY_INLINE int -PyArrayNeighborhoodIter_Next2D(PyArrayNeighborhoodIterObject* iter); -#endif - -/* - * Include inline implementations - functions defined there are not - * considered public API - */ -#define _NPY_INCLUDE_NEIGHBORHOOD_IMP -#include "_neighborhood_iterator_imp.h" -#undef _NPY_INCLUDE_NEIGHBORHOOD_IMP - -/* The default array type */ -#define NPY_DEFAULT_TYPE NPY_DOUBLE - -/* - * All sorts of useful ways to look into a PyArrayObject. It is recommended - * to use PyArrayObject * objects instead of always casting from PyObject *, - * for improved type checking. - * - * In many cases here the macro versions of the accessors are deprecated, - * but can't be immediately changed to inline functions because the - * preexisting macros accept PyObject * and do automatic casts. Inline - * functions accepting PyArrayObject * provides for some compile-time - * checking of correctness when working with these objects in C. - */ - -#define PyArray_ISONESEGMENT(m) (PyArray_NDIM(m) == 0 || \ - PyArray_CHKFLAGS(m, NPY_ARRAY_C_CONTIGUOUS) || \ - PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS)) - -#define PyArray_ISFORTRAN(m) (PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) && \ - (PyArray_NDIM(m) > 1)) - -#define PyArray_FORTRAN_IF(m) ((PyArray_CHKFLAGS(m, NPY_ARRAY_F_CONTIGUOUS) ? \ - NPY_ARRAY_F_CONTIGUOUS : 0)) - -#if (defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) -/* - * Changing access macros into functions, to allow for future hiding - * of the internal memory layout. This later hiding will allow the 2.x series - * to change the internal representation of arrays without affecting - * ABI compatibility. 
- */ - -static NPY_INLINE int -PyArray_NDIM(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->nd; -} - -static NPY_INLINE void * -PyArray_DATA(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->data; -} - -static NPY_INLINE char * -PyArray_BYTES(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->data; -} - -static NPY_INLINE npy_intp * -PyArray_DIMS(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->dimensions; -} - -static NPY_INLINE npy_intp * -PyArray_STRIDES(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->strides; -} - -static NPY_INLINE npy_intp -PyArray_DIM(const PyArrayObject *arr, int idim) -{ - return ((PyArrayObject_fields *)arr)->dimensions[idim]; -} - -static NPY_INLINE npy_intp -PyArray_STRIDE(const PyArrayObject *arr, int istride) -{ - return ((PyArrayObject_fields *)arr)->strides[istride]; -} - -static NPY_INLINE PyObject * -PyArray_BASE(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->base; -} - -static NPY_INLINE PyArray_Descr * -PyArray_DESCR(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr; -} - -static NPY_INLINE int -PyArray_FLAGS(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->flags; -} - -static NPY_INLINE npy_intp -PyArray_ITEMSIZE(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr->elsize; -} - -static NPY_INLINE int -PyArray_TYPE(const PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr->type_num; -} - -static NPY_INLINE int -PyArray_CHKFLAGS(const PyArrayObject *arr, int flags) -{ - return (PyArray_FLAGS(arr) & flags) == flags; -} - -static NPY_INLINE PyObject * -PyArray_GETITEM(const PyArrayObject *arr, const char *itemptr) -{ - return ((PyArrayObject_fields *)arr)->descr->f->getitem( - (void *)itemptr, (PyArrayObject *)arr); -} - -static NPY_INLINE int -PyArray_SETITEM(PyArrayObject *arr, char *itemptr, PyObject *v) -{ - return ((PyArrayObject_fields *)arr)->descr->f->setitem( - v, itemptr, arr); -} - -#else - -/* These macros are deprecated as of NumPy 1.7. 
*/ -#define PyArray_NDIM(obj) (((PyArrayObject_fields *)(obj))->nd) -#define PyArray_BYTES(obj) (((PyArrayObject_fields *)(obj))->data) -#define PyArray_DATA(obj) ((void *)((PyArrayObject_fields *)(obj))->data) -#define PyArray_DIMS(obj) (((PyArrayObject_fields *)(obj))->dimensions) -#define PyArray_STRIDES(obj) (((PyArrayObject_fields *)(obj))->strides) -#define PyArray_DIM(obj,n) (PyArray_DIMS(obj)[n]) -#define PyArray_STRIDE(obj,n) (PyArray_STRIDES(obj)[n]) -#define PyArray_BASE(obj) (((PyArrayObject_fields *)(obj))->base) -#define PyArray_DESCR(obj) (((PyArrayObject_fields *)(obj))->descr) -#define PyArray_FLAGS(obj) (((PyArrayObject_fields *)(obj))->flags) -#define PyArray_CHKFLAGS(m, FLAGS) \ - ((((PyArrayObject_fields *)(m))->flags & (FLAGS)) == (FLAGS)) -#define PyArray_ITEMSIZE(obj) \ - (((PyArrayObject_fields *)(obj))->descr->elsize) -#define PyArray_TYPE(obj) \ - (((PyArrayObject_fields *)(obj))->descr->type_num) -#define PyArray_GETITEM(obj,itemptr) \ - PyArray_DESCR(obj)->f->getitem((char *)(itemptr), \ - (PyArrayObject *)(obj)) - -#define PyArray_SETITEM(obj,itemptr,v) \ - PyArray_DESCR(obj)->f->setitem((PyObject *)(v), \ - (char *)(itemptr), \ - (PyArrayObject *)(obj)) -#endif - -static NPY_INLINE PyArray_Descr * -PyArray_DTYPE(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->descr; -} - -static NPY_INLINE npy_intp * -PyArray_SHAPE(PyArrayObject *arr) -{ - return ((PyArrayObject_fields *)arr)->dimensions; -} - -/* - * Enables the specified array flags. Does no checking, - * assumes you know what you're doing. - */ -static NPY_INLINE void -PyArray_ENABLEFLAGS(PyArrayObject *arr, int flags) -{ - ((PyArrayObject_fields *)arr)->flags |= flags; -} - -/* - * Clears the specified array flags. Does no checking, - * assumes you know what you're doing. 
- */ -static NPY_INLINE void -PyArray_CLEARFLAGS(PyArrayObject *arr, int flags) -{ - ((PyArrayObject_fields *)arr)->flags &= ~flags; -} - -#define PyTypeNum_ISBOOL(type) ((type) == NPY_BOOL) - -#define PyTypeNum_ISUNSIGNED(type) (((type) == NPY_UBYTE) || \ - ((type) == NPY_USHORT) || \ - ((type) == NPY_UINT) || \ - ((type) == NPY_ULONG) || \ - ((type) == NPY_ULONGLONG)) - -#define PyTypeNum_ISSIGNED(type) (((type) == NPY_BYTE) || \ - ((type) == NPY_SHORT) || \ - ((type) == NPY_INT) || \ - ((type) == NPY_LONG) || \ - ((type) == NPY_LONGLONG)) - -#define PyTypeNum_ISINTEGER(type) (((type) >= NPY_BYTE) && \ - ((type) <= NPY_ULONGLONG)) - -#define PyTypeNum_ISFLOAT(type) ((((type) >= NPY_FLOAT) && \ - ((type) <= NPY_LONGDOUBLE)) || \ - ((type) == NPY_HALF)) - -#define PyTypeNum_ISNUMBER(type) (((type) <= NPY_CLONGDOUBLE) || \ - ((type) == NPY_HALF)) - -#define PyTypeNum_ISSTRING(type) (((type) == NPY_STRING) || \ - ((type) == NPY_UNICODE)) - -#define PyTypeNum_ISCOMPLEX(type) (((type) >= NPY_CFLOAT) && \ - ((type) <= NPY_CLONGDOUBLE)) - -#define PyTypeNum_ISPYTHON(type) (((type) == NPY_LONG) || \ - ((type) == NPY_DOUBLE) || \ - ((type) == NPY_CDOUBLE) || \ - ((type) == NPY_BOOL) || \ - ((type) == NPY_OBJECT )) - -#define PyTypeNum_ISFLEXIBLE(type) (((type) >=NPY_STRING) && \ - ((type) <=NPY_VOID)) - -#define PyTypeNum_ISDATETIME(type) (((type) >=NPY_DATETIME) && \ - ((type) <=NPY_TIMEDELTA)) - -#define PyTypeNum_ISUSERDEF(type) (((type) >= NPY_USERDEF) && \ - ((type) < NPY_USERDEF+ \ - NPY_NUMUSERTYPES)) - -#define PyTypeNum_ISEXTENDED(type) (PyTypeNum_ISFLEXIBLE(type) || \ - PyTypeNum_ISUSERDEF(type)) - -#define PyTypeNum_ISOBJECT(type) ((type) == NPY_OBJECT) - - -#define PyDataType_ISBOOL(obj) PyTypeNum_ISBOOL(_PyADt(obj)) -#define PyDataType_ISUNSIGNED(obj) PyTypeNum_ISUNSIGNED(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISSIGNED(obj) PyTypeNum_ISSIGNED(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISINTEGER(obj) PyTypeNum_ISINTEGER(((PyArray_Descr*)(obj))->type_num ) -#define PyDataType_ISFLOAT(obj) PyTypeNum_ISFLOAT(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISNUMBER(obj) PyTypeNum_ISNUMBER(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISSTRING(obj) PyTypeNum_ISSTRING(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISCOMPLEX(obj) PyTypeNum_ISCOMPLEX(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISPYTHON(obj) PyTypeNum_ISPYTHON(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISFLEXIBLE(obj) PyTypeNum_ISFLEXIBLE(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISDATETIME(obj) PyTypeNum_ISDATETIME(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISUSERDEF(obj) PyTypeNum_ISUSERDEF(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISEXTENDED(obj) PyTypeNum_ISEXTENDED(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_ISOBJECT(obj) PyTypeNum_ISOBJECT(((PyArray_Descr*)(obj))->type_num) -#define PyDataType_HASFIELDS(obj) (((PyArray_Descr *)(obj))->names != NULL) -#define PyDataType_HASSUBARRAY(dtype) ((dtype)->subarray != NULL) - -#define PyArray_ISBOOL(obj) PyTypeNum_ISBOOL(PyArray_TYPE(obj)) -#define PyArray_ISUNSIGNED(obj) PyTypeNum_ISUNSIGNED(PyArray_TYPE(obj)) -#define PyArray_ISSIGNED(obj) PyTypeNum_ISSIGNED(PyArray_TYPE(obj)) -#define PyArray_ISINTEGER(obj) PyTypeNum_ISINTEGER(PyArray_TYPE(obj)) -#define PyArray_ISFLOAT(obj) PyTypeNum_ISFLOAT(PyArray_TYPE(obj)) -#define PyArray_ISNUMBER(obj) PyTypeNum_ISNUMBER(PyArray_TYPE(obj)) -#define PyArray_ISSTRING(obj) PyTypeNum_ISSTRING(PyArray_TYPE(obj)) -#define 
PyArray_ISCOMPLEX(obj) PyTypeNum_ISCOMPLEX(PyArray_TYPE(obj)) -#define PyArray_ISPYTHON(obj) PyTypeNum_ISPYTHON(PyArray_TYPE(obj)) -#define PyArray_ISFLEXIBLE(obj) PyTypeNum_ISFLEXIBLE(PyArray_TYPE(obj)) -#define PyArray_ISDATETIME(obj) PyTypeNum_ISDATETIME(PyArray_TYPE(obj)) -#define PyArray_ISUSERDEF(obj) PyTypeNum_ISUSERDEF(PyArray_TYPE(obj)) -#define PyArray_ISEXTENDED(obj) PyTypeNum_ISEXTENDED(PyArray_TYPE(obj)) -#define PyArray_ISOBJECT(obj) PyTypeNum_ISOBJECT(PyArray_TYPE(obj)) -#define PyArray_HASFIELDS(obj) PyDataType_HASFIELDS(PyArray_DESCR(obj)) - - /* - * FIXME: This should check for a flag on the data-type that - * states whether or not it is variable length. Because the - * ISFLEXIBLE check is hard-coded to the built-in data-types. - */ -#define PyArray_ISVARIABLE(obj) PyTypeNum_ISFLEXIBLE(PyArray_TYPE(obj)) - -#define PyArray_SAFEALIGNEDCOPY(obj) (PyArray_ISALIGNED(obj) && !PyArray_ISVARIABLE(obj)) - - -#define NPY_LITTLE '<' -#define NPY_BIG '>' -#define NPY_NATIVE '=' -#define NPY_SWAP 's' -#define NPY_IGNORE '|' - -#if NPY_BYTE_ORDER == NPY_BIG_ENDIAN -#define NPY_NATBYTE NPY_BIG -#define NPY_OPPBYTE NPY_LITTLE -#else -#define NPY_NATBYTE NPY_LITTLE -#define NPY_OPPBYTE NPY_BIG -#endif - -#define PyArray_ISNBO(arg) ((arg) != NPY_OPPBYTE) -#define PyArray_IsNativeByteOrder PyArray_ISNBO -#define PyArray_ISNOTSWAPPED(m) PyArray_ISNBO(PyArray_DESCR(m)->byteorder) -#define PyArray_ISBYTESWAPPED(m) (!PyArray_ISNOTSWAPPED(m)) - -#define PyArray_FLAGSWAP(m, flags) (PyArray_CHKFLAGS(m, flags) && \ - PyArray_ISNOTSWAPPED(m)) - -#define PyArray_ISCARRAY(m) PyArray_FLAGSWAP(m, NPY_ARRAY_CARRAY) -#define PyArray_ISCARRAY_RO(m) PyArray_FLAGSWAP(m, NPY_ARRAY_CARRAY_RO) -#define PyArray_ISFARRAY(m) PyArray_FLAGSWAP(m, NPY_ARRAY_FARRAY) -#define PyArray_ISFARRAY_RO(m) PyArray_FLAGSWAP(m, NPY_ARRAY_FARRAY_RO) -#define PyArray_ISBEHAVED(m) PyArray_FLAGSWAP(m, NPY_ARRAY_BEHAVED) -#define PyArray_ISBEHAVED_RO(m) PyArray_FLAGSWAP(m, NPY_ARRAY_ALIGNED) - - -#define PyDataType_ISNOTSWAPPED(d) PyArray_ISNBO(((PyArray_Descr *)(d))->byteorder) -#define PyDataType_ISBYTESWAPPED(d) (!PyDataType_ISNOTSWAPPED(d)) - -/************************************************************ - * A struct used by PyArray_CreateSortedStridePerm, new in 1.7. - ************************************************************/ - -typedef struct { - npy_intp perm, stride; -} npy_stride_sort_item; - -/************************************************************ - * This is the form of the struct that's returned pointed by the - * PyCObject attribute of an array __array_struct__. See - * http://docs.scipy.org/doc/numpy/reference/arrays.interface.html for the full - * documentation. - ************************************************************/ -typedef struct { - int two; /* - * contains the integer 2 as a sanity - * check - */ - - int nd; /* number of dimensions */ - - char typekind; /* - * kind in array --- character code of - * typestr - */ - - int itemsize; /* size of each element */ - - int flags; /* - * how should be data interpreted. Valid - * flags are CONTIGUOUS (1), F_CONTIGUOUS (2), - * ALIGNED (0x100), NOTSWAPPED (0x200), and - * WRITEABLE (0x400). 
ARR_HAS_DESCR (0x800) - * states that arrdescr field is present in - * structure - */ - - npy_intp *shape; /* - * A length-nd array of shape - * information - */ - - npy_intp *strides; /* A length-nd array of stride information */ - - void *data; /* A pointer to the first element of the array */ - - PyObject *descr; /* - * A list of fields or NULL (ignored if flags - * does not have ARR_HAS_DESCR flag set) - */ -} PyArrayInterface; - -/* - * This is a function for hooking into the PyDataMem_NEW/FREE/RENEW functions. - * See the documentation for PyDataMem_SetEventHook. - */ -typedef void (PyDataMem_EventHookFunc)(void *inp, void *outp, size_t size, - void *user_data); - -#if !(defined(NPY_NO_DEPRECATED_API) && (NPY_API_VERSION <= NPY_NO_DEPRECATED_API)) -#include "npy_deprecated_api.h" -#endif - -#endif /* NPY_ARRAYTYPES_H */ diff --git a/include/numpy/noprefix.h b/include/numpy/noprefix.h deleted file mode 100644 index b3e5748..0000000 --- a/include/numpy/noprefix.h +++ /dev/null @@ -1,209 +0,0 @@ -#ifndef NPY_NOPREFIX_H -#define NPY_NOPREFIX_H - -/* - * You can directly include noprefix.h as a backward - * compatibility measure - */ -#ifndef NPY_NO_PREFIX -#include "ndarrayobject.h" -#include "npy_interrupt.h" -#endif - -#define SIGSETJMP NPY_SIGSETJMP -#define SIGLONGJMP NPY_SIGLONGJMP -#define SIGJMP_BUF NPY_SIGJMP_BUF - -#define MAX_DIMS NPY_MAXDIMS - -#define longlong npy_longlong -#define ulonglong npy_ulonglong -#define Bool npy_bool -#define longdouble npy_longdouble -#define byte npy_byte - -#ifndef _BSD_SOURCE -#define ushort npy_ushort -#define uint npy_uint -#define ulong npy_ulong -#endif - -#define ubyte npy_ubyte -#define ushort npy_ushort -#define uint npy_uint -#define ulong npy_ulong -#define cfloat npy_cfloat -#define cdouble npy_cdouble -#define clongdouble npy_clongdouble -#define Int8 npy_int8 -#define UInt8 npy_uint8 -#define Int16 npy_int16 -#define UInt16 npy_uint16 -#define Int32 npy_int32 -#define UInt32 npy_uint32 -#define Int64 npy_int64 -#define UInt64 npy_uint64 -#define Int128 npy_int128 -#define UInt128 npy_uint128 -#define Int256 npy_int256 -#define UInt256 npy_uint256 -#define Float16 npy_float16 -#define Complex32 npy_complex32 -#define Float32 npy_float32 -#define Complex64 npy_complex64 -#define Float64 npy_float64 -#define Complex128 npy_complex128 -#define Float80 npy_float80 -#define Complex160 npy_complex160 -#define Float96 npy_float96 -#define Complex192 npy_complex192 -#define Float128 npy_float128 -#define Complex256 npy_complex256 -#define intp npy_intp -#define uintp npy_uintp -#define datetime npy_datetime -#define timedelta npy_timedelta - -#define SIZEOF_INTP NPY_SIZEOF_INTP -#define SIZEOF_UINTP NPY_SIZEOF_UINTP -#define SIZEOF_DATETIME NPY_SIZEOF_DATETIME -#define SIZEOF_TIMEDELTA NPY_SIZEOF_TIMEDELTA - -#define LONGLONG_FMT NPY_LONGLONG_FMT -#define ULONGLONG_FMT NPY_ULONGLONG_FMT -#define LONGLONG_SUFFIX NPY_LONGLONG_SUFFIX -#define ULONGLONG_SUFFIX NPY_ULONGLONG_SUFFIX - -#define MAX_INT8 127 -#define MIN_INT8 -128 -#define MAX_UINT8 255 -#define MAX_INT16 32767 -#define MIN_INT16 -32768 -#define MAX_UINT16 65535 -#define MAX_INT32 2147483647 -#define MIN_INT32 (-MAX_INT32 - 1) -#define MAX_UINT32 4294967295U -#define MAX_INT64 LONGLONG_SUFFIX(9223372036854775807) -#define MIN_INT64 (-MAX_INT64 - LONGLONG_SUFFIX(1)) -#define MAX_UINT64 ULONGLONG_SUFFIX(18446744073709551615) -#define MAX_INT128 LONGLONG_SUFFIX(85070591730234615865843651857942052864) -#define MIN_INT128 (-MAX_INT128 - LONGLONG_SUFFIX(1)) -#define MAX_UINT128 
ULONGLONG_SUFFIX(170141183460469231731687303715884105728) -#define MAX_INT256 LONGLONG_SUFFIX(57896044618658097711785492504343953926634992332820282019728792003956564819967) -#define MIN_INT256 (-MAX_INT256 - LONGLONG_SUFFIX(1)) -#define MAX_UINT256 ULONGLONG_SUFFIX(115792089237316195423570985008687907853269984665640564039457584007913129639935) - -#define MAX_BYTE NPY_MAX_BYTE -#define MIN_BYTE NPY_MIN_BYTE -#define MAX_UBYTE NPY_MAX_UBYTE -#define MAX_SHORT NPY_MAX_SHORT -#define MIN_SHORT NPY_MIN_SHORT -#define MAX_USHORT NPY_MAX_USHORT -#define MAX_INT NPY_MAX_INT -#define MIN_INT NPY_MIN_INT -#define MAX_UINT NPY_MAX_UINT -#define MAX_LONG NPY_MAX_LONG -#define MIN_LONG NPY_MIN_LONG -#define MAX_ULONG NPY_MAX_ULONG -#define MAX_LONGLONG NPY_MAX_LONGLONG -#define MIN_LONGLONG NPY_MIN_LONGLONG -#define MAX_ULONGLONG NPY_MAX_ULONGLONG -#define MIN_DATETIME NPY_MIN_DATETIME -#define MAX_DATETIME NPY_MAX_DATETIME -#define MIN_TIMEDELTA NPY_MIN_TIMEDELTA -#define MAX_TIMEDELTA NPY_MAX_TIMEDELTA - -#define SIZEOF_LONGDOUBLE NPY_SIZEOF_LONGDOUBLE -#define SIZEOF_LONGLONG NPY_SIZEOF_LONGLONG -#define SIZEOF_HALF NPY_SIZEOF_HALF -#define BITSOF_BOOL NPY_BITSOF_BOOL -#define BITSOF_CHAR NPY_BITSOF_CHAR -#define BITSOF_SHORT NPY_BITSOF_SHORT -#define BITSOF_INT NPY_BITSOF_INT -#define BITSOF_LONG NPY_BITSOF_LONG -#define BITSOF_LONGLONG NPY_BITSOF_LONGLONG -#define BITSOF_HALF NPY_BITSOF_HALF -#define BITSOF_FLOAT NPY_BITSOF_FLOAT -#define BITSOF_DOUBLE NPY_BITSOF_DOUBLE -#define BITSOF_LONGDOUBLE NPY_BITSOF_LONGDOUBLE -#define BITSOF_DATETIME NPY_BITSOF_DATETIME -#define BITSOF_TIMEDELTA NPY_BITSOF_TIMEDELTA - -#define _pya_malloc PyArray_malloc -#define _pya_free PyArray_free -#define _pya_realloc PyArray_realloc - -#define BEGIN_THREADS_DEF NPY_BEGIN_THREADS_DEF -#define BEGIN_THREADS NPY_BEGIN_THREADS -#define END_THREADS NPY_END_THREADS -#define ALLOW_C_API_DEF NPY_ALLOW_C_API_DEF -#define ALLOW_C_API NPY_ALLOW_C_API -#define DISABLE_C_API NPY_DISABLE_C_API - -#define PY_FAIL NPY_FAIL -#define PY_SUCCEED NPY_SUCCEED - -#ifndef TRUE -#define TRUE NPY_TRUE -#endif - -#ifndef FALSE -#define FALSE NPY_FALSE -#endif - -#define LONGDOUBLE_FMT NPY_LONGDOUBLE_FMT - -#define CONTIGUOUS NPY_CONTIGUOUS -#define C_CONTIGUOUS NPY_C_CONTIGUOUS -#define FORTRAN NPY_FORTRAN -#define F_CONTIGUOUS NPY_F_CONTIGUOUS -#define OWNDATA NPY_OWNDATA -#define FORCECAST NPY_FORCECAST -#define ENSURECOPY NPY_ENSURECOPY -#define ENSUREARRAY NPY_ENSUREARRAY -#define ELEMENTSTRIDES NPY_ELEMENTSTRIDES -#define ALIGNED NPY_ALIGNED -#define NOTSWAPPED NPY_NOTSWAPPED -#define WRITEABLE NPY_WRITEABLE -#define UPDATEIFCOPY NPY_UPDATEIFCOPY -#define ARR_HAS_DESCR NPY_ARR_HAS_DESCR -#define BEHAVED NPY_BEHAVED -#define BEHAVED_NS NPY_BEHAVED_NS -#define CARRAY NPY_CARRAY -#define CARRAY_RO NPY_CARRAY_RO -#define FARRAY NPY_FARRAY -#define FARRAY_RO NPY_FARRAY_RO -#define DEFAULT NPY_DEFAULT -#define IN_ARRAY NPY_IN_ARRAY -#define OUT_ARRAY NPY_OUT_ARRAY -#define INOUT_ARRAY NPY_INOUT_ARRAY -#define IN_FARRAY NPY_IN_FARRAY -#define OUT_FARRAY NPY_OUT_FARRAY -#define INOUT_FARRAY NPY_INOUT_FARRAY -#define UPDATE_ALL NPY_UPDATE_ALL - -#define OWN_DATA NPY_OWNDATA -#define BEHAVED_FLAGS NPY_BEHAVED -#define BEHAVED_FLAGS_NS NPY_BEHAVED_NS -#define CARRAY_FLAGS_RO NPY_CARRAY_RO -#define CARRAY_FLAGS NPY_CARRAY -#define FARRAY_FLAGS NPY_FARRAY -#define FARRAY_FLAGS_RO NPY_FARRAY_RO -#define DEFAULT_FLAGS NPY_DEFAULT -#define UPDATE_ALL_FLAGS NPY_UPDATE_ALL_FLAGS - -#ifndef MIN -#define MIN PyArray_MIN -#endif -#ifndef MAX -#define MAX 
PyArray_MAX -#endif -#define MAX_INTP NPY_MAX_INTP -#define MIN_INTP NPY_MIN_INTP -#define MAX_UINTP NPY_MAX_UINTP -#define INTP_FMT NPY_INTP_FMT - -#define REFCOUNT PyArray_REFCOUNT -#define MAX_ELSIZE NPY_MAX_ELSIZE - -#endif diff --git a/include/numpy/npy_3kcompat.h b/include/numpy/npy_3kcompat.h deleted file mode 100644 index d0cd9ac..0000000 --- a/include/numpy/npy_3kcompat.h +++ /dev/null @@ -1,417 +0,0 @@ -/* - * This is a convenience header file providing compatibility utilities - * for supporting Python 2 and Python 3 in the same code base. - * - * If you want to use this for your own projects, it's recommended to make a - * copy of it. Although the stuff below is unlikely to change, we don't provide - * strong backwards compatibility guarantees at the moment. - */ - -#ifndef _NPY_3KCOMPAT_H_ -#define _NPY_3KCOMPAT_H_ - -#include -#include - -#if PY_VERSION_HEX >= 0x03000000 -#ifndef NPY_PY3K -#define NPY_PY3K 1 -#endif -#endif - -#include "numpy/npy_common.h" -#include "numpy/ndarrayobject.h" - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * PyInt -> PyLong - */ - -#if defined(NPY_PY3K) -/* Return True only if the long fits in a C long */ -static NPY_INLINE int PyInt_Check(PyObject *op) { - int overflow = 0; - if (!PyLong_Check(op)) { - return 0; - } - PyLong_AsLongAndOverflow(op, &overflow); - return (overflow == 0); -} - -#define PyInt_FromLong PyLong_FromLong -#define PyInt_AsLong PyLong_AsLong -#define PyInt_AS_LONG PyLong_AsLong -#define PyInt_AsSsize_t PyLong_AsSsize_t - -/* NOTE: - * - * Since the PyLong type is very different from the fixed-range PyInt, - * we don't define PyInt_Type -> PyLong_Type. - */ -#endif /* NPY_PY3K */ - -/* - * PyString -> PyBytes - */ - -#if defined(NPY_PY3K) - -#define PyString_Type PyBytes_Type -#define PyString_Check PyBytes_Check -#define PyStringObject PyBytesObject -#define PyString_FromString PyBytes_FromString -#define PyString_FromStringAndSize PyBytes_FromStringAndSize -#define PyString_AS_STRING PyBytes_AS_STRING -#define PyString_AsStringAndSize PyBytes_AsStringAndSize -#define PyString_FromFormat PyBytes_FromFormat -#define PyString_Concat PyBytes_Concat -#define PyString_ConcatAndDel PyBytes_ConcatAndDel -#define PyString_AsString PyBytes_AsString -#define PyString_GET_SIZE PyBytes_GET_SIZE -#define PyString_Size PyBytes_Size - -#define PyUString_Type PyUnicode_Type -#define PyUString_Check PyUnicode_Check -#define PyUStringObject PyUnicodeObject -#define PyUString_FromString PyUnicode_FromString -#define PyUString_FromStringAndSize PyUnicode_FromStringAndSize -#define PyUString_FromFormat PyUnicode_FromFormat -#define PyUString_Concat PyUnicode_Concat2 -#define PyUString_ConcatAndDel PyUnicode_ConcatAndDel -#define PyUString_GET_SIZE PyUnicode_GET_SIZE -#define PyUString_Size PyUnicode_Size -#define PyUString_InternFromString PyUnicode_InternFromString -#define PyUString_Format PyUnicode_Format - -#else - -#define PyBytes_Type PyString_Type -#define PyBytes_Check PyString_Check -#define PyBytesObject PyStringObject -#define PyBytes_FromString PyString_FromString -#define PyBytes_FromStringAndSize PyString_FromStringAndSize -#define PyBytes_AS_STRING PyString_AS_STRING -#define PyBytes_AsStringAndSize PyString_AsStringAndSize -#define PyBytes_FromFormat PyString_FromFormat -#define PyBytes_Concat PyString_Concat -#define PyBytes_ConcatAndDel PyString_ConcatAndDel -#define PyBytes_AsString PyString_AsString -#define PyBytes_GET_SIZE PyString_GET_SIZE -#define PyBytes_Size PyString_Size - -#define PyUString_Type PyString_Type 
-#define PyUString_Check PyString_Check -#define PyUStringObject PyStringObject -#define PyUString_FromString PyString_FromString -#define PyUString_FromStringAndSize PyString_FromStringAndSize -#define PyUString_FromFormat PyString_FromFormat -#define PyUString_Concat PyString_Concat -#define PyUString_ConcatAndDel PyString_ConcatAndDel -#define PyUString_GET_SIZE PyString_GET_SIZE -#define PyUString_Size PyString_Size -#define PyUString_InternFromString PyString_InternFromString -#define PyUString_Format PyString_Format - -#endif /* NPY_PY3K */ - - -static NPY_INLINE void -PyUnicode_ConcatAndDel(PyObject **left, PyObject *right) -{ - PyObject *newobj; - newobj = PyUnicode_Concat(*left, right); - Py_DECREF(*left); - Py_DECREF(right); - *left = newobj; -} - -static NPY_INLINE void -PyUnicode_Concat2(PyObject **left, PyObject *right) -{ - PyObject *newobj; - newobj = PyUnicode_Concat(*left, right); - Py_DECREF(*left); - *left = newobj; -} - -/* - * PyFile_* compatibility - */ -#if defined(NPY_PY3K) - -/* - * Get a FILE* handle to the file represented by the Python object - */ -static NPY_INLINE FILE* -npy_PyFile_Dup(PyObject *file, char *mode) -{ - int fd, fd2; - PyObject *ret, *os; - Py_ssize_t pos; - FILE *handle; - /* Flush first to ensure things end up in the file in the correct order */ - ret = PyObject_CallMethod(file, "flush", ""); - if (ret == NULL) { - return NULL; - } - Py_DECREF(ret); - fd = PyObject_AsFileDescriptor(file); - if (fd == -1) { - return NULL; - } - os = PyImport_ImportModule("os"); - if (os == NULL) { - return NULL; - } - ret = PyObject_CallMethod(os, "dup", "i", fd); - Py_DECREF(os); - if (ret == NULL) { - return NULL; - } - fd2 = PyNumber_AsSsize_t(ret, NULL); - Py_DECREF(ret); -#ifdef _WIN32 - handle = _fdopen(fd2, mode); -#else - handle = fdopen(fd2, mode); -#endif - if (handle == NULL) { - PyErr_SetString(PyExc_IOError, - "Getting a FILE* from a Python file object failed"); - } - ret = PyObject_CallMethod(file, "tell", ""); - if (ret == NULL) { - fclose(handle); - return NULL; - } - pos = PyNumber_AsSsize_t(ret, PyExc_OverflowError); - Py_DECREF(ret); - if (PyErr_Occurred()) { - fclose(handle); - return NULL; - } - npy_fseek(handle, pos, SEEK_SET); - return handle; -} - -/* - * Close the dup-ed file handle, and seek the Python one to the current position - */ -static NPY_INLINE int -npy_PyFile_DupClose(PyObject *file, FILE* handle) -{ - PyObject *ret; - Py_ssize_t position; - position = npy_ftell(handle); - fclose(handle); - - ret = PyObject_CallMethod(file, "seek", NPY_SSIZE_T_PYFMT "i", position, 0); - if (ret == NULL) { - return -1; - } - Py_DECREF(ret); - return 0; -} - -static NPY_INLINE int -npy_PyFile_Check(PyObject *file) -{ - int fd; - fd = PyObject_AsFileDescriptor(file); - if (fd == -1) { - PyErr_Clear(); - return 0; - } - return 1; -} - -#else - -#define npy_PyFile_Dup(file, mode) PyFile_AsFile(file) -#define npy_PyFile_DupClose(file, handle) (0) -#define npy_PyFile_Check PyFile_Check - -#endif - -static NPY_INLINE PyObject* -npy_PyFile_OpenFile(PyObject *filename, const char *mode) -{ - PyObject *open; - open = PyDict_GetItemString(PyEval_GetBuiltins(), "open"); - if (open == NULL) { - return NULL; - } - return PyObject_CallFunction(open, "Os", filename, mode); -} - -static NPY_INLINE int -npy_PyFile_CloseFile(PyObject *file) -{ - PyObject *ret; - - ret = PyObject_CallMethod(file, "close", NULL); - if (ret == NULL) { - return -1; - } - Py_DECREF(ret); - return 0; -} - -/* - * PyObject_Cmp - */ -#if defined(NPY_PY3K) -static NPY_INLINE int 
-PyObject_Cmp(PyObject *i1, PyObject *i2, int *cmp) -{ - int v; - v = PyObject_RichCompareBool(i1, i2, Py_LT); - if (v == 0) { - *cmp = -1; - return 1; - } - else if (v == -1) { - return -1; - } - - v = PyObject_RichCompareBool(i1, i2, Py_GT); - if (v == 0) { - *cmp = 1; - return 1; - } - else if (v == -1) { - return -1; - } - - v = PyObject_RichCompareBool(i1, i2, Py_EQ); - if (v == 0) { - *cmp = 0; - return 1; - } - else { - *cmp = 0; - return -1; - } -} -#endif - -/* - * PyCObject functions adapted to PyCapsules. - * - * The main job here is to get rid of the improved error handling - * of PyCapsules. It's a shame... - */ -#if PY_VERSION_HEX >= 0x03000000 - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtr(void *ptr, void (*dtor)(PyObject *)) -{ - PyObject *ret = PyCapsule_New(ptr, NULL, dtor); - if (ret == NULL) { - PyErr_Clear(); - } - return ret; -} - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtrAndDesc(void *ptr, void* context, void (*dtor)(PyObject *)) -{ - PyObject *ret = NpyCapsule_FromVoidPtr(ptr, dtor); - if (ret != NULL && PyCapsule_SetContext(ret, context) != 0) { - PyErr_Clear(); - Py_DECREF(ret); - ret = NULL; - } - return ret; -} - -static NPY_INLINE void * -NpyCapsule_AsVoidPtr(PyObject *obj) -{ - void *ret = PyCapsule_GetPointer(obj, NULL); - if (ret == NULL) { - PyErr_Clear(); - } - return ret; -} - -static NPY_INLINE void * -NpyCapsule_GetDesc(PyObject *obj) -{ - return PyCapsule_GetContext(obj); -} - -static NPY_INLINE int -NpyCapsule_Check(PyObject *ptr) -{ - return PyCapsule_CheckExact(ptr); -} - -static NPY_INLINE void -simple_capsule_dtor(PyObject *cap) -{ - PyArray_free(PyCapsule_GetPointer(cap, NULL)); -} - -#else - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtr(void *ptr, void (*dtor)(void *)) -{ - return PyCObject_FromVoidPtr(ptr, dtor); -} - -static NPY_INLINE PyObject * -NpyCapsule_FromVoidPtrAndDesc(void *ptr, void* context, - void (*dtor)(void *, void *)) -{ - return PyCObject_FromVoidPtrAndDesc(ptr, context, dtor); -} - -static NPY_INLINE void * -NpyCapsule_AsVoidPtr(PyObject *ptr) -{ - return PyCObject_AsVoidPtr(ptr); -} - -static NPY_INLINE void * -NpyCapsule_GetDesc(PyObject *obj) -{ - return PyCObject_GetDesc(obj); -} - -static NPY_INLINE int -NpyCapsule_Check(PyObject *ptr) -{ - return PyCObject_Check(ptr); -} - -static NPY_INLINE void -simple_capsule_dtor(void *ptr) -{ - PyArray_free(ptr); -} - -#endif - -/* - * Hash value compatibility. - * As of Python 3.2 hash values are of type Py_hash_t. - * Previous versions use C long. - */ -#if PY_VERSION_HEX < 0x03020000 -typedef long npy_hash_t; -#define NPY_SIZEOF_HASH_T NPY_SIZEOF_LONG -#else -typedef Py_hash_t npy_hash_t; -#define NPY_SIZEOF_HASH_T NPY_SIZEOF_INTP -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* _NPY_3KCOMPAT_H_ */ diff --git a/include/numpy/npy_common.h b/include/numpy/npy_common.h deleted file mode 100644 index 7fca7e2..0000000 --- a/include/numpy/npy_common.h +++ /dev/null @@ -1,930 +0,0 @@ -#ifndef _NPY_COMMON_H_ -#define _NPY_COMMON_H_ - -/* numpconfig.h is auto-generated */ -#include "numpyconfig.h" - -#if defined(_MSC_VER) - #define NPY_INLINE __inline -#elif defined(__GNUC__) - #if defined(__STRICT_ANSI__) - #define NPY_INLINE __inline__ - #else - #define NPY_INLINE inline - #endif -#else - #define NPY_INLINE -#endif - -/* Enable 64 bit file position support on win-amd64. 
Ticket #1660 */ -#if defined(_MSC_VER) && defined(_WIN64) && (_MSC_VER > 1400) - #define npy_fseek _fseeki64 - #define npy_ftell _ftelli64 -#else - #define npy_fseek fseek - #define npy_ftell ftell -#endif - -/* enums for detected endianness */ -enum { - NPY_CPU_UNKNOWN_ENDIAN, - NPY_CPU_LITTLE, - NPY_CPU_BIG -}; - -/* - * This is to typedef npy_intp to the appropriate pointer size for - * this platform. Py_intptr_t, Py_uintptr_t are defined in pyport.h. - */ -typedef Py_intptr_t npy_intp; -typedef Py_uintptr_t npy_uintp; -#define NPY_SIZEOF_CHAR 1 -#define NPY_SIZEOF_BYTE 1 -#define NPY_SIZEOF_INTP NPY_SIZEOF_PY_INTPTR_T -#define NPY_SIZEOF_UINTP NPY_SIZEOF_PY_INTPTR_T -#define NPY_SIZEOF_CFLOAT NPY_SIZEOF_COMPLEX_FLOAT -#define NPY_SIZEOF_CDOUBLE NPY_SIZEOF_COMPLEX_DOUBLE -#define NPY_SIZEOF_CLONGDOUBLE NPY_SIZEOF_COMPLEX_LONGDOUBLE - -#ifdef constchar -#undef constchar -#endif - -#if (PY_VERSION_HEX < 0x02050000) - #ifndef PY_SSIZE_T_MIN - typedef int Py_ssize_t; - #define PY_SSIZE_T_MAX INT_MAX - #define PY_SSIZE_T_MIN INT_MIN - #endif -#define NPY_SSIZE_T_PYFMT "i" -#define constchar const char -#else -#define NPY_SSIZE_T_PYFMT "n" -#define constchar char -#endif - -/* NPY_INTP_FMT Note: - * Unlike the other NPY_*_FMT macros which are used with - * PyOS_snprintf, NPY_INTP_FMT is used with PyErr_Format and - * PyString_Format. These functions use different formatting - * codes which are portably specified according to the Python - * documentation. See ticket #1795. - * - * On Windows x64, the LONGLONG formatter should be used, but - * in Python 2.6 the %lld formatter is not supported. In this - * case we work around the problem by using the %zd formatter. - */ -#if NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_INT - #define NPY_INTP NPY_INT - #define NPY_UINTP NPY_UINT - #define PyIntpArrType_Type PyIntArrType_Type - #define PyUIntpArrType_Type PyUIntArrType_Type - #define NPY_MAX_INTP NPY_MAX_INT - #define NPY_MIN_INTP NPY_MIN_INT - #define NPY_MAX_UINTP NPY_MAX_UINT - #define NPY_INTP_FMT "d" -#elif NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_LONG - #define NPY_INTP NPY_LONG - #define NPY_UINTP NPY_ULONG - #define PyIntpArrType_Type PyLongArrType_Type - #define PyUIntpArrType_Type PyULongArrType_Type - #define NPY_MAX_INTP NPY_MAX_LONG - #define NPY_MIN_INTP NPY_MIN_LONG - #define NPY_MAX_UINTP NPY_MAX_ULONG - #define NPY_INTP_FMT "ld" -#elif defined(PY_LONG_LONG) && (NPY_SIZEOF_PY_INTPTR_T == NPY_SIZEOF_LONGLONG) - #define NPY_INTP NPY_LONGLONG - #define NPY_UINTP NPY_ULONGLONG - #define PyIntpArrType_Type PyLongLongArrType_Type - #define PyUIntpArrType_Type PyULongLongArrType_Type - #define NPY_MAX_INTP NPY_MAX_LONGLONG - #define NPY_MIN_INTP NPY_MIN_LONGLONG - #define NPY_MAX_UINTP NPY_MAX_ULONGLONG - #if (PY_VERSION_HEX >= 0x02070000) - #define NPY_INTP_FMT "lld" - #else - #define NPY_INTP_FMT "zd" - #endif -#endif - -/* - * We can only use C99 formats for npy_int_p if it is the same as - * intp_t, hence the condition on HAVE_UNITPTR_T - */ -#if (NPY_USE_C99_FORMATS) == 1 \ - && (defined HAVE_UINTPTR_T) \ - && (defined HAVE_INTTYPES_H) - #include - #undef NPY_INTP_FMT - #define NPY_INTP_FMT PRIdPTR -#endif - - -/* - * Some platforms don't define bool, long long, or long double. - * Handle that here. 
- */ -#define NPY_BYTE_FMT "hhd" -#define NPY_UBYTE_FMT "hhu" -#define NPY_SHORT_FMT "hd" -#define NPY_USHORT_FMT "hu" -#define NPY_INT_FMT "d" -#define NPY_UINT_FMT "u" -#define NPY_LONG_FMT "ld" -#define NPY_ULONG_FMT "lu" -#define NPY_HALF_FMT "g" -#define NPY_FLOAT_FMT "g" -#define NPY_DOUBLE_FMT "g" - - -#ifdef PY_LONG_LONG -typedef PY_LONG_LONG npy_longlong; -typedef unsigned PY_LONG_LONG npy_ulonglong; -# ifdef _MSC_VER -# define NPY_LONGLONG_FMT "I64d" -# define NPY_ULONGLONG_FMT "I64u" -# elif defined(__APPLE__) || defined(__FreeBSD__) -/* "%Ld" only parses 4 bytes -- "L" is floating modifier on MacOS X/BSD */ -# define NPY_LONGLONG_FMT "lld" -# define NPY_ULONGLONG_FMT "llu" -/* - another possible variant -- *quad_t works on *BSD, but is deprecated: - #define LONGLONG_FMT "qd" - #define ULONGLONG_FMT "qu" -*/ -# else -# define NPY_LONGLONG_FMT "Ld" -# define NPY_ULONGLONG_FMT "Lu" -# endif -# ifdef _MSC_VER -# define NPY_LONGLONG_SUFFIX(x) (x##i64) -# define NPY_ULONGLONG_SUFFIX(x) (x##Ui64) -# else -# define NPY_LONGLONG_SUFFIX(x) (x##LL) -# define NPY_ULONGLONG_SUFFIX(x) (x##ULL) -# endif -#else -typedef long npy_longlong; -typedef unsigned long npy_ulonglong; -# define NPY_LONGLONG_SUFFIX(x) (x##L) -# define NPY_ULONGLONG_SUFFIX(x) (x##UL) -#endif - - -typedef unsigned char npy_bool; -#define NPY_FALSE 0 -#define NPY_TRUE 1 - - -#if NPY_SIZEOF_LONGDOUBLE == NPY_SIZEOF_DOUBLE - typedef double npy_longdouble; - #define NPY_LONGDOUBLE_FMT "g" -#else - typedef long double npy_longdouble; - #define NPY_LONGDOUBLE_FMT "Lg" -#endif - -#ifndef Py_USING_UNICODE -#error Must use Python with unicode enabled. -#endif - - -typedef signed char npy_byte; -typedef unsigned char npy_ubyte; -typedef unsigned short npy_ushort; -typedef unsigned int npy_uint; -typedef unsigned long npy_ulong; - -/* These are for completeness */ -typedef char npy_char; -typedef short npy_short; -typedef int npy_int; -typedef long npy_long; -typedef float npy_float; -typedef double npy_double; - -/* - * Disabling C99 complex usage: a lot of C code in numpy/scipy rely on being - * able to do .real/.imag. Will have to convert code first. - */ -#if 0 -#if defined(NPY_USE_C99_COMPLEX) && defined(NPY_HAVE_COMPLEX_DOUBLE) -typedef complex npy_cdouble; -#else -typedef struct { double real, imag; } npy_cdouble; -#endif - -#if defined(NPY_USE_C99_COMPLEX) && defined(NPY_HAVE_COMPLEX_FLOAT) -typedef complex float npy_cfloat; -#else -typedef struct { float real, imag; } npy_cfloat; -#endif - -#if defined(NPY_USE_C99_COMPLEX) && defined(NPY_HAVE_COMPLEX_LONG_DOUBLE) -typedef complex long double npy_clongdouble; -#else -typedef struct {npy_longdouble real, imag;} npy_clongdouble; -#endif -#endif -#if NPY_SIZEOF_COMPLEX_DOUBLE != 2 * NPY_SIZEOF_DOUBLE -#error npy_cdouble definition is not compatible with C99 complex definition ! \ - Please contact Numpy maintainers and give detailed information about your \ - compiler and platform -#endif -typedef struct { double real, imag; } npy_cdouble; - -#if NPY_SIZEOF_COMPLEX_FLOAT != 2 * NPY_SIZEOF_FLOAT -#error npy_cfloat definition is not compatible with C99 complex definition ! \ - Please contact Numpy maintainers and give detailed information about your \ - compiler and platform -#endif -typedef struct { float real, imag; } npy_cfloat; - -#if NPY_SIZEOF_COMPLEX_LONGDOUBLE != 2 * NPY_SIZEOF_LONGDOUBLE -#error npy_clongdouble definition is not compatible with C99 complex definition ! 
\ - Please contact Numpy maintainers and give detailed information about your \ - compiler and platform -#endif -typedef struct { npy_longdouble real, imag; } npy_clongdouble; - -/* - * numarray-style bit-width typedefs - */ -#define NPY_MAX_INT8 127 -#define NPY_MIN_INT8 -128 -#define NPY_MAX_UINT8 255 -#define NPY_MAX_INT16 32767 -#define NPY_MIN_INT16 -32768 -#define NPY_MAX_UINT16 65535 -#define NPY_MAX_INT32 2147483647 -#define NPY_MIN_INT32 (-NPY_MAX_INT32 - 1) -#define NPY_MAX_UINT32 4294967295U -#define NPY_MAX_INT64 NPY_LONGLONG_SUFFIX(9223372036854775807) -#define NPY_MIN_INT64 (-NPY_MAX_INT64 - NPY_LONGLONG_SUFFIX(1)) -#define NPY_MAX_UINT64 NPY_ULONGLONG_SUFFIX(18446744073709551615) -#define NPY_MAX_INT128 NPY_LONGLONG_SUFFIX(85070591730234615865843651857942052864) -#define NPY_MIN_INT128 (-NPY_MAX_INT128 - NPY_LONGLONG_SUFFIX(1)) -#define NPY_MAX_UINT128 NPY_ULONGLONG_SUFFIX(170141183460469231731687303715884105728) -#define NPY_MAX_INT256 NPY_LONGLONG_SUFFIX(57896044618658097711785492504343953926634992332820282019728792003956564819967) -#define NPY_MIN_INT256 (-NPY_MAX_INT256 - NPY_LONGLONG_SUFFIX(1)) -#define NPY_MAX_UINT256 NPY_ULONGLONG_SUFFIX(115792089237316195423570985008687907853269984665640564039457584007913129639935) -#define NPY_MIN_DATETIME NPY_MIN_INT64 -#define NPY_MAX_DATETIME NPY_MAX_INT64 -#define NPY_MIN_TIMEDELTA NPY_MIN_INT64 -#define NPY_MAX_TIMEDELTA NPY_MAX_INT64 - - /* Need to find the number of bits for each type and - make definitions accordingly. - - C states that sizeof(char) == 1 by definition - - So, just using the sizeof keyword won't help. - - It also looks like Python itself uses sizeof(char) quite a - bit, which by definition should be 1 all the time. - - Idea: Make Use of CHAR_BIT which should tell us how many - BITS per CHARACTER - */ - - /* Include platform definitions -- These are in the C89/90 standard */ -#include -#define NPY_MAX_BYTE SCHAR_MAX -#define NPY_MIN_BYTE SCHAR_MIN -#define NPY_MAX_UBYTE UCHAR_MAX -#define NPY_MAX_SHORT SHRT_MAX -#define NPY_MIN_SHORT SHRT_MIN -#define NPY_MAX_USHORT USHRT_MAX -#define NPY_MAX_INT INT_MAX -#ifndef INT_MIN -#define INT_MIN (-INT_MAX - 1) -#endif -#define NPY_MIN_INT INT_MIN -#define NPY_MAX_UINT UINT_MAX -#define NPY_MAX_LONG LONG_MAX -#define NPY_MIN_LONG LONG_MIN -#define NPY_MAX_ULONG ULONG_MAX - -#define NPY_SIZEOF_HALF 2 -#define NPY_SIZEOF_DATETIME 8 -#define NPY_SIZEOF_TIMEDELTA 8 - -#define NPY_BITSOF_BOOL (sizeof(npy_bool) * CHAR_BIT) -#define NPY_BITSOF_CHAR CHAR_BIT -#define NPY_BITSOF_BYTE (NPY_SIZEOF_BYTE * CHAR_BIT) -#define NPY_BITSOF_SHORT (NPY_SIZEOF_SHORT * CHAR_BIT) -#define NPY_BITSOF_INT (NPY_SIZEOF_INT * CHAR_BIT) -#define NPY_BITSOF_LONG (NPY_SIZEOF_LONG * CHAR_BIT) -#define NPY_BITSOF_LONGLONG (NPY_SIZEOF_LONGLONG * CHAR_BIT) -#define NPY_BITSOF_INTP (NPY_SIZEOF_INTP * CHAR_BIT) -#define NPY_BITSOF_HALF (NPY_SIZEOF_HALF * CHAR_BIT) -#define NPY_BITSOF_FLOAT (NPY_SIZEOF_FLOAT * CHAR_BIT) -#define NPY_BITSOF_DOUBLE (NPY_SIZEOF_DOUBLE * CHAR_BIT) -#define NPY_BITSOF_LONGDOUBLE (NPY_SIZEOF_LONGDOUBLE * CHAR_BIT) -#define NPY_BITSOF_CFLOAT (NPY_SIZEOF_CFLOAT * CHAR_BIT) -#define NPY_BITSOF_CDOUBLE (NPY_SIZEOF_CDOUBLE * CHAR_BIT) -#define NPY_BITSOF_CLONGDOUBLE (NPY_SIZEOF_CLONGDOUBLE * CHAR_BIT) -#define NPY_BITSOF_DATETIME (NPY_SIZEOF_DATETIME * CHAR_BIT) -#define NPY_BITSOF_TIMEDELTA (NPY_SIZEOF_TIMEDELTA * CHAR_BIT) - -#if NPY_BITSOF_LONG == 8 -#define NPY_INT8 NPY_LONG -#define NPY_UINT8 NPY_ULONG - typedef long npy_int8; - typedef unsigned long npy_uint8; -#define 
PyInt8ScalarObject PyLongScalarObject -#define PyInt8ArrType_Type PyLongArrType_Type -#define PyUInt8ScalarObject PyULongScalarObject -#define PyUInt8ArrType_Type PyULongArrType_Type -#define NPY_INT8_FMT NPY_LONG_FMT -#define NPY_UINT8_FMT NPY_ULONG_FMT -#elif NPY_BITSOF_LONG == 16 -#define NPY_INT16 NPY_LONG -#define NPY_UINT16 NPY_ULONG - typedef long npy_int16; - typedef unsigned long npy_uint16; -#define PyInt16ScalarObject PyLongScalarObject -#define PyInt16ArrType_Type PyLongArrType_Type -#define PyUInt16ScalarObject PyULongScalarObject -#define PyUInt16ArrType_Type PyULongArrType_Type -#define NPY_INT16_FMT NPY_LONG_FMT -#define NPY_UINT16_FMT NPY_ULONG_FMT -#elif NPY_BITSOF_LONG == 32 -#define NPY_INT32 NPY_LONG -#define NPY_UINT32 NPY_ULONG - typedef long npy_int32; - typedef unsigned long npy_uint32; - typedef unsigned long npy_ucs4; -#define PyInt32ScalarObject PyLongScalarObject -#define PyInt32ArrType_Type PyLongArrType_Type -#define PyUInt32ScalarObject PyULongScalarObject -#define PyUInt32ArrType_Type PyULongArrType_Type -#define NPY_INT32_FMT NPY_LONG_FMT -#define NPY_UINT32_FMT NPY_ULONG_FMT -#elif NPY_BITSOF_LONG == 64 -#define NPY_INT64 NPY_LONG -#define NPY_UINT64 NPY_ULONG - typedef long npy_int64; - typedef unsigned long npy_uint64; -#define PyInt64ScalarObject PyLongScalarObject -#define PyInt64ArrType_Type PyLongArrType_Type -#define PyUInt64ScalarObject PyULongScalarObject -#define PyUInt64ArrType_Type PyULongArrType_Type -#define NPY_INT64_FMT NPY_LONG_FMT -#define NPY_UINT64_FMT NPY_ULONG_FMT -#define MyPyLong_FromInt64 PyLong_FromLong -#define MyPyLong_AsInt64 PyLong_AsLong -#elif NPY_BITSOF_LONG == 128 -#define NPY_INT128 NPY_LONG -#define NPY_UINT128 NPY_ULONG - typedef long npy_int128; - typedef unsigned long npy_uint128; -#define PyInt128ScalarObject PyLongScalarObject -#define PyInt128ArrType_Type PyLongArrType_Type -#define PyUInt128ScalarObject PyULongScalarObject -#define PyUInt128ArrType_Type PyULongArrType_Type -#define NPY_INT128_FMT NPY_LONG_FMT -#define NPY_UINT128_FMT NPY_ULONG_FMT -#endif - -#if NPY_BITSOF_LONGLONG == 8 -# ifndef NPY_INT8 -# define NPY_INT8 NPY_LONGLONG -# define NPY_UINT8 NPY_ULONGLONG - typedef npy_longlong npy_int8; - typedef npy_ulonglong npy_uint8; -# define PyInt8ScalarObject PyLongLongScalarObject -# define PyInt8ArrType_Type PyLongLongArrType_Type -# define PyUInt8ScalarObject PyULongLongScalarObject -# define PyUInt8ArrType_Type PyULongLongArrType_Type -#define NPY_INT8_FMT NPY_LONGLONG_FMT -#define NPY_UINT8_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT8 -# define NPY_MIN_LONGLONG NPY_MIN_INT8 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT8 -#elif NPY_BITSOF_LONGLONG == 16 -# ifndef NPY_INT16 -# define NPY_INT16 NPY_LONGLONG -# define NPY_UINT16 NPY_ULONGLONG - typedef npy_longlong npy_int16; - typedef npy_ulonglong npy_uint16; -# define PyInt16ScalarObject PyLongLongScalarObject -# define PyInt16ArrType_Type PyLongLongArrType_Type -# define PyUInt16ScalarObject PyULongLongScalarObject -# define PyUInt16ArrType_Type PyULongLongArrType_Type -#define NPY_INT16_FMT NPY_LONGLONG_FMT -#define NPY_UINT16_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT16 -# define NPY_MIN_LONGLONG NPY_MIN_INT16 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT16 -#elif NPY_BITSOF_LONGLONG == 32 -# ifndef NPY_INT32 -# define NPY_INT32 NPY_LONGLONG -# define NPY_UINT32 NPY_ULONGLONG - typedef npy_longlong npy_int32; - typedef npy_ulonglong npy_uint32; - typedef npy_ulonglong npy_ucs4; -# define PyInt32ScalarObject 
PyLongLongScalarObject -# define PyInt32ArrType_Type PyLongLongArrType_Type -# define PyUInt32ScalarObject PyULongLongScalarObject -# define PyUInt32ArrType_Type PyULongLongArrType_Type -#define NPY_INT32_FMT NPY_LONGLONG_FMT -#define NPY_UINT32_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT32 -# define NPY_MIN_LONGLONG NPY_MIN_INT32 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT32 -#elif NPY_BITSOF_LONGLONG == 64 -# ifndef NPY_INT64 -# define NPY_INT64 NPY_LONGLONG -# define NPY_UINT64 NPY_ULONGLONG - typedef npy_longlong npy_int64; - typedef npy_ulonglong npy_uint64; -# define PyInt64ScalarObject PyLongLongScalarObject -# define PyInt64ArrType_Type PyLongLongArrType_Type -# define PyUInt64ScalarObject PyULongLongScalarObject -# define PyUInt64ArrType_Type PyULongLongArrType_Type -#define NPY_INT64_FMT NPY_LONGLONG_FMT -#define NPY_UINT64_FMT NPY_ULONGLONG_FMT -# define MyPyLong_FromInt64 PyLong_FromLongLong -# define MyPyLong_AsInt64 PyLong_AsLongLong -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT64 -# define NPY_MIN_LONGLONG NPY_MIN_INT64 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT64 -#elif NPY_BITSOF_LONGLONG == 128 -# ifndef NPY_INT128 -# define NPY_INT128 NPY_LONGLONG -# define NPY_UINT128 NPY_ULONGLONG - typedef npy_longlong npy_int128; - typedef npy_ulonglong npy_uint128; -# define PyInt128ScalarObject PyLongLongScalarObject -# define PyInt128ArrType_Type PyLongLongArrType_Type -# define PyUInt128ScalarObject PyULongLongScalarObject -# define PyUInt128ArrType_Type PyULongLongArrType_Type -#define NPY_INT128_FMT NPY_LONGLONG_FMT -#define NPY_UINT128_FMT NPY_ULONGLONG_FMT -# endif -# define NPY_MAX_LONGLONG NPY_MAX_INT128 -# define NPY_MIN_LONGLONG NPY_MIN_INT128 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT128 -#elif NPY_BITSOF_LONGLONG == 256 -# define NPY_INT256 NPY_LONGLONG -# define NPY_UINT256 NPY_ULONGLONG - typedef npy_longlong npy_int256; - typedef npy_ulonglong npy_uint256; -# define PyInt256ScalarObject PyLongLongScalarObject -# define PyInt256ArrType_Type PyLongLongArrType_Type -# define PyUInt256ScalarObject PyULongLongScalarObject -# define PyUInt256ArrType_Type PyULongLongArrType_Type -#define NPY_INT256_FMT NPY_LONGLONG_FMT -#define NPY_UINT256_FMT NPY_ULONGLONG_FMT -# define NPY_MAX_LONGLONG NPY_MAX_INT256 -# define NPY_MIN_LONGLONG NPY_MIN_INT256 -# define NPY_MAX_ULONGLONG NPY_MAX_UINT256 -#endif - -#if NPY_BITSOF_INT == 8 -#ifndef NPY_INT8 -#define NPY_INT8 NPY_INT -#define NPY_UINT8 NPY_UINT - typedef int npy_int8; - typedef unsigned int npy_uint8; -# define PyInt8ScalarObject PyIntScalarObject -# define PyInt8ArrType_Type PyIntArrType_Type -# define PyUInt8ScalarObject PyUIntScalarObject -# define PyUInt8ArrType_Type PyUIntArrType_Type -#define NPY_INT8_FMT NPY_INT_FMT -#define NPY_UINT8_FMT NPY_UINT_FMT -#endif -#elif NPY_BITSOF_INT == 16 -#ifndef NPY_INT16 -#define NPY_INT16 NPY_INT -#define NPY_UINT16 NPY_UINT - typedef int npy_int16; - typedef unsigned int npy_uint16; -# define PyInt16ScalarObject PyIntScalarObject -# define PyInt16ArrType_Type PyIntArrType_Type -# define PyUInt16ScalarObject PyIntUScalarObject -# define PyUInt16ArrType_Type PyIntUArrType_Type -#define NPY_INT16_FMT NPY_INT_FMT -#define NPY_UINT16_FMT NPY_UINT_FMT -#endif -#elif NPY_BITSOF_INT == 32 -#ifndef NPY_INT32 -#define NPY_INT32 NPY_INT -#define NPY_UINT32 NPY_UINT - typedef int npy_int32; - typedef unsigned int npy_uint32; - typedef unsigned int npy_ucs4; -# define PyInt32ScalarObject PyIntScalarObject -# define PyInt32ArrType_Type PyIntArrType_Type -# define 
PyUInt32ScalarObject PyUIntScalarObject -# define PyUInt32ArrType_Type PyUIntArrType_Type -#define NPY_INT32_FMT NPY_INT_FMT -#define NPY_UINT32_FMT NPY_UINT_FMT -#endif -#elif NPY_BITSOF_INT == 64 -#ifndef NPY_INT64 -#define NPY_INT64 NPY_INT -#define NPY_UINT64 NPY_UINT - typedef int npy_int64; - typedef unsigned int npy_uint64; -# define PyInt64ScalarObject PyIntScalarObject -# define PyInt64ArrType_Type PyIntArrType_Type -# define PyUInt64ScalarObject PyUIntScalarObject -# define PyUInt64ArrType_Type PyUIntArrType_Type -#define NPY_INT64_FMT NPY_INT_FMT -#define NPY_UINT64_FMT NPY_UINT_FMT -# define MyPyLong_FromInt64 PyLong_FromLong -# define MyPyLong_AsInt64 PyLong_AsLong -#endif -#elif NPY_BITSOF_INT == 128 -#ifndef NPY_INT128 -#define NPY_INT128 NPY_INT -#define NPY_UINT128 NPY_UINT - typedef int npy_int128; - typedef unsigned int npy_uint128; -# define PyInt128ScalarObject PyIntScalarObject -# define PyInt128ArrType_Type PyIntArrType_Type -# define PyUInt128ScalarObject PyUIntScalarObject -# define PyUInt128ArrType_Type PyUIntArrType_Type -#define NPY_INT128_FMT NPY_INT_FMT -#define NPY_UINT128_FMT NPY_UINT_FMT -#endif -#endif - -#if NPY_BITSOF_SHORT == 8 -#ifndef NPY_INT8 -#define NPY_INT8 NPY_SHORT -#define NPY_UINT8 NPY_USHORT - typedef short npy_int8; - typedef unsigned short npy_uint8; -# define PyInt8ScalarObject PyShortScalarObject -# define PyInt8ArrType_Type PyShortArrType_Type -# define PyUInt8ScalarObject PyUShortScalarObject -# define PyUInt8ArrType_Type PyUShortArrType_Type -#define NPY_INT8_FMT NPY_SHORT_FMT -#define NPY_UINT8_FMT NPY_USHORT_FMT -#endif -#elif NPY_BITSOF_SHORT == 16 -#ifndef NPY_INT16 -#define NPY_INT16 NPY_SHORT -#define NPY_UINT16 NPY_USHORT - typedef short npy_int16; - typedef unsigned short npy_uint16; -# define PyInt16ScalarObject PyShortScalarObject -# define PyInt16ArrType_Type PyShortArrType_Type -# define PyUInt16ScalarObject PyUShortScalarObject -# define PyUInt16ArrType_Type PyUShortArrType_Type -#define NPY_INT16_FMT NPY_SHORT_FMT -#define NPY_UINT16_FMT NPY_USHORT_FMT -#endif -#elif NPY_BITSOF_SHORT == 32 -#ifndef NPY_INT32 -#define NPY_INT32 NPY_SHORT -#define NPY_UINT32 NPY_USHORT - typedef short npy_int32; - typedef unsigned short npy_uint32; - typedef unsigned short npy_ucs4; -# define PyInt32ScalarObject PyShortScalarObject -# define PyInt32ArrType_Type PyShortArrType_Type -# define PyUInt32ScalarObject PyUShortScalarObject -# define PyUInt32ArrType_Type PyUShortArrType_Type -#define NPY_INT32_FMT NPY_SHORT_FMT -#define NPY_UINT32_FMT NPY_USHORT_FMT -#endif -#elif NPY_BITSOF_SHORT == 64 -#ifndef NPY_INT64 -#define NPY_INT64 NPY_SHORT -#define NPY_UINT64 NPY_USHORT - typedef short npy_int64; - typedef unsigned short npy_uint64; -# define PyInt64ScalarObject PyShortScalarObject -# define PyInt64ArrType_Type PyShortArrType_Type -# define PyUInt64ScalarObject PyUShortScalarObject -# define PyUInt64ArrType_Type PyUShortArrType_Type -#define NPY_INT64_FMT NPY_SHORT_FMT -#define NPY_UINT64_FMT NPY_USHORT_FMT -# define MyPyLong_FromInt64 PyLong_FromLong -# define MyPyLong_AsInt64 PyLong_AsLong -#endif -#elif NPY_BITSOF_SHORT == 128 -#ifndef NPY_INT128 -#define NPY_INT128 NPY_SHORT -#define NPY_UINT128 NPY_USHORT - typedef short npy_int128; - typedef unsigned short npy_uint128; -# define PyInt128ScalarObject PyShortScalarObject -# define PyInt128ArrType_Type PyShortArrType_Type -# define PyUInt128ScalarObject PyUShortScalarObject -# define PyUInt128ArrType_Type PyUShortArrType_Type -#define NPY_INT128_FMT NPY_SHORT_FMT -#define 
NPY_UINT128_FMT NPY_USHORT_FMT -#endif -#endif - - -#if NPY_BITSOF_CHAR == 8 -#ifndef NPY_INT8 -#define NPY_INT8 NPY_BYTE -#define NPY_UINT8 NPY_UBYTE - typedef signed char npy_int8; - typedef unsigned char npy_uint8; -# define PyInt8ScalarObject PyByteScalarObject -# define PyInt8ArrType_Type PyByteArrType_Type -# define PyUInt8ScalarObject PyUByteScalarObject -# define PyUInt8ArrType_Type PyUByteArrType_Type -#define NPY_INT8_FMT NPY_BYTE_FMT -#define NPY_UINT8_FMT NPY_UBYTE_FMT -#endif -#elif NPY_BITSOF_CHAR == 16 -#ifndef NPY_INT16 -#define NPY_INT16 NPY_BYTE -#define NPY_UINT16 NPY_UBYTE - typedef signed char npy_int16; - typedef unsigned char npy_uint16; -# define PyInt16ScalarObject PyByteScalarObject -# define PyInt16ArrType_Type PyByteArrType_Type -# define PyUInt16ScalarObject PyUByteScalarObject -# define PyUInt16ArrType_Type PyUByteArrType_Type -#define NPY_INT16_FMT NPY_BYTE_FMT -#define NPY_UINT16_FMT NPY_UBYTE_FMT -#endif -#elif NPY_BITSOF_CHAR == 32 -#ifndef NPY_INT32 -#define NPY_INT32 NPY_BYTE -#define NPY_UINT32 NPY_UBYTE - typedef signed char npy_int32; - typedef unsigned char npy_uint32; - typedef unsigned char npy_ucs4; -# define PyInt32ScalarObject PyByteScalarObject -# define PyInt32ArrType_Type PyByteArrType_Type -# define PyUInt32ScalarObject PyUByteScalarObject -# define PyUInt32ArrType_Type PyUByteArrType_Type -#define NPY_INT32_FMT NPY_BYTE_FMT -#define NPY_UINT32_FMT NPY_UBYTE_FMT -#endif -#elif NPY_BITSOF_CHAR == 64 -#ifndef NPY_INT64 -#define NPY_INT64 NPY_BYTE -#define NPY_UINT64 NPY_UBYTE - typedef signed char npy_int64; - typedef unsigned char npy_uint64; -# define PyInt64ScalarObject PyByteScalarObject -# define PyInt64ArrType_Type PyByteArrType_Type -# define PyUInt64ScalarObject PyUByteScalarObject -# define PyUInt64ArrType_Type PyUByteArrType_Type -#define NPY_INT64_FMT NPY_BYTE_FMT -#define NPY_UINT64_FMT NPY_UBYTE_FMT -# define MyPyLong_FromInt64 PyLong_FromLong -# define MyPyLong_AsInt64 PyLong_AsLong -#endif -#elif NPY_BITSOF_CHAR == 128 -#ifndef NPY_INT128 -#define NPY_INT128 NPY_BYTE -#define NPY_UINT128 NPY_UBYTE - typedef signed char npy_int128; - typedef unsigned char npy_uint128; -# define PyInt128ScalarObject PyByteScalarObject -# define PyInt128ArrType_Type PyByteArrType_Type -# define PyUInt128ScalarObject PyUByteScalarObject -# define PyUInt128ArrType_Type PyUByteArrType_Type -#define NPY_INT128_FMT NPY_BYTE_FMT -#define NPY_UINT128_FMT NPY_UBYTE_FMT -#endif -#endif - - - -#if NPY_BITSOF_DOUBLE == 32 -#ifndef NPY_FLOAT32 -#define NPY_FLOAT32 NPY_DOUBLE -#define NPY_COMPLEX64 NPY_CDOUBLE - typedef double npy_float32; - typedef npy_cdouble npy_complex64; -# define PyFloat32ScalarObject PyDoubleScalarObject -# define PyComplex64ScalarObject PyCDoubleScalarObject -# define PyFloat32ArrType_Type PyDoubleArrType_Type -# define PyComplex64ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT32_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX64_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 64 -#ifndef NPY_FLOAT64 -#define NPY_FLOAT64 NPY_DOUBLE -#define NPY_COMPLEX128 NPY_CDOUBLE - typedef double npy_float64; - typedef npy_cdouble npy_complex128; -# define PyFloat64ScalarObject PyDoubleScalarObject -# define PyComplex128ScalarObject PyCDoubleScalarObject -# define PyFloat64ArrType_Type PyDoubleArrType_Type -# define PyComplex128ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT64_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX128_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 80 -#ifndef NPY_FLOAT80 -#define NPY_FLOAT80 NPY_DOUBLE -#define 
NPY_COMPLEX160 NPY_CDOUBLE - typedef double npy_float80; - typedef npy_cdouble npy_complex160; -# define PyFloat80ScalarObject PyDoubleScalarObject -# define PyComplex160ScalarObject PyCDoubleScalarObject -# define PyFloat80ArrType_Type PyDoubleArrType_Type -# define PyComplex160ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT80_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX160_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 96 -#ifndef NPY_FLOAT96 -#define NPY_FLOAT96 NPY_DOUBLE -#define NPY_COMPLEX192 NPY_CDOUBLE - typedef double npy_float96; - typedef npy_cdouble npy_complex192; -# define PyFloat96ScalarObject PyDoubleScalarObject -# define PyComplex192ScalarObject PyCDoubleScalarObject -# define PyFloat96ArrType_Type PyDoubleArrType_Type -# define PyComplex192ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT96_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX192_FMT NPY_CDOUBLE_FMT -#endif -#elif NPY_BITSOF_DOUBLE == 128 -#ifndef NPY_FLOAT128 -#define NPY_FLOAT128 NPY_DOUBLE -#define NPY_COMPLEX256 NPY_CDOUBLE - typedef double npy_float128; - typedef npy_cdouble npy_complex256; -# define PyFloat128ScalarObject PyDoubleScalarObject -# define PyComplex256ScalarObject PyCDoubleScalarObject -# define PyFloat128ArrType_Type PyDoubleArrType_Type -# define PyComplex256ArrType_Type PyCDoubleArrType_Type -#define NPY_FLOAT128_FMT NPY_DOUBLE_FMT -#define NPY_COMPLEX256_FMT NPY_CDOUBLE_FMT -#endif -#endif - - - -#if NPY_BITSOF_FLOAT == 32 -#ifndef NPY_FLOAT32 -#define NPY_FLOAT32 NPY_FLOAT -#define NPY_COMPLEX64 NPY_CFLOAT - typedef float npy_float32; - typedef npy_cfloat npy_complex64; -# define PyFloat32ScalarObject PyFloatScalarObject -# define PyComplex64ScalarObject PyCFloatScalarObject -# define PyFloat32ArrType_Type PyFloatArrType_Type -# define PyComplex64ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT32_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX64_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 64 -#ifndef NPY_FLOAT64 -#define NPY_FLOAT64 NPY_FLOAT -#define NPY_COMPLEX128 NPY_CFLOAT - typedef float npy_float64; - typedef npy_cfloat npy_complex128; -# define PyFloat64ScalarObject PyFloatScalarObject -# define PyComplex128ScalarObject PyCFloatScalarObject -# define PyFloat64ArrType_Type PyFloatArrType_Type -# define PyComplex128ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT64_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX128_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 80 -#ifndef NPY_FLOAT80 -#define NPY_FLOAT80 NPY_FLOAT -#define NPY_COMPLEX160 NPY_CFLOAT - typedef float npy_float80; - typedef npy_cfloat npy_complex160; -# define PyFloat80ScalarObject PyFloatScalarObject -# define PyComplex160ScalarObject PyCFloatScalarObject -# define PyFloat80ArrType_Type PyFloatArrType_Type -# define PyComplex160ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT80_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX160_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 96 -#ifndef NPY_FLOAT96 -#define NPY_FLOAT96 NPY_FLOAT -#define NPY_COMPLEX192 NPY_CFLOAT - typedef float npy_float96; - typedef npy_cfloat npy_complex192; -# define PyFloat96ScalarObject PyFloatScalarObject -# define PyComplex192ScalarObject PyCFloatScalarObject -# define PyFloat96ArrType_Type PyFloatArrType_Type -# define PyComplex192ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT96_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX192_FMT NPY_CFLOAT_FMT -#endif -#elif NPY_BITSOF_FLOAT == 128 -#ifndef NPY_FLOAT128 -#define NPY_FLOAT128 NPY_FLOAT -#define NPY_COMPLEX256 NPY_CFLOAT - typedef float npy_float128; - typedef npy_cfloat 
npy_complex256; -# define PyFloat128ScalarObject PyFloatScalarObject -# define PyComplex256ScalarObject PyCFloatScalarObject -# define PyFloat128ArrType_Type PyFloatArrType_Type -# define PyComplex256ArrType_Type PyCFloatArrType_Type -#define NPY_FLOAT128_FMT NPY_FLOAT_FMT -#define NPY_COMPLEX256_FMT NPY_CFLOAT_FMT -#endif -#endif - -/* half/float16 isn't a floating-point type in C */ -#define NPY_FLOAT16 NPY_HALF -typedef npy_uint16 npy_half; -typedef npy_half npy_float16; - -#if NPY_BITSOF_LONGDOUBLE == 32 -#ifndef NPY_FLOAT32 -#define NPY_FLOAT32 NPY_LONGDOUBLE -#define NPY_COMPLEX64 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float32; - typedef npy_clongdouble npy_complex64; -# define PyFloat32ScalarObject PyLongDoubleScalarObject -# define PyComplex64ScalarObject PyCLongDoubleScalarObject -# define PyFloat32ArrType_Type PyLongDoubleArrType_Type -# define PyComplex64ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT32_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX64_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 64 -#ifndef NPY_FLOAT64 -#define NPY_FLOAT64 NPY_LONGDOUBLE -#define NPY_COMPLEX128 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float64; - typedef npy_clongdouble npy_complex128; -# define PyFloat64ScalarObject PyLongDoubleScalarObject -# define PyComplex128ScalarObject PyCLongDoubleScalarObject -# define PyFloat64ArrType_Type PyLongDoubleArrType_Type -# define PyComplex128ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT64_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX128_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 80 -#ifndef NPY_FLOAT80 -#define NPY_FLOAT80 NPY_LONGDOUBLE -#define NPY_COMPLEX160 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float80; - typedef npy_clongdouble npy_complex160; -# define PyFloat80ScalarObject PyLongDoubleScalarObject -# define PyComplex160ScalarObject PyCLongDoubleScalarObject -# define PyFloat80ArrType_Type PyLongDoubleArrType_Type -# define PyComplex160ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT80_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX160_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 96 -#ifndef NPY_FLOAT96 -#define NPY_FLOAT96 NPY_LONGDOUBLE -#define NPY_COMPLEX192 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float96; - typedef npy_clongdouble npy_complex192; -# define PyFloat96ScalarObject PyLongDoubleScalarObject -# define PyComplex192ScalarObject PyCLongDoubleScalarObject -# define PyFloat96ArrType_Type PyLongDoubleArrType_Type -# define PyComplex192ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT96_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX192_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 128 -#ifndef NPY_FLOAT128 -#define NPY_FLOAT128 NPY_LONGDOUBLE -#define NPY_COMPLEX256 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float128; - typedef npy_clongdouble npy_complex256; -# define PyFloat128ScalarObject PyLongDoubleScalarObject -# define PyComplex256ScalarObject PyCLongDoubleScalarObject -# define PyFloat128ArrType_Type PyLongDoubleArrType_Type -# define PyComplex256ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT128_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX256_FMT NPY_CLONGDOUBLE_FMT -#endif -#elif NPY_BITSOF_LONGDOUBLE == 256 -#define NPY_FLOAT256 NPY_LONGDOUBLE -#define NPY_COMPLEX512 NPY_CLONGDOUBLE - typedef npy_longdouble npy_float256; - typedef npy_clongdouble npy_complex512; -# define PyFloat256ScalarObject PyLongDoubleScalarObject -# define PyComplex512ScalarObject PyCLongDoubleScalarObject -# define 
PyFloat256ArrType_Type PyLongDoubleArrType_Type -# define PyComplex512ArrType_Type PyCLongDoubleArrType_Type -#define NPY_FLOAT256_FMT NPY_LONGDOUBLE_FMT -#define NPY_COMPLEX512_FMT NPY_CLONGDOUBLE_FMT -#endif - -/* datetime typedefs */ -typedef npy_int64 npy_timedelta; -typedef npy_int64 npy_datetime; -#define NPY_DATETIME_FMT NPY_INT64_FMT -#define NPY_TIMEDELTA_FMT NPY_INT64_FMT - -/* End of typedefs for numarray style bit-width names */ - -#endif - diff --git a/include/numpy/npy_cpu.h b/include/numpy/npy_cpu.h deleted file mode 100644 index 9707a7a..0000000 --- a/include/numpy/npy_cpu.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * This set (target) cpu specific macros: - * - Possible values: - * NPY_CPU_X86 - * NPY_CPU_AMD64 - * NPY_CPU_PPC - * NPY_CPU_PPC64 - * NPY_CPU_SPARC - * NPY_CPU_S390 - * NPY_CPU_IA64 - * NPY_CPU_HPPA - * NPY_CPU_ALPHA - * NPY_CPU_ARMEL - * NPY_CPU_ARMEB - * NPY_CPU_SH_LE - * NPY_CPU_SH_BE - */ -#ifndef _NPY_CPUARCH_H_ -#define _NPY_CPUARCH_H_ - -#include "numpyconfig.h" - -#if defined( __i386__ ) || defined(i386) || defined(_M_IX86) - /* - * __i386__ is defined by gcc and Intel compiler on Linux, - * _M_IX86 by VS compiler, - * i386 by Sun compilers on opensolaris at least - */ - #define NPY_CPU_X86 -#elif defined(__x86_64__) || defined(__amd64__) || defined(__x86_64) || defined(_M_AMD64) - /* - * both __x86_64__ and __amd64__ are defined by gcc - * __x86_64 defined by sun compiler on opensolaris at least - * _M_AMD64 defined by MS compiler - */ - #define NPY_CPU_AMD64 -#elif defined(__ppc__) || defined(__powerpc__) || defined(_ARCH_PPC) - /* - * __ppc__ is defined by gcc, I remember having seen __powerpc__ once, - * but can't find it ATM - * _ARCH_PPC is used by at least gcc on AIX - */ - #define NPY_CPU_PPC -#elif defined(__ppc64__) - #define NPY_CPU_PPC64 -#elif defined(__sparc__) || defined(__sparc) - /* __sparc__ is defined by gcc and Forte (e.g. Sun) compilers */ - #define NPY_CPU_SPARC -#elif defined(__s390__) - #define NPY_CPU_S390 -#elif defined(__ia64) - #define NPY_CPU_IA64 -#elif defined(__hppa) - #define NPY_CPU_HPPA -#elif defined(__alpha__) - #define NPY_CPU_ALPHA -#elif defined(__arm__) && defined(__ARMEL__) - #define NPY_CPU_ARMEL -#elif defined(__arm__) && defined(__ARMEB__) - #define NPY_CPU_ARMEB -#elif defined(__sh__) && defined(__LITTLE_ENDIAN__) - #define NPY_CPU_SH_LE -#elif defined(__sh__) && defined(__BIG_ENDIAN__) - #define NPY_CPU_SH_BE -#elif defined(__MIPSEL__) - #define NPY_CPU_MIPSEL -#elif defined(__MIPSEB__) - #define NPY_CPU_MIPSEB -#elif defined(__aarch64__) - #define NPY_CPU_AARCH64 -#else - #error Unknown CPU, please report this to numpy maintainers with \ - information about your platform (OS, CPU and compiler) -#endif - -/* - This "white-lists" the architectures that we know don't require - pointer alignment. We white-list, since the memcpy version will - work everywhere, whereas assignment will only work where pointer - dereferencing doesn't require alignment. - - TODO: There may be more architectures we can white list. 
-*/ -#if defined(NPY_CPU_X86) || defined(NPY_CPU_AMD64) - #define NPY_COPY_PYOBJECT_PTR(dst, src) (*((PyObject **)(dst)) = *((PyObject **)(src))) -#else - #if NPY_SIZEOF_PY_INTPTR_T == 4 - #define NPY_COPY_PYOBJECT_PTR(dst, src) \ - ((char*)(dst))[0] = ((char*)(src))[0]; \ - ((char*)(dst))[1] = ((char*)(src))[1]; \ - ((char*)(dst))[2] = ((char*)(src))[2]; \ - ((char*)(dst))[3] = ((char*)(src))[3]; - #elif NPY_SIZEOF_PY_INTPTR_T == 8 - #define NPY_COPY_PYOBJECT_PTR(dst, src) \ - ((char*)(dst))[0] = ((char*)(src))[0]; \ - ((char*)(dst))[1] = ((char*)(src))[1]; \ - ((char*)(dst))[2] = ((char*)(src))[2]; \ - ((char*)(dst))[3] = ((char*)(src))[3]; \ - ((char*)(dst))[4] = ((char*)(src))[4]; \ - ((char*)(dst))[5] = ((char*)(src))[5]; \ - ((char*)(dst))[6] = ((char*)(src))[6]; \ - ((char*)(dst))[7] = ((char*)(src))[7]; - #else - #error Unknown architecture, please report this to numpy maintainers with \ - information about your platform (OS, CPU and compiler) - #endif -#endif - -#endif diff --git a/include/numpy/npy_deprecated_api.h b/include/numpy/npy_deprecated_api.h deleted file mode 100644 index c27b4a4..0000000 --- a/include/numpy/npy_deprecated_api.h +++ /dev/null @@ -1,129 +0,0 @@ -#ifndef _NPY_DEPRECATED_API_H -#define _NPY_DEPRECATED_API_H - -#if defined(_WIN32) -#define _WARN___STR2__(x) #x -#define _WARN___STR1__(x) _WARN___STR2__(x) -#define _WARN___LOC__ __FILE__ "(" _WARN___STR1__(__LINE__) ") : Warning Msg: " -#pragma message(_WARN___LOC__"Using deprecated NumPy API, disable it by " \ - "#defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION") -#elif defined(__GNUC__) -#warning "Using deprecated NumPy API, disable it by #defining NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION" -#endif -/* TODO: How to do this warning message for other compilers? */ - -/* - * This header exists to collect all dangerous/deprecated NumPy API. - * - * This is an attempt to remove bad API, the proliferation of macros, - * and namespace pollution currently produced by the NumPy headers. - */ - -#if defined(NPY_NO_DEPRECATED_API) -#error Should never include npy_deprecated_api directly. -#endif - -/* These array flags are deprecated as of NumPy 1.7 */ -#define NPY_CONTIGUOUS NPY_ARRAY_C_CONTIGUOUS -#define NPY_FORTRAN NPY_ARRAY_F_CONTIGUOUS - -/* - * The consistent NPY_ARRAY_* names which don't pollute the NPY_* - * namespace were added in NumPy 1.7. - * - * These versions of the carray flags are deprecated, but - * probably should only be removed after two releases instead of one. 
- */ -#define NPY_C_CONTIGUOUS NPY_ARRAY_C_CONTIGUOUS -#define NPY_F_CONTIGUOUS NPY_ARRAY_F_CONTIGUOUS -#define NPY_OWNDATA NPY_ARRAY_OWNDATA -#define NPY_FORCECAST NPY_ARRAY_FORCECAST -#define NPY_ENSURECOPY NPY_ARRAY_ENSURECOPY -#define NPY_ENSUREARRAY NPY_ARRAY_ENSUREARRAY -#define NPY_ELEMENTSTRIDES NPY_ARRAY_ELEMENTSTRIDES -#define NPY_ALIGNED NPY_ARRAY_ALIGNED -#define NPY_NOTSWAPPED NPY_ARRAY_NOTSWAPPED -#define NPY_WRITEABLE NPY_ARRAY_WRITEABLE -#define NPY_UPDATEIFCOPY NPY_ARRAY_UPDATEIFCOPY -#define NPY_BEHAVED NPY_ARRAY_BEHAVED -#define NPY_BEHAVED_NS NPY_ARRAY_BEHAVED_NS -#define NPY_CARRAY NPY_ARRAY_CARRAY -#define NPY_CARRAY_RO NPY_ARRAY_CARRAY_RO -#define NPY_FARRAY NPY_ARRAY_FARRAY -#define NPY_FARRAY_RO NPY_ARRAY_FARRAY_RO -#define NPY_DEFAULT NPY_ARRAY_DEFAULT -#define NPY_IN_ARRAY NPY_ARRAY_IN_ARRAY -#define NPY_OUT_ARRAY NPY_ARRAY_OUT_ARRAY -#define NPY_INOUT_ARRAY NPY_ARRAY_INOUT_ARRAY -#define NPY_IN_FARRAY NPY_ARRAY_IN_FARRAY -#define NPY_OUT_FARRAY NPY_ARRAY_OUT_FARRAY -#define NPY_INOUT_FARRAY NPY_ARRAY_INOUT_FARRAY -#define NPY_UPDATE_ALL NPY_ARRAY_UPDATE_ALL - -/* This way of accessing the default type is deprecated as of NumPy 1.7 */ -#define PyArray_DEFAULT NPY_DEFAULT_TYPE - -/* These DATETIME bits aren't used internally */ -#if PY_VERSION_HEX >= 0x03000000 -#define PyDataType_GetDatetimeMetaData(descr) \ - ((descr->metadata == NULL) ? NULL : \ - ((PyArray_DatetimeMetaData *)(PyCapsule_GetPointer( \ - PyDict_GetItemString( \ - descr->metadata, NPY_METADATA_DTSTR), NULL)))) -#else -#define PyDataType_GetDatetimeMetaData(descr) \ - ((descr->metadata == NULL) ? NULL : \ - ((PyArray_DatetimeMetaData *)(PyCObject_AsVoidPtr( \ - PyDict_GetItemString(descr->metadata, NPY_METADATA_DTSTR))))) -#endif - -/* - * Deprecated as of NumPy 1.7, this kind of shortcut doesn't - * belong in the public API. - */ -#define NPY_AO PyArrayObject - -/* - * Deprecated as of NumPy 1.7, an all-lowercase macro doesn't - * belong in the public API. - */ -#define fortran fortran_ - -/* - * Deprecated as of NumPy 1.7, as it is a namespace-polluting - * macro. - */ -#define FORTRAN_IF PyArray_FORTRAN_IF - -/* Deprecated as of NumPy 1.7, datetime64 uses c_metadata instead */ -#define NPY_METADATA_DTSTR "__timeunit__" - -/* - * Deprecated as of NumPy 1.7. - * The reasoning: - * - These are for datetime, but there's no datetime "namespace". - * - They just turn NPY_STR_ into "", which is just - * making something simple be indirected. - */ -#define NPY_STR_Y "Y" -#define NPY_STR_M "M" -#define NPY_STR_W "W" -#define NPY_STR_D "D" -#define NPY_STR_h "h" -#define NPY_STR_m "m" -#define NPY_STR_s "s" -#define NPY_STR_ms "ms" -#define NPY_STR_us "us" -#define NPY_STR_ns "ns" -#define NPY_STR_ps "ps" -#define NPY_STR_fs "fs" -#define NPY_STR_as "as" - -/* - * The macros in old_defines.h are Deprecated as of NumPy 1.7 and will be - * removed in the next major release. 
- */ -#include "old_defines.h" - - -#endif diff --git a/include/numpy/npy_endian.h b/include/numpy/npy_endian.h deleted file mode 100644 index 4e3349f..0000000 --- a/include/numpy/npy_endian.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef _NPY_ENDIAN_H_ -#define _NPY_ENDIAN_H_ - -/* - * NPY_BYTE_ORDER is set to the same value as BYTE_ORDER set by glibc in - * endian.h - */ - -#ifdef NPY_HAVE_ENDIAN_H - /* Use endian.h if available */ - #include - - #define NPY_BYTE_ORDER __BYTE_ORDER - #define NPY_LITTLE_ENDIAN __LITTLE_ENDIAN - #define NPY_BIG_ENDIAN __BIG_ENDIAN -#else - /* Set endianness info using target CPU */ - #include "npy_cpu.h" - - #define NPY_LITTLE_ENDIAN 1234 - #define NPY_BIG_ENDIAN 4321 - - #if defined(NPY_CPU_X86) \ - || defined(NPY_CPU_AMD64) \ - || defined(NPY_CPU_IA64) \ - || defined(NPY_CPU_ALPHA) \ - || defined(NPY_CPU_ARMEL) \ - || defined(NPY_CPU_AARCH64) \ - || defined(NPY_CPU_SH_LE) \ - || defined(NPY_CPU_MIPSEL) - #define NPY_BYTE_ORDER NPY_LITTLE_ENDIAN - #elif defined(NPY_CPU_PPC) \ - || defined(NPY_CPU_SPARC) \ - || defined(NPY_CPU_S390) \ - || defined(NPY_CPU_HPPA) \ - || defined(NPY_CPU_PPC64) \ - || defined(NPY_CPU_ARMEB) \ - || defined(NPY_CPU_SH_BE) \ - || defined(NPY_CPU_MIPSEB) - #define NPY_BYTE_ORDER NPY_BIG_ENDIAN - #else - #error Unknown CPU: can not set endianness - #endif -#endif - -#endif diff --git a/include/numpy/npy_interrupt.h b/include/numpy/npy_interrupt.h deleted file mode 100644 index f71fd68..0000000 --- a/include/numpy/npy_interrupt.h +++ /dev/null @@ -1,117 +0,0 @@ - -/* Signal handling: - -This header file defines macros that allow your code to handle -interrupts received during processing. Interrupts that -could reasonably be handled: - -SIGINT, SIGABRT, SIGALRM, SIGSEGV - -****Warning*************** - -Do not allow code that creates temporary memory or increases reference -counts of Python objects to be interrupted unless you handle it -differently. - -************************** - -The mechanism for handling interrupts is conceptually simple: - - - replace the signal handler with our own home-grown version - and store the old one. - - run the code to be interrupted -- if an interrupt occurs - the handler should basically just cause a return to the - calling function for finish work. - - restore the old signal handler - -Of course, every code that allows interrupts must account for -returning via the interrupt and handle clean-up correctly. But, -even still, the simple paradigm is complicated by at least three -factors. - - 1) platform portability (i.e. Microsoft says not to use longjmp - to return from signal handling. They have a __try and __except - extension to C instead but what about mingw?). - - 2) how to handle threads: apparently whether signals are delivered to - every thread of the process or the "invoking" thread is platform - dependent. --- we don't handle threads for now. - - 3) do we need to worry about re-entrance. For now, assume the - code will not call-back into itself. - -Ideas: - - 1) Start by implementing an approach that works on platforms that - can use setjmp and longjmp functionality and does nothing - on other platforms. - - 2) Ignore threads --- i.e. do not mix interrupt handling and threads - - 3) Add a default signal_handler function to the C-API but have the rest - use macros. 
- - -Simple Interface: - - -In your C-extension: around a block of code you want to be interruptable -with a SIGINT - -NPY_SIGINT_ON -[code] -NPY_SIGINT_OFF - -In order for this to work correctly, the -[code] block must not allocate any memory or alter the reference count of any -Python objects. In other words [code] must be interruptible so that continuation -after NPY_SIGINT_OFF will only be "missing some computations" - -Interrupt handling does not work well with threads. - -*/ - -/* Add signal handling macros - Make the global variable and signal handler part of the C-API -*/ - -#ifndef NPY_INTERRUPT_H -#define NPY_INTERRUPT_H - -#ifndef NPY_NO_SIGNAL - -#include -#include - -#ifndef sigsetjmp - -#define NPY_SIGSETJMP(arg1, arg2) setjmp(arg1) -#define NPY_SIGLONGJMP(arg1, arg2) longjmp(arg1, arg2) -#define NPY_SIGJMP_BUF jmp_buf - -#else - -#define NPY_SIGSETJMP(arg1, arg2) sigsetjmp(arg1, arg2) -#define NPY_SIGLONGJMP(arg1, arg2) siglongjmp(arg1, arg2) -#define NPY_SIGJMP_BUF sigjmp_buf - -#endif - -# define NPY_SIGINT_ON { \ - PyOS_sighandler_t _npy_sig_save; \ - _npy_sig_save = PyOS_setsig(SIGINT, _PyArray_SigintHandler); \ - if (NPY_SIGSETJMP(*((NPY_SIGJMP_BUF *)_PyArray_GetSigintBuf()), \ - 1) == 0) { \ - -# define NPY_SIGINT_OFF } \ - PyOS_setsig(SIGINT, _npy_sig_save); \ - } - -#else /* NPY_NO_SIGNAL */ - -#define NPY_SIGINT_ON -#define NPY_SIGINT_OFF - -#endif /* HAVE_SIGSETJMP */ - -#endif /* NPY_INTERRUPT_H */ diff --git a/include/numpy/npy_math.h b/include/numpy/npy_math.h deleted file mode 100644 index 7ae166e..0000000 --- a/include/numpy/npy_math.h +++ /dev/null @@ -1,438 +0,0 @@ -#ifndef __NPY_MATH_C99_H_ -#define __NPY_MATH_C99_H_ - -#include -#ifdef __SUNPRO_CC -#include -#endif -#include - -/* - * NAN and INFINITY like macros (same behavior as glibc for NAN, same as C99 - * for INFINITY) - * - * XXX: I should test whether INFINITY and NAN are available on the platform - */ -NPY_INLINE static float __npy_inff(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x7f800000UL}; - return __bint.__f; -} - -NPY_INLINE static float __npy_nanf(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x7fc00000UL}; - return __bint.__f; -} - -NPY_INLINE static float __npy_pzerof(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x00000000UL}; - return __bint.__f; -} - -NPY_INLINE static float __npy_nzerof(void) -{ - const union { npy_uint32 __i; float __f;} __bint = {0x80000000UL}; - return __bint.__f; -} - -#define NPY_INFINITYF __npy_inff() -#define NPY_NANF __npy_nanf() -#define NPY_PZEROF __npy_pzerof() -#define NPY_NZEROF __npy_nzerof() - -#define NPY_INFINITY ((npy_double)NPY_INFINITYF) -#define NPY_NAN ((npy_double)NPY_NANF) -#define NPY_PZERO ((npy_double)NPY_PZEROF) -#define NPY_NZERO ((npy_double)NPY_NZEROF) - -#define NPY_INFINITYL ((npy_longdouble)NPY_INFINITYF) -#define NPY_NANL ((npy_longdouble)NPY_NANF) -#define NPY_PZEROL ((npy_longdouble)NPY_PZEROF) -#define NPY_NZEROL ((npy_longdouble)NPY_NZEROF) - -/* - * Useful constants - */ -#define NPY_E 2.718281828459045235360287471352662498 /* e */ -#define NPY_LOG2E 1.442695040888963407359924681001892137 /* log_2 e */ -#define NPY_LOG10E 0.434294481903251827651128918916605082 /* log_10 e */ -#define NPY_LOGE2 0.693147180559945309417232121458176568 /* log_e 2 */ -#define NPY_LOGE10 2.302585092994045684017991454684364208 /* log_e 10 */ -#define NPY_PI 3.141592653589793238462643383279502884 /* pi */ -#define NPY_PI_2 1.570796326794896619231321691639751442 /* pi/2 */ -#define NPY_PI_4 
0.785398163397448309615660845819875721 /* pi/4 */ -#define NPY_1_PI 0.318309886183790671537767526745028724 /* 1/pi */ -#define NPY_2_PI 0.636619772367581343075535053490057448 /* 2/pi */ -#define NPY_EULER 0.577215664901532860606512090082402431 /* Euler constant */ -#define NPY_SQRT2 1.414213562373095048801688724209698079 /* sqrt(2) */ -#define NPY_SQRT1_2 0.707106781186547524400844362104849039 /* 1/sqrt(2) */ - -#define NPY_Ef 2.718281828459045235360287471352662498F /* e */ -#define NPY_LOG2Ef 1.442695040888963407359924681001892137F /* log_2 e */ -#define NPY_LOG10Ef 0.434294481903251827651128918916605082F /* log_10 e */ -#define NPY_LOGE2f 0.693147180559945309417232121458176568F /* log_e 2 */ -#define NPY_LOGE10f 2.302585092994045684017991454684364208F /* log_e 10 */ -#define NPY_PIf 3.141592653589793238462643383279502884F /* pi */ -#define NPY_PI_2f 1.570796326794896619231321691639751442F /* pi/2 */ -#define NPY_PI_4f 0.785398163397448309615660845819875721F /* pi/4 */ -#define NPY_1_PIf 0.318309886183790671537767526745028724F /* 1/pi */ -#define NPY_2_PIf 0.636619772367581343075535053490057448F /* 2/pi */ -#define NPY_EULERf 0.577215664901532860606512090082402431F /* Euler constan*/ -#define NPY_SQRT2f 1.414213562373095048801688724209698079F /* sqrt(2) */ -#define NPY_SQRT1_2f 0.707106781186547524400844362104849039F /* 1/sqrt(2) */ - -#define NPY_El 2.718281828459045235360287471352662498L /* e */ -#define NPY_LOG2El 1.442695040888963407359924681001892137L /* log_2 e */ -#define NPY_LOG10El 0.434294481903251827651128918916605082L /* log_10 e */ -#define NPY_LOGE2l 0.693147180559945309417232121458176568L /* log_e 2 */ -#define NPY_LOGE10l 2.302585092994045684017991454684364208L /* log_e 10 */ -#define NPY_PIl 3.141592653589793238462643383279502884L /* pi */ -#define NPY_PI_2l 1.570796326794896619231321691639751442L /* pi/2 */ -#define NPY_PI_4l 0.785398163397448309615660845819875721L /* pi/4 */ -#define NPY_1_PIl 0.318309886183790671537767526745028724L /* 1/pi */ -#define NPY_2_PIl 0.636619772367581343075535053490057448L /* 2/pi */ -#define NPY_EULERl 0.577215664901532860606512090082402431L /* Euler constan*/ -#define NPY_SQRT2l 1.414213562373095048801688724209698079L /* sqrt(2) */ -#define NPY_SQRT1_2l 0.707106781186547524400844362104849039L /* 1/sqrt(2) */ - -/* - * C99 double math funcs - */ -double npy_sin(double x); -double npy_cos(double x); -double npy_tan(double x); -double npy_sinh(double x); -double npy_cosh(double x); -double npy_tanh(double x); - -double npy_asin(double x); -double npy_acos(double x); -double npy_atan(double x); -double npy_aexp(double x); -double npy_alog(double x); -double npy_asqrt(double x); -double npy_afabs(double x); - -double npy_log(double x); -double npy_log10(double x); -double npy_exp(double x); -double npy_sqrt(double x); - -double npy_fabs(double x); -double npy_ceil(double x); -double npy_fmod(double x, double y); -double npy_floor(double x); - -double npy_expm1(double x); -double npy_log1p(double x); -double npy_hypot(double x, double y); -double npy_acosh(double x); -double npy_asinh(double xx); -double npy_atanh(double x); -double npy_rint(double x); -double npy_trunc(double x); -double npy_exp2(double x); -double npy_log2(double x); - -double npy_atan2(double x, double y); -double npy_pow(double x, double y); -double npy_modf(double x, double* y); - -double npy_copysign(double x, double y); -double npy_nextafter(double x, double y); -double npy_spacing(double x); - -/* - * IEEE 754 fpu handling. 
Those are guaranteed to be macros - */ -#ifndef NPY_HAVE_DECL_ISNAN - #define npy_isnan(x) ((x) != (x)) -#else - #ifdef _MSC_VER - #define npy_isnan(x) _isnan((x)) - #else - #define npy_isnan(x) isnan((x)) - #endif -#endif - -#ifndef NPY_HAVE_DECL_ISFINITE - #ifdef _MSC_VER - #define npy_isfinite(x) _finite((x)) - #else - #define npy_isfinite(x) !npy_isnan((x) + (-x)) - #endif -#else - #define npy_isfinite(x) isfinite((x)) -#endif - -#ifndef NPY_HAVE_DECL_ISINF - #define npy_isinf(x) (!npy_isfinite(x) && !npy_isnan(x)) -#else - #ifdef _MSC_VER - #define npy_isinf(x) (!_finite((x)) && !_isnan((x))) - #else - #define npy_isinf(x) isinf((x)) - #endif -#endif - -#ifndef NPY_HAVE_DECL_SIGNBIT - int _npy_signbit_f(float x); - int _npy_signbit_d(double x); - int _npy_signbit_ld(long double x); - #define npy_signbit(x) \ - (sizeof (x) == sizeof (long double) ? _npy_signbit_ld (x) \ - : sizeof (x) == sizeof (double) ? _npy_signbit_d (x) \ - : _npy_signbit_f (x)) -#else - #define npy_signbit(x) signbit((x)) -#endif - -/* - * float C99 math functions - */ - -float npy_sinf(float x); -float npy_cosf(float x); -float npy_tanf(float x); -float npy_sinhf(float x); -float npy_coshf(float x); -float npy_tanhf(float x); -float npy_fabsf(float x); -float npy_floorf(float x); -float npy_ceilf(float x); -float npy_rintf(float x); -float npy_truncf(float x); -float npy_sqrtf(float x); -float npy_log10f(float x); -float npy_logf(float x); -float npy_expf(float x); -float npy_expm1f(float x); -float npy_asinf(float x); -float npy_acosf(float x); -float npy_atanf(float x); -float npy_asinhf(float x); -float npy_acoshf(float x); -float npy_atanhf(float x); -float npy_log1pf(float x); -float npy_exp2f(float x); -float npy_log2f(float x); - -float npy_atan2f(float x, float y); -float npy_hypotf(float x, float y); -float npy_powf(float x, float y); -float npy_fmodf(float x, float y); - -float npy_modff(float x, float* y); - -float npy_copysignf(float x, float y); -float npy_nextafterf(float x, float y); -float npy_spacingf(float x); - -/* - * float C99 math functions - */ - -npy_longdouble npy_sinl(npy_longdouble x); -npy_longdouble npy_cosl(npy_longdouble x); -npy_longdouble npy_tanl(npy_longdouble x); -npy_longdouble npy_sinhl(npy_longdouble x); -npy_longdouble npy_coshl(npy_longdouble x); -npy_longdouble npy_tanhl(npy_longdouble x); -npy_longdouble npy_fabsl(npy_longdouble x); -npy_longdouble npy_floorl(npy_longdouble x); -npy_longdouble npy_ceill(npy_longdouble x); -npy_longdouble npy_rintl(npy_longdouble x); -npy_longdouble npy_truncl(npy_longdouble x); -npy_longdouble npy_sqrtl(npy_longdouble x); -npy_longdouble npy_log10l(npy_longdouble x); -npy_longdouble npy_logl(npy_longdouble x); -npy_longdouble npy_expl(npy_longdouble x); -npy_longdouble npy_expm1l(npy_longdouble x); -npy_longdouble npy_asinl(npy_longdouble x); -npy_longdouble npy_acosl(npy_longdouble x); -npy_longdouble npy_atanl(npy_longdouble x); -npy_longdouble npy_asinhl(npy_longdouble x); -npy_longdouble npy_acoshl(npy_longdouble x); -npy_longdouble npy_atanhl(npy_longdouble x); -npy_longdouble npy_log1pl(npy_longdouble x); -npy_longdouble npy_exp2l(npy_longdouble x); -npy_longdouble npy_log2l(npy_longdouble x); - -npy_longdouble npy_atan2l(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_hypotl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_powl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_fmodl(npy_longdouble x, npy_longdouble y); - -npy_longdouble npy_modfl(npy_longdouble x, npy_longdouble* y); - -npy_longdouble 
npy_copysignl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_nextafterl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_spacingl(npy_longdouble x); - -/* - * Non standard functions - */ -double npy_deg2rad(double x); -double npy_rad2deg(double x); -double npy_logaddexp(double x, double y); -double npy_logaddexp2(double x, double y); - -float npy_deg2radf(float x); -float npy_rad2degf(float x); -float npy_logaddexpf(float x, float y); -float npy_logaddexp2f(float x, float y); - -npy_longdouble npy_deg2radl(npy_longdouble x); -npy_longdouble npy_rad2degl(npy_longdouble x); -npy_longdouble npy_logaddexpl(npy_longdouble x, npy_longdouble y); -npy_longdouble npy_logaddexp2l(npy_longdouble x, npy_longdouble y); - -#define npy_degrees npy_rad2deg -#define npy_degreesf npy_rad2degf -#define npy_degreesl npy_rad2degl - -#define npy_radians npy_deg2rad -#define npy_radiansf npy_deg2radf -#define npy_radiansl npy_deg2radl - -/* - * Complex declarations - */ - -/* - * C99 specifies that complex numbers have the same representation as - * an array of two elements, where the first element is the real part - * and the second element is the imaginary part. - */ -#define __NPY_CPACK_IMP(x, y, type, ctype) \ - union { \ - ctype z; \ - type a[2]; \ - } z1;; \ - \ - z1.a[0] = (x); \ - z1.a[1] = (y); \ - \ - return z1.z; - -static NPY_INLINE npy_cdouble npy_cpack(double x, double y) -{ - __NPY_CPACK_IMP(x, y, double, npy_cdouble); -} - -static NPY_INLINE npy_cfloat npy_cpackf(float x, float y) -{ - __NPY_CPACK_IMP(x, y, float, npy_cfloat); -} - -static NPY_INLINE npy_clongdouble npy_cpackl(npy_longdouble x, npy_longdouble y) -{ - __NPY_CPACK_IMP(x, y, npy_longdouble, npy_clongdouble); -} -#undef __NPY_CPACK_IMP - -/* - * Same remark as above, but in the other direction: extract first/second - * member of complex number, assuming a C99-compatible representation - * - * Those are defineds as static inline, and such as a reasonable compiler would - * most likely compile this to one or two instructions (on CISC at least) - */ -#define __NPY_CEXTRACT_IMP(z, index, type, ctype) \ - union { \ - ctype z; \ - type a[2]; \ - } __z_repr; \ - __z_repr.z = z; \ - \ - return __z_repr.a[index]; - -static NPY_INLINE double npy_creal(npy_cdouble z) -{ - __NPY_CEXTRACT_IMP(z, 0, double, npy_cdouble); -} - -static NPY_INLINE double npy_cimag(npy_cdouble z) -{ - __NPY_CEXTRACT_IMP(z, 1, double, npy_cdouble); -} - -static NPY_INLINE float npy_crealf(npy_cfloat z) -{ - __NPY_CEXTRACT_IMP(z, 0, float, npy_cfloat); -} - -static NPY_INLINE float npy_cimagf(npy_cfloat z) -{ - __NPY_CEXTRACT_IMP(z, 1, float, npy_cfloat); -} - -static NPY_INLINE npy_longdouble npy_creall(npy_clongdouble z) -{ - __NPY_CEXTRACT_IMP(z, 0, npy_longdouble, npy_clongdouble); -} - -static NPY_INLINE npy_longdouble npy_cimagl(npy_clongdouble z) -{ - __NPY_CEXTRACT_IMP(z, 1, npy_longdouble, npy_clongdouble); -} -#undef __NPY_CEXTRACT_IMP - -/* - * Double precision complex functions - */ -double npy_cabs(npy_cdouble z); -double npy_carg(npy_cdouble z); - -npy_cdouble npy_cexp(npy_cdouble z); -npy_cdouble npy_clog(npy_cdouble z); -npy_cdouble npy_cpow(npy_cdouble x, npy_cdouble y); - -npy_cdouble npy_csqrt(npy_cdouble z); - -npy_cdouble npy_ccos(npy_cdouble z); -npy_cdouble npy_csin(npy_cdouble z); - -/* - * Single precision complex functions - */ -float npy_cabsf(npy_cfloat z); -float npy_cargf(npy_cfloat z); - -npy_cfloat npy_cexpf(npy_cfloat z); -npy_cfloat npy_clogf(npy_cfloat z); -npy_cfloat npy_cpowf(npy_cfloat x, npy_cfloat y); - 
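/*
 * Editorial sketch (not part of the deleted npy_math.h): the npy_cpack()/
 * npy_creal()/npy_cimag() helpers above rely on the C99 guarantee that a
 * complex value has the same representation as a two-element array, real
 * part first.  The self-contained example below reproduces that union
 * trick; the local "my_cdouble" type is an assumption standing in for
 * npy_cdouble so the snippet compiles without any NumPy headers.
 */
#include <stdio.h>

typedef struct { double buf[2]; } my_cdouble;  /* buf[0] = real, buf[1] = imag */

static my_cdouble my_cpack(double x, double y)
{
    union { my_cdouble z; double a[2]; } u;
    u.a[0] = x;                     /* real part */
    u.a[1] = y;                     /* imaginary part */
    return u.z;
}

static double my_creal(my_cdouble z)
{
    union { my_cdouble z; double a[2]; } u;
    u.z = z;
    return u.a[0];
}

static double my_cimag(my_cdouble z)
{
    union { my_cdouble z; double a[2]; } u;
    u.z = z;
    return u.a[1];
}

int main(void)
{
    my_cdouble z = my_cpack(3.0, -4.0);
    printf("%g%+gi\n", my_creal(z), my_cimag(z));   /* prints 3-4i */
    return 0;
}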
-npy_cfloat npy_csqrtf(npy_cfloat z); - -npy_cfloat npy_ccosf(npy_cfloat z); -npy_cfloat npy_csinf(npy_cfloat z); - -/* - * Extended precision complex functions - */ -npy_longdouble npy_cabsl(npy_clongdouble z); -npy_longdouble npy_cargl(npy_clongdouble z); - -npy_clongdouble npy_cexpl(npy_clongdouble z); -npy_clongdouble npy_clogl(npy_clongdouble z); -npy_clongdouble npy_cpowl(npy_clongdouble x, npy_clongdouble y); - -npy_clongdouble npy_csqrtl(npy_clongdouble z); - -npy_clongdouble npy_ccosl(npy_clongdouble z); -npy_clongdouble npy_csinl(npy_clongdouble z); - -/* - * Functions that set the floating point error - * status word. - */ - -void npy_set_floatstatus_divbyzero(void); -void npy_set_floatstatus_overflow(void); -void npy_set_floatstatus_underflow(void); -void npy_set_floatstatus_invalid(void); - -#endif diff --git a/include/numpy/npy_no_deprecated_api.h b/include/numpy/npy_no_deprecated_api.h deleted file mode 100644 index 6183dc2..0000000 --- a/include/numpy/npy_no_deprecated_api.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * This include file is provided for inclusion in Cython *.pyd files where - * one would like to define the NPY_NO_DEPRECATED_API macro. It can be - * included by - * - * cdef extern from "npy_no_deprecated_api.h": pass - * - */ -#ifndef NPY_NO_DEPRECATED_API - -/* put this check here since there may be multiple includes in C extensions. */ -#if defined(NDARRAYTYPES_H) || defined(_NPY_DEPRECATED_API_H) || \ - defined(OLD_DEFINES_H) -#error "npy_no_deprecated_api.h" must be first among numpy includes. -#else -#define NPY_NO_DEPRECATED_API NPY_API_VERSION -#endif - -#endif diff --git a/include/numpy/npy_os.h b/include/numpy/npy_os.h deleted file mode 100644 index 9228c39..0000000 --- a/include/numpy/npy_os.h +++ /dev/null @@ -1,30 +0,0 @@ -#ifndef _NPY_OS_H_ -#define _NPY_OS_H_ - -#if defined(linux) || defined(__linux) || defined(__linux__) - #define NPY_OS_LINUX -#elif defined(__FreeBSD__) || defined(__NetBSD__) || \ - defined(__OpenBSD__) || defined(__DragonFly__) - #define NPY_OS_BSD - #ifdef __FreeBSD__ - #define NPY_OS_FREEBSD - #elif defined(__NetBSD__) - #define NPY_OS_NETBSD - #elif defined(__OpenBSD__) - #define NPY_OS_OPENBSD - #elif defined(__DragonFly__) - #define NPY_OS_DRAGONFLY - #endif -#elif defined(sun) || defined(__sun) - #define NPY_OS_SOLARIS -#elif defined(__CYGWIN__) - #define NPY_OS_CYGWIN -#elif defined(_WIN32) || defined(__WIN32__) || defined(WIN32) - #define NPY_OS_WIN32 -#elif defined(__APPLE__) - #define NPY_OS_DARWIN -#else - #define NPY_OS_UNKNOWN -#endif - -#endif diff --git a/include/numpy/numpyconfig.h b/include/numpy/numpyconfig.h deleted file mode 100644 index 401d19f..0000000 --- a/include/numpy/numpyconfig.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef _NPY_NUMPYCONFIG_H_ -#define _NPY_NUMPYCONFIG_H_ - -#include "_numpyconfig.h" - -/* - * On Mac OS X, because there is only one configuration stage for all the archs - * in universal builds, any macro which depends on the arch needs to be - * harcoded - */ -#ifdef __APPLE__ - #undef NPY_SIZEOF_LONG - #undef NPY_SIZEOF_PY_INTPTR_T - - #ifdef __LP64__ - #define NPY_SIZEOF_LONG 8 - #define NPY_SIZEOF_PY_INTPTR_T 8 - #else - #define NPY_SIZEOF_LONG 4 - #define NPY_SIZEOF_PY_INTPTR_T 4 - #endif -#endif - -/** - * To help with the NPY_NO_DEPRECATED_API macro, we include API version - * numbers for specific versions of NumPy. 
To exclude all API that was - * deprecated as of 1.7, add the following before #including any NumPy - * headers: - * #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION - */ -#define NPY_1_7_API_VERSION 0x00000007 - -#endif diff --git a/include/numpy/old_defines.h b/include/numpy/old_defines.h deleted file mode 100644 index abf8159..0000000 --- a/include/numpy/old_defines.h +++ /dev/null @@ -1,187 +0,0 @@ -/* This header is deprecated as of NumPy 1.7 */ -#ifndef OLD_DEFINES_H -#define OLD_DEFINES_H - -#if defined(NPY_NO_DEPRECATED_API) && NPY_NO_DEPRECATED_API >= NPY_1_7_API_VERSION -#error The header "old_defines.h" is deprecated as of NumPy 1.7. -#endif - -#define NDARRAY_VERSION NPY_VERSION - -#define PyArray_MIN_BUFSIZE NPY_MIN_BUFSIZE -#define PyArray_MAX_BUFSIZE NPY_MAX_BUFSIZE -#define PyArray_BUFSIZE NPY_BUFSIZE - -#define PyArray_PRIORITY NPY_PRIORITY -#define PyArray_SUBTYPE_PRIORITY NPY_PRIORITY -#define PyArray_NUM_FLOATTYPE NPY_NUM_FLOATTYPE - -#define NPY_MAX PyArray_MAX -#define NPY_MIN PyArray_MIN - -#define PyArray_TYPES NPY_TYPES -#define PyArray_BOOL NPY_BOOL -#define PyArray_BYTE NPY_BYTE -#define PyArray_UBYTE NPY_UBYTE -#define PyArray_SHORT NPY_SHORT -#define PyArray_USHORT NPY_USHORT -#define PyArray_INT NPY_INT -#define PyArray_UINT NPY_UINT -#define PyArray_LONG NPY_LONG -#define PyArray_ULONG NPY_ULONG -#define PyArray_LONGLONG NPY_LONGLONG -#define PyArray_ULONGLONG NPY_ULONGLONG -#define PyArray_HALF NPY_HALF -#define PyArray_FLOAT NPY_FLOAT -#define PyArray_DOUBLE NPY_DOUBLE -#define PyArray_LONGDOUBLE NPY_LONGDOUBLE -#define PyArray_CFLOAT NPY_CFLOAT -#define PyArray_CDOUBLE NPY_CDOUBLE -#define PyArray_CLONGDOUBLE NPY_CLONGDOUBLE -#define PyArray_OBJECT NPY_OBJECT -#define PyArray_STRING NPY_STRING -#define PyArray_UNICODE NPY_UNICODE -#define PyArray_VOID NPY_VOID -#define PyArray_DATETIME NPY_DATETIME -#define PyArray_TIMEDELTA NPY_TIMEDELTA -#define PyArray_NTYPES NPY_NTYPES -#define PyArray_NOTYPE NPY_NOTYPE -#define PyArray_CHAR NPY_CHAR -#define PyArray_USERDEF NPY_USERDEF -#define PyArray_NUMUSERTYPES NPY_NUMUSERTYPES - -#define PyArray_INTP NPY_INTP -#define PyArray_UINTP NPY_UINTP - -#define PyArray_INT8 NPY_INT8 -#define PyArray_UINT8 NPY_UINT8 -#define PyArray_INT16 NPY_INT16 -#define PyArray_UINT16 NPY_UINT16 -#define PyArray_INT32 NPY_INT32 -#define PyArray_UINT32 NPY_UINT32 - -#ifdef NPY_INT64 -#define PyArray_INT64 NPY_INT64 -#define PyArray_UINT64 NPY_UINT64 -#endif - -#ifdef NPY_INT128 -#define PyArray_INT128 NPY_INT128 -#define PyArray_UINT128 NPY_UINT128 -#endif - -#ifdef NPY_FLOAT16 -#define PyArray_FLOAT16 NPY_FLOAT16 -#define PyArray_COMPLEX32 NPY_COMPLEX32 -#endif - -#ifdef NPY_FLOAT80 -#define PyArray_FLOAT80 NPY_FLOAT80 -#define PyArray_COMPLEX160 NPY_COMPLEX160 -#endif - -#ifdef NPY_FLOAT96 -#define PyArray_FLOAT96 NPY_FLOAT96 -#define PyArray_COMPLEX192 NPY_COMPLEX192 -#endif - -#ifdef NPY_FLOAT128 -#define PyArray_FLOAT128 NPY_FLOAT128 -#define PyArray_COMPLEX256 NPY_COMPLEX256 -#endif - -#define PyArray_FLOAT32 NPY_FLOAT32 -#define PyArray_COMPLEX64 NPY_COMPLEX64 -#define PyArray_FLOAT64 NPY_FLOAT64 -#define PyArray_COMPLEX128 NPY_COMPLEX128 - - -#define PyArray_TYPECHAR NPY_TYPECHAR -#define PyArray_BOOLLTR NPY_BOOLLTR -#define PyArray_BYTELTR NPY_BYTELTR -#define PyArray_UBYTELTR NPY_UBYTELTR -#define PyArray_SHORTLTR NPY_SHORTLTR -#define PyArray_USHORTLTR NPY_USHORTLTR -#define PyArray_INTLTR NPY_INTLTR -#define PyArray_UINTLTR NPY_UINTLTR -#define PyArray_LONGLTR NPY_LONGLTR -#define PyArray_ULONGLTR NPY_ULONGLTR -#define 
PyArray_LONGLONGLTR NPY_LONGLONGLTR -#define PyArray_ULONGLONGLTR NPY_ULONGLONGLTR -#define PyArray_HALFLTR NPY_HALFLTR -#define PyArray_FLOATLTR NPY_FLOATLTR -#define PyArray_DOUBLELTR NPY_DOUBLELTR -#define PyArray_LONGDOUBLELTR NPY_LONGDOUBLELTR -#define PyArray_CFLOATLTR NPY_CFLOATLTR -#define PyArray_CDOUBLELTR NPY_CDOUBLELTR -#define PyArray_CLONGDOUBLELTR NPY_CLONGDOUBLELTR -#define PyArray_OBJECTLTR NPY_OBJECTLTR -#define PyArray_STRINGLTR NPY_STRINGLTR -#define PyArray_STRINGLTR2 NPY_STRINGLTR2 -#define PyArray_UNICODELTR NPY_UNICODELTR -#define PyArray_VOIDLTR NPY_VOIDLTR -#define PyArray_DATETIMELTR NPY_DATETIMELTR -#define PyArray_TIMEDELTALTR NPY_TIMEDELTALTR -#define PyArray_CHARLTR NPY_CHARLTR -#define PyArray_INTPLTR NPY_INTPLTR -#define PyArray_UINTPLTR NPY_UINTPLTR -#define PyArray_GENBOOLLTR NPY_GENBOOLLTR -#define PyArray_SIGNEDLTR NPY_SIGNEDLTR -#define PyArray_UNSIGNEDLTR NPY_UNSIGNEDLTR -#define PyArray_FLOATINGLTR NPY_FLOATINGLTR -#define PyArray_COMPLEXLTR NPY_COMPLEXLTR - -#define PyArray_QUICKSORT NPY_QUICKSORT -#define PyArray_HEAPSORT NPY_HEAPSORT -#define PyArray_MERGESORT NPY_MERGESORT -#define PyArray_SORTKIND NPY_SORTKIND -#define PyArray_NSORTS NPY_NSORTS - -#define PyArray_NOSCALAR NPY_NOSCALAR -#define PyArray_BOOL_SCALAR NPY_BOOL_SCALAR -#define PyArray_INTPOS_SCALAR NPY_INTPOS_SCALAR -#define PyArray_INTNEG_SCALAR NPY_INTNEG_SCALAR -#define PyArray_FLOAT_SCALAR NPY_FLOAT_SCALAR -#define PyArray_COMPLEX_SCALAR NPY_COMPLEX_SCALAR -#define PyArray_OBJECT_SCALAR NPY_OBJECT_SCALAR -#define PyArray_SCALARKIND NPY_SCALARKIND -#define PyArray_NSCALARKINDS NPY_NSCALARKINDS - -#define PyArray_ANYORDER NPY_ANYORDER -#define PyArray_CORDER NPY_CORDER -#define PyArray_FORTRANORDER NPY_FORTRANORDER -#define PyArray_ORDER NPY_ORDER - -#define PyDescr_ISBOOL PyDataType_ISBOOL -#define PyDescr_ISUNSIGNED PyDataType_ISUNSIGNED -#define PyDescr_ISSIGNED PyDataType_ISSIGNED -#define PyDescr_ISINTEGER PyDataType_ISINTEGER -#define PyDescr_ISFLOAT PyDataType_ISFLOAT -#define PyDescr_ISNUMBER PyDataType_ISNUMBER -#define PyDescr_ISSTRING PyDataType_ISSTRING -#define PyDescr_ISCOMPLEX PyDataType_ISCOMPLEX -#define PyDescr_ISPYTHON PyDataType_ISPYTHON -#define PyDescr_ISFLEXIBLE PyDataType_ISFLEXIBLE -#define PyDescr_ISUSERDEF PyDataType_ISUSERDEF -#define PyDescr_ISEXTENDED PyDataType_ISEXTENDED -#define PyDescr_ISOBJECT PyDataType_ISOBJECT -#define PyDescr_HASFIELDS PyDataType_HASFIELDS - -#define PyArray_LITTLE NPY_LITTLE -#define PyArray_BIG NPY_BIG -#define PyArray_NATIVE NPY_NATIVE -#define PyArray_SWAP NPY_SWAP -#define PyArray_IGNORE NPY_IGNORE - -#define PyArray_NATBYTE NPY_NATBYTE -#define PyArray_OPPBYTE NPY_OPPBYTE - -#define PyArray_MAX_ELSIZE NPY_MAX_ELSIZE - -#define PyArray_USE_PYMEM NPY_USE_PYMEM - -#define PyArray_RemoveLargest PyArray_RemoveSmallest - -#define PyArray_UCS4 npy_ucs4 - -#endif diff --git a/include/numpy/oldnumeric.h b/include/numpy/oldnumeric.h deleted file mode 100644 index 748f06d..0000000 --- a/include/numpy/oldnumeric.h +++ /dev/null @@ -1,23 +0,0 @@ -#include "arrayobject.h" - -#ifndef REFCOUNT -# define REFCOUNT NPY_REFCOUNT -# define MAX_ELSIZE 16 -#endif - -#define PyArray_UNSIGNED_TYPES -#define PyArray_SBYTE NPY_BYTE -#define PyArray_CopyArray PyArray_CopyInto -#define _PyArray_multiply_list PyArray_MultiplyIntList -#define PyArray_ISSPACESAVER(m) NPY_FALSE -#define PyScalarArray_Check PyArray_CheckScalar - -#define CONTIGUOUS NPY_CONTIGUOUS -#define OWN_DIMENSIONS 0 -#define OWN_STRIDES 0 -#define OWN_DATA NPY_OWNDATA -#define 
SAVESPACE 0 -#define SAVESPACEBIT 0 - -#undef import_array -#define import_array() { if (_import_array() < 0) {PyErr_Print(); PyErr_SetString(PyExc_ImportError, "numpy.core.multiarray failed to import"); } } diff --git a/include/numpy/ufunc_api.txt b/include/numpy/ufunc_api.txt deleted file mode 100644 index 3365433..0000000 --- a/include/numpy/ufunc_api.txt +++ /dev/null @@ -1,312 +0,0 @@ - -================= -Numpy Ufunc C-API -================= -:: - - PyObject * - PyUFunc_FromFuncAndData(PyUFuncGenericFunction *func, void - **data, char *types, int ntypes, int nin, int - nout, int identity, char *name, char *doc, int - check_return) - - -:: - - int - PyUFunc_RegisterLoopForType(PyUFuncObject *ufunc, int - usertype, PyUFuncGenericFunction - function, int *arg_types, void *data) - - -:: - - int - PyUFunc_GenericFunction(PyUFuncObject *ufunc, PyObject *args, PyObject - *kwds, PyArrayObject **op) - - -This generic function is called with the ufunc object, the arguments to it, -and an array of (pointers to) PyArrayObjects which are NULL. - -'op' is an array of at least NPY_MAXARGS PyArrayObject *. - -:: - - void - PyUFunc_f_f_As_d_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_d_d(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_f_f(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_g_g(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_F_F_As_D_D(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_F_F(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_D_D(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_G_G(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_O_O(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_ff_f_As_dd_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_ff_f(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_dd_d(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_gg_g(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_FF_F_As_DD_D(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_DD_D(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_FF_F(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_GG_G(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_OO_O(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_O_O_method(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_OO_O_method(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_On_Om(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - int - PyUFunc_GetPyValues(char *name, int *bufsize, int *errmask, PyObject - **errobj) - - -On return, if errobj is populated with a non-NULL value, the caller -owns a new reference to errobj. 
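/*
 * Editorial sketch (not part of ufunc_api.txt): the PyUFunc_d_d-style loop
 * helpers listed above share the PyUFuncGenericFunction calling convention:
 * args holds the strided input/output buffers, dimensions[0] is the element
 * count, and steps[i] is the byte stride of argument i.  The hypothetical
 * loop below illustrates that convention for a unary double->double
 * function; ptrdiff_t stands in for npy_intp so the sketch compiles without
 * the (deleted) NumPy headers, and passing a libm function through void*
 * mirrors how NumPy itself supplies the scalar kernel.
 */
#include <stddef.h>
#include <math.h>

typedef double (*unary_d_d)(double);

static void sketch_loop_d_d(char **args, ptrdiff_t *dimensions,
                            ptrdiff_t *steps, void *func)
{
    char *in = args[0];
    char *out = args[1];
    ptrdiff_t n = dimensions[0];
    unary_d_d f = (unary_d_d)func;              /* e.g. sqrt */

    for (ptrdiff_t i = 0; i < n; i++) {
        *(double *)out = f(*(double *)in);      /* apply element-wise */
        in += steps[0];                         /* advance by byte strides */
        out += steps[1];
    }
}

int main(void)
{
    double x[3] = {1.0, 4.0, 9.0};
    double y[3];
    char *args[2] = {(char *)x, (char *)y};
    ptrdiff_t dims[1] = {3};
    ptrdiff_t steps[2] = {sizeof(double), sizeof(double)};

    sketch_loop_d_d(args, dims, steps, (void *)sqrt);
    return (y[0] == 1.0 && y[1] == 2.0 && y[2] == 3.0) ? 0 : 1;
}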
- -:: - - int - PyUFunc_checkfperr(int errmask, PyObject *errobj, int *first) - - -:: - - void - PyUFunc_clearfperr() - - -:: - - int - PyUFunc_getfperr(void ) - - -:: - - int - PyUFunc_handlefperr(int errmask, PyObject *errobj, int retstatus, int - *first) - - -:: - - int - PyUFunc_ReplaceLoopBySignature(PyUFuncObject - *func, PyUFuncGenericFunction - newfunc, int - *signature, PyUFuncGenericFunction - *oldfunc) - - -:: - - PyObject * - PyUFunc_FromFuncAndDataAndSignature(PyUFuncGenericFunction *func, void - **data, char *types, int - ntypes, int nin, int nout, int - identity, char *name, char - *doc, int check_return, const char - *signature) - - -:: - - int - PyUFunc_SetUsesArraysAsData(void **data, size_t i) - - -:: - - void - PyUFunc_e_e(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_e_e_As_f_f(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_e_e_As_d_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_ee_e(char **args, npy_intp *dimensions, npy_intp *steps, void - *func) - - -:: - - void - PyUFunc_ee_e_As_ff_f(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - void - PyUFunc_ee_e_As_dd_d(char **args, npy_intp *dimensions, npy_intp - *steps, void *func) - - -:: - - int - PyUFunc_DefaultTypeResolver(PyUFuncObject *ufunc, NPY_CASTING - casting, PyArrayObject - **operands, PyObject - *type_tup, PyArray_Descr **out_dtypes) - - -This function applies the default type resolution rules -for the provided ufunc. - -Returns 0 on success, -1 on error. - -:: - - int - PyUFunc_ValidateCasting(PyUFuncObject *ufunc, NPY_CASTING - casting, PyArrayObject - **operands, PyArray_Descr **dtypes) - - -Validates that the input operands can be cast to -the input types, and the output types can be cast to -the output operands where provided. - -Returns 0 on success, -1 (with exception raised) on validation failure. - diff --git a/include/numpy/ufuncobject.h b/include/numpy/ufuncobject.h deleted file mode 100644 index 076dd88..0000000 --- a/include/numpy/ufuncobject.h +++ /dev/null @@ -1,448 +0,0 @@ -#ifndef Py_UFUNCOBJECT_H -#define Py_UFUNCOBJECT_H - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * The legacy generic inner loop for a standard element-wise or - * generalized ufunc. - */ -typedef void (*PyUFuncGenericFunction) - (char **args, - npy_intp *dimensions, - npy_intp *strides, - void *innerloopdata); - -/* - * The most generic one-dimensional inner loop for - * a standard element-wise ufunc. This typedef is also - * more consistent with the other NumPy function pointer typedefs - * than PyUFuncGenericFunction. - */ -typedef void (PyUFunc_StridedInnerLoopFunc)( - char **dataptrs, npy_intp *strides, - npy_intp count, - NpyAuxData *innerloopdata); - -/* - * The most generic one-dimensional inner loop for - * a masked standard element-wise ufunc. "Masked" here means that it skips - * doing calculations on any items for which the maskptr array has a true - * value. - */ -typedef void (PyUFunc_MaskedStridedInnerLoopFunc)( - char **dataptrs, npy_intp *strides, - char *maskptr, npy_intp mask_stride, - npy_intp count, - NpyAuxData *innerloopdata); - -/* Forward declaration for the type resolver and loop selector typedefs */ -struct _tagPyUFuncObject; - -/* - * Given the operands for calling a ufunc, should determine the - * calculation input and output data types and return an inner loop function. 
- * This function should validate that the casting rule is being followed, - * and fail if it is not. - * - * For backwards compatibility, the regular type resolution function does not - * support auxiliary data with object semantics. The type resolution call - * which returns a masked generic function returns a standard NpyAuxData - * object, for which the NPY_AUXDATA_FREE and NPY_AUXDATA_CLONE macros - * work. - * - * ufunc: The ufunc object. - * casting: The 'casting' parameter provided to the ufunc. - * operands: An array of length (ufunc->nin + ufunc->nout), - * with the output parameters possibly NULL. - * type_tup: Either NULL, or the type_tup passed to the ufunc. - * out_dtypes: An array which should be populated with new - * references to (ufunc->nin + ufunc->nout) new - * dtypes, one for each input and output. These - * dtypes should all be in native-endian format. - * - * Should return 0 on success, -1 on failure (with exception set), - * or -2 if Py_NotImplemented should be returned. - */ -typedef int (PyUFunc_TypeResolutionFunc)( - struct _tagPyUFuncObject *ufunc, - NPY_CASTING casting, - PyArrayObject **operands, - PyObject *type_tup, - PyArray_Descr **out_dtypes); - -/* - * Given an array of DTypes as returned by the PyUFunc_TypeResolutionFunc, - * and an array of fixed strides (the array will contain NPY_MAX_INTP for - * strides which are not necessarily fixed), returns an inner loop - * with associated auxiliary data. - * - * For backwards compatibility, there is a variant of the inner loop - * selection which returns an inner loop irrespective of the strides, - * and with a void* static auxiliary data instead of an NpyAuxData * - * dynamically allocatable auxiliary data. - * - * ufunc: The ufunc object. - * dtypes: An array which has been populated with dtypes, - * in most cases by the type resolution funciton - * for the same ufunc. - * fixed_strides: For each input/output, either the stride that - * will be used every time the function is called - * or NPY_MAX_INTP if the stride might change or - * is not known ahead of time. The loop selection - * function may use this stride to pick inner loops - * which are optimized for contiguous or 0-stride - * cases. - * out_innerloop: Should be populated with the correct ufunc inner - * loop for the given type. - * out_innerloopdata: Should be populated with the void* data to - * be passed into the out_innerloop function. - * out_needs_api: If the inner loop needs to use the Python API, - * should set the to 1, otherwise should leave - * this untouched. - */ -typedef int (PyUFunc_LegacyInnerLoopSelectionFunc)( - struct _tagPyUFuncObject *ufunc, - PyArray_Descr **dtypes, - PyUFuncGenericFunction *out_innerloop, - void **out_innerloopdata, - int *out_needs_api); -typedef int (PyUFunc_InnerLoopSelectionFunc)( - struct _tagPyUFuncObject *ufunc, - PyArray_Descr **dtypes, - npy_intp *fixed_strides, - PyUFunc_StridedInnerLoopFunc **out_innerloop, - NpyAuxData **out_innerloopdata, - int *out_needs_api); -typedef int (PyUFunc_MaskedInnerLoopSelectionFunc)( - struct _tagPyUFuncObject *ufunc, - PyArray_Descr **dtypes, - PyArray_Descr *mask_dtype, - npy_intp *fixed_strides, - npy_intp fixed_mask_stride, - PyUFunc_MaskedStridedInnerLoopFunc **out_innerloop, - NpyAuxData **out_innerloopdata, - int *out_needs_api); - -typedef struct _tagPyUFuncObject { - PyObject_HEAD - /* - * nin: Number of inputs - * nout: Number of outputs - * nargs: Always nin + nout (Why is it stored?) 
- */ - int nin, nout, nargs; - - /* Identity for reduction, either PyUFunc_One or PyUFunc_Zero */ - int identity; - - /* Array of one-dimensional core loops */ - PyUFuncGenericFunction *functions; - /* Array of funcdata that gets passed into the functions */ - void **data; - /* The number of elements in 'functions' and 'data' */ - int ntypes; - - /* Does not appear to be used */ - int check_return; - - /* The name of the ufunc */ - char *name; - - /* Array of type numbers, of size ('nargs' * 'ntypes') */ - char *types; - - /* Documentation string */ - char *doc; - - void *ptr; - PyObject *obj; - PyObject *userloops; - - /* generalized ufunc parameters */ - - /* 0 for scalar ufunc; 1 for generalized ufunc */ - int core_enabled; - /* number of distinct dimension names in signature */ - int core_num_dim_ix; - - /* - * dimension indices of input/output argument k are stored in - * core_dim_ixs[core_offsets[k]..core_offsets[k]+core_num_dims[k]-1] - */ - - /* numbers of core dimensions of each argument */ - int *core_num_dims; - /* - * dimension indices in a flatted form; indices - * are in the range of [0,core_num_dim_ix) - */ - int *core_dim_ixs; - /* - * positions of 1st core dimensions of each - * argument in core_dim_ixs - */ - int *core_offsets; - /* signature string for printing purpose */ - char *core_signature; - - /* - * A function which resolves the types and fills an array - * with the dtypes for the inputs and outputs. - */ - PyUFunc_TypeResolutionFunc *type_resolver; - /* - * A function which returns an inner loop written for - * NumPy 1.6 and earlier ufuncs. This is for backwards - * compatibility, and may be NULL if inner_loop_selector - * is specified. - */ - PyUFunc_LegacyInnerLoopSelectionFunc *legacy_inner_loop_selector; - /* - * A function which returns an inner loop for the new mechanism - * in NumPy 1.7 and later. If provided, this is used, otherwise - * if NULL the legacy_inner_loop_selector is used instead. - */ - PyUFunc_InnerLoopSelectionFunc *inner_loop_selector; - /* - * A function which returns a masked inner loop for the ufunc. 
- */ - PyUFunc_MaskedInnerLoopSelectionFunc *masked_inner_loop_selector; -} PyUFuncObject; - -#include "arrayobject.h" - -#define UFUNC_ERR_IGNORE 0 -#define UFUNC_ERR_WARN 1 -#define UFUNC_ERR_RAISE 2 -#define UFUNC_ERR_CALL 3 -#define UFUNC_ERR_PRINT 4 -#define UFUNC_ERR_LOG 5 - - /* Python side integer mask */ - -#define UFUNC_MASK_DIVIDEBYZERO 0x07 -#define UFUNC_MASK_OVERFLOW 0x3f -#define UFUNC_MASK_UNDERFLOW 0x1ff -#define UFUNC_MASK_INVALID 0xfff - -#define UFUNC_SHIFT_DIVIDEBYZERO 0 -#define UFUNC_SHIFT_OVERFLOW 3 -#define UFUNC_SHIFT_UNDERFLOW 6 -#define UFUNC_SHIFT_INVALID 9 - - -/* platform-dependent code translates floating point - status to an integer sum of these values -*/ -#define UFUNC_FPE_DIVIDEBYZERO 1 -#define UFUNC_FPE_OVERFLOW 2 -#define UFUNC_FPE_UNDERFLOW 4 -#define UFUNC_FPE_INVALID 8 - -/* Error mode that avoids look-up (no checking) */ -#define UFUNC_ERR_DEFAULT 0 - -#define UFUNC_OBJ_ISOBJECT 1 -#define UFUNC_OBJ_NEEDS_API 2 - - /* Default user error mode */ -#define UFUNC_ERR_DEFAULT2 \ - (UFUNC_ERR_WARN << UFUNC_SHIFT_DIVIDEBYZERO) + \ - (UFUNC_ERR_WARN << UFUNC_SHIFT_OVERFLOW) + \ - (UFUNC_ERR_WARN << UFUNC_SHIFT_INVALID) - -#if NPY_ALLOW_THREADS -#define NPY_LOOP_BEGIN_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) _save = PyEval_SaveThread();} while (0); -#define NPY_LOOP_END_THREADS do {if (!(loop->obj & UFUNC_OBJ_NEEDS_API)) PyEval_RestoreThread(_save);} while (0); -#else -#define NPY_LOOP_BEGIN_THREADS -#define NPY_LOOP_END_THREADS -#endif - -/* - * UFunc has unit of 1, and the order of operations can be reordered - * This case allows reduction with multiple axes at once. - */ -#define PyUFunc_One 1 -/* - * UFunc has unit of 0, and the order of operations can be reordered - * This case allows reduction with multiple axes at once. - */ -#define PyUFunc_Zero 0 -/* - * UFunc has no unit, and the order of operations cannot be reordered. - * This case does not allow reduction with multiple axes at once. - */ -#define PyUFunc_None -1 -/* - * UFunc has no unit, and the order of operations can be reordered - * This case allows reduction with multiple axes at once. - */ -#define PyUFunc_ReorderableNone -2 - -#define UFUNC_REDUCE 0 -#define UFUNC_ACCUMULATE 1 -#define UFUNC_REDUCEAT 2 -#define UFUNC_OUTER 3 - - -typedef struct { - int nin; - int nout; - PyObject *callable; -} PyUFunc_PyFuncData; - -/* A linked-list of function information for - user-defined 1-d loops. - */ -typedef struct _loop1d_info { - PyUFuncGenericFunction func; - void *data; - int *arg_types; - struct _loop1d_info *next; -} PyUFunc_Loop1d; - - -#include "__ufunc_api.h" - -#define UFUNC_PYVALS_NAME "UFUNC_PYVALS" - -#define UFUNC_CHECK_ERROR(arg) \ - do {if ((((arg)->obj & UFUNC_OBJ_NEEDS_API) && PyErr_Occurred()) || \ - ((arg)->errormask && \ - PyUFunc_checkfperr((arg)->errormask, \ - (arg)->errobj, \ - &(arg)->first))) \ - goto fail;} while (0) - -/* This code checks the IEEE status flags in a platform-dependent way */ -/* Adapted from Numarray */ - -#if (defined(__unix__) || defined(unix)) && !defined(USG) -#include -#endif - -/* OSF/Alpha (Tru64) ---------------------------------------------*/ -#if defined(__osf__) && defined(__alpha) - -#include - -#define UFUNC_CHECK_STATUS(ret) { \ - unsigned long fpstatus; \ - \ - fpstatus = ieee_get_fp_control(); \ - /* clear status bits as well as disable exception mode if on */ \ - ieee_set_fp_control( 0 ); \ - ret = ((IEEE_STATUS_DZE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((IEEE_STATUS_OVF & fpstatus) ? 
UFUNC_FPE_OVERFLOW : 0) \ - | ((IEEE_STATUS_UNF & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((IEEE_STATUS_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } - -/* MS Windows -----------------------------------------------------*/ -#elif defined(_MSC_VER) - -#include - - /* Clear the floating point exception default of Borland C++ */ -#if defined(__BORLANDC__) -#define UFUNC_NOFPE _control87(MCW_EM, MCW_EM); -#endif - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus = (int) _clearfp(); \ - \ - ret = ((SW_ZERODIVIDE & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((SW_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((SW_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((SW_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - } - -/* Solaris --------------------------------------------------------*/ -/* --------ignoring SunOS ieee_flags approach, someone else can -** deal with that! */ -#elif defined(sun) || defined(__BSD__) || defined(__OpenBSD__) || \ - (defined(__FreeBSD__) && (__FreeBSD_version < 502114)) || \ - defined(__NetBSD__) -#include - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus; \ - \ - fpstatus = (int) fpgetsticky(); \ - ret = ((FP_X_DZ & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FP_X_OFL & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FP_X_UFL & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FP_X_INV & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - (void) fpsetsticky(0); \ - } - -#elif defined(__GLIBC__) || defined(__APPLE__) || \ - defined(__CYGWIN__) || defined(__MINGW32__) || \ - (defined(__FreeBSD__) && (__FreeBSD_version >= 502114)) - -#if defined(__GLIBC__) || defined(__APPLE__) || \ - defined(__MINGW32__) || defined(__FreeBSD__) -#include -#elif defined(__CYGWIN__) -#include "fenv/fenv.c" -#endif - -#define UFUNC_CHECK_STATUS(ret) { \ - int fpstatus = (int) fetestexcept(FE_DIVBYZERO | FE_OVERFLOW | \ - FE_UNDERFLOW | FE_INVALID); \ - ret = ((FE_DIVBYZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FE_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FE_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FE_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - (void) feclearexcept(FE_DIVBYZERO | FE_OVERFLOW | \ - FE_UNDERFLOW | FE_INVALID); \ -} - -#elif defined(_AIX) - -#include -#include - -#define UFUNC_CHECK_STATUS(ret) { \ - fpflag_t fpstatus; \ - \ - fpstatus = fp_read_flag(); \ - ret = ((FP_DIV_BY_ZERO & fpstatus) ? UFUNC_FPE_DIVIDEBYZERO : 0) \ - | ((FP_OVERFLOW & fpstatus) ? UFUNC_FPE_OVERFLOW : 0) \ - | ((FP_UNDERFLOW & fpstatus) ? UFUNC_FPE_UNDERFLOW : 0) \ - | ((FP_INVALID & fpstatus) ? UFUNC_FPE_INVALID : 0); \ - fp_swap_flag(0); \ -} - -#else - -#define NO_FLOATING_POINT_SUPPORT -#define UFUNC_CHECK_STATUS(ret) { \ - ret = 0; \ - } - -#endif - -/* - * THESE MACROS ARE DEPRECATED. - * Use npy_set_floatstatus_* in the npymath library. 
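The UFUNC_ERR_* and UFUNC_FPE_* constants in this header are the C-level counterparts of NumPy's Python error-state settings. As a rough illustration only (not part of the removed header), the same divide/overflow/underflow/invalid modes are what numpy.seterr and numpy.errstate toggle:

import numpy as np

# Roughly the default mode encoded by UFUNC_ERR_DEFAULT2 above:
# warn on divide-by-zero, overflow and invalid, ignore underflow.
with np.errstate(divide="warn", over="warn", under="ignore", invalid="warn"):
    np.array([1.0, 0.0]) / np.array([0.0, 0.0])  # emits RuntimeWarnings rather than exceptions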
- */ -#define generate_divbyzero_error() npy_set_floatstatus_divbyzero() -#define generate_overflow_error() npy_set_floatstatus_overflow() - - /* Make sure it gets defined if it isn't already */ -#ifndef UFUNC_NOFPE -#define UFUNC_NOFPE -#endif - - -#ifdef __cplusplus -} -#endif -#endif /* !Py_UFUNCOBJECT_H */ diff --git a/include/numpy/utils.h b/include/numpy/utils.h deleted file mode 100644 index cc968a3..0000000 --- a/include/numpy/utils.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef __NUMPY_UTILS_HEADER__ -#define __NUMPY_UTILS_HEADER__ - -#ifndef __COMP_NPY_UNUSED - #if defined(__GNUC__) - #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) - # elif defined(__ICC) - #define __COMP_NPY_UNUSED __attribute__ ((__unused__)) - #else - #define __COMP_NPY_UNUSED - #endif -#endif - -/* Use this to tag a variable as not used. It will remove unused variable - * warning on support platforms (see __COM_NPY_UNUSED) and mangle the variable - * to avoid accidental use */ -#define NPY_UNUSED(x) (__NPY_UNUSED_TAGGED ## x) __COMP_NPY_UNUSED - -#endif diff --git a/requirements.txt b/requirements.txt index 320fcda..3c46fe2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy>=2.1.0 numpy>=1.15.0 -srsly>=0.1.0,<1.1.0 +srsly>=0.1.0 # Development requirements pytest>=4.1.0 diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index c9c3882..6365ea1 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,172 +1,13 @@ -# coding: utf8 -from __future__ import unicode_literals - +from typing import Union from pathlib import Path -from spacy.vectors import Vectors -from spacy.strings import StringStore -from spacy.tokens import Doc, Token, Span -import numpy -import srsly -from .util import transform_doc, get_phrases, make_key, split_key from .about import __version__ # noqa: F401 +from .sense2vec import Sense2Vec # noqa: F401 +from .component import Sense2VecComponent # noqa: F401 -def load(vectors_path): - vectors_path = Path(vectors_path) - if not vectors_path.exists(): - raise IOError("Can't find vectors: {}".format(vectors_path)) +def load(vectors_path: Union[Path, str]) -> Sense2Vec: + # TODO: remove this? + if not Path(vectors_path).exists(): + raise IOError(f"Can't find vectors: {vectors_path}") return Sense2Vec().from_disk(vectors_path) - - -class Sense2Vec(object): - def __init__(self, shape=(1000, 128), strings=None): - self.vectors = Vectors(shape=shape) - self.strings = StringStore() if strings is None else strings - - def __len__(self): - return len(self.vectors) - - def __contains__(self, key): - key = key if isinstance(key, int) else self.strings[key] - return key in self.vectors - - def __getitem__(self, key): - key = key if isinstance(key, int) else self.strings[key] - if key in self.vectors: - return self.vectors[key] - - def __iter__(self): - yield from self.items() - - def add(self, key, vector): - if not isinstance(key, int): - key = self.strings.add(key) - self.vectors.add(key, vector=vector) - - def items(self): - for key, value in self.vectors.items(): - yield self.strings[key], value - - def keys(self): - for key in self.vectors.keys(): - yield self.strings[key] - - def values(self): - yield from self.vectors.values() - - def most_similar(self, keys, n_similar=10): - if not isinstance(keys, (list, tuple)): - raise ValueError("Expected iterable of keys. 
Got: {}".format(type(keys))) - vecs = [self[key] for key in keys if key in self] - queries = numpy.asarray(vecs, dtype=numpy.float32) - result_keys, _, scores = self.vectors.most_similar(queries) - result = zip(result_keys, scores) - result = [(self.strings[key], score) for key, score in result if key] - result = [(key, score) for key, score in result if key not in keys] - # TODO: handle this better? - return result[:n_similar] - - def to_bytes(self, exclude=tuple()): - data = {"vectors": self.vectors.to_bytes()} - if "strings" not in exclude: - data["strings"] = self.strings.to_bytes() - return srsly.msgpack_dumps(data) - - def from_bytes(self, bytes_data, exclude=tuple()): - data = srsly.msgpack_loads(bytes_data) - self.vectors = Vectors().from_bytes(data["vectors"]) - if "strings" not in exclude and "strings" in data: - self.strings = StringStore().from_bytes(data["strings"]) - return self - - def from_disk(self, path, exclude=tuple()): - path = Path(path) - strings_path = path / "strings.json" - self.vectors = Vectors().from_disk(path) - if "strings" not in exclude and strings_path.exists(): - self.strings = StringStore().from_disk(strings_path) - return self - - def to_disk(self, path, exclude=tuple()): - path = Path(path) - self.vectors.to_disk(path) - if "strings" not in exclude: - self.strings.to_disk(path / "strings.json") - return self - - -class Sense2VecComponent(object): - name = "sense2vec" - - def __init__( - self, - vocab=None, - shape=(1000, 128), - merge_phrases=False, - make_key=make_key, - split_key=split_key, - ): - strings = vocab.strings if vocab is not None else None - self.s2v = Sense2Vec(shape=shape, strings=strings) - self.first_run = True - self.merge_phrases = merge_phrases - self.make_key = make_key - self.split_key = split_key - - @classmethod - def from_nlp(cls, nlp, **kwargs): - return cls(vocab=nlp.vocab) - - def __call__(self, doc): - if self.first_run: - self.init_component(doc) - self.first_run = False - # Store reference to s2v object on Doc to make sure it's right - doc._._s2v = self.s2v - if self.merge_phrases: - doc = transform_doc(doc) - return doc - - def init_component(self, doc): - # initialise the attributes here only if the component is added to the - # pipeline and used – otherwise, tokens will still get the attributes - # even if the component is only created and not added - Doc.set_extension("_s2v", default=None) - Doc.set_extension("s2v_phrases", getter=get_phrases) - Token.set_extension("s2v_key", getter=self.s2v_key) - Token.set_extension("in_s2v", getter=self.in_s2v) - Token.set_extension("s2v_vec", getter=self.s2v_vec) - Token.set_extension("s2v_most_similar", method=self.s2v_most_sim) - Span.set_extension("s2v_key", getter=self.s2v_key) - Span.set_extension("in_s2v", getter=self.in_s2v) - Span.set_extension("s2v_vec", getter=self.s2v_vec) - Span.set_extension("s2v_most_similar", method=self.s2v_most_sim) - - def in_s2v(self, obj): - return self.make_key(obj) in obj.doc._._s2v - - def s2v_vec(self, obj): - return obj.doc._._s2v[self.make_key(obj)] - - def s2v_key(self, obj): - return self.make_key(obj) - - def s2v_most_sim(self, obj, n_similar=10): - key = self.make_key(obj) - results = obj.doc._._s2v.most_similar([key], n_similar=n_similar) - return [(self.split_key(result), score) for result, score in results] - - def to_bytes(self): - return self.s2v.to_bytes(exclude=["strings"]) - - def from_bytes(self, bytes_data): - self.s2v = Sense2Vec().from_bytes(bytes_data, exclude=["strings"]) - return self - - def to_disk(self, path): - 
self.s2v.to_bytes(path, exclude=["strings"]) - - def from_disk(self, path): - self.s2v = Sense2Vec().from_disk(path, exclude=["strings"]) - return self diff --git a/sense2vec/component.py b/sense2vec/component.py new file mode 100644 index 0000000..d7754b8 --- /dev/null +++ b/sense2vec/component.py @@ -0,0 +1,92 @@ +from typing import Tuple, Union, List +from spacy.tokens import Doc, Token, Span +from spacy.vocab import Vocab +from spacy.language import Language +from pathlib import Path +import numpy + +from .sense2vec import Sense2Vec +from .util import merge_phrases, get_phrases, make_spacy_key + + +class Sense2VecComponent(object): + name = "sense2vec" + + def __init__( + self, + vocab: Vocab = None, + shape: Tuple[int, int] = (1000, 128), + merge_phrases: bool = False, + ): + strings = vocab.strings if vocab is not None else None + self.s2v = Sense2Vec(shape=shape, strings=strings) + self.first_run = True + self.merge_phrases = merge_phrases + + @classmethod + def from_nlp(cls, nlp: Language, **kwargs): + return cls(vocab=nlp.vocab) + + def __call__(self, doc: Doc) -> Doc: + if self.first_run: + self.init_component(doc) + self.first_run = False + # Store reference to s2v object on Doc to make sure it's right + doc._._s2v = self.s2v + if self.merge_phrases: + doc = merge_phrases(doc) + return doc + + def init_component(self, doc: Doc): + # initialise the attributes here only if the component is added to the + # pipeline and used – otherwise, tokens will still get the attributes + # even if the component is only created and not added + Doc.set_extension("_s2v", default=None) + Doc.set_extension("s2v_phrases", getter=get_phrases) + Token.set_extension("s2v_key", getter=self.s2v_key) + Token.set_extension("in_s2v", getter=self.in_s2v) + Token.set_extension("s2v_vec", getter=self.s2v_vec) + Token.set_extension("s2v_other_senses", getter=self.s2v_other_senses) + Token.set_extension("s2v_most_similar", method=self.s2v_most_similar) + Span.set_extension("s2v_key", getter=self.s2v_key) + Span.set_extension("in_s2v", getter=self.in_s2v) + Span.set_extension("s2v_vec", getter=self.s2v_vec) + Span.set_extension("s2v_other_senses", getter=self.s2v_other_senses) + Span.set_extension("s2v_most_similar", method=self.s2v_most_similar) + + def get_key(self, obj: Union[Token, Span]) -> str: + return make_spacy_key(obj, self.s2v.make_key) + + def in_s2v(self, obj: Union[Token, Span]) -> bool: + return self.get_key(obj) in obj.doc._._s2v + + def s2v_vec(self, obj: Union[Token, Span]) -> numpy.ndarray: + return obj.doc._._s2v[self.get_key(obj)] + + def s2v_key(self, obj: Union[Token, Span]) -> str: + return self.get_key(obj) + + def s2v_most_similar( + self, obj: Union[Token, Span], n_similar: int = 10 + ) -> List[Tuple[Tuple[str, str], float]]: + key = self.get_key(obj) + results = obj.doc._._s2v.most_similar([key], n_similar=n_similar) + return [(self.s2v.split_key(result), score) for result, score in results] + + def s2v_other_senses(self, obj: Union[Token, Span]) -> List[str]: + key = self.get_key(obj) + return obj._._s2v.get_other_senses(key) + + def to_bytes(self) -> bytes: + return self.s2v.to_bytes(exclude=["strings"]) + + def from_bytes(self, bytes_data: bytes): + self.s2v = Sense2Vec().from_bytes(bytes_data, exclude=["strings"]) + return self + + def to_disk(self, path: Union[str, Path]): + self.s2v.to_disk(path, exclude=["strings"]) + + def from_disk(self, path: Union[str, Path]): + self.s2v = Sense2Vec().from_disk(path, exclude=["strings"]) + return self diff --git a/sense2vec/sense2vec.py 
b/sense2vec/sense2vec.py new file mode 100644 index 0000000..89074d8 --- /dev/null +++ b/sense2vec/sense2vec.py @@ -0,0 +1,125 @@ +from typing import Callable, Tuple, List, Union, Iterable +from pathlib import Path +from spacy.vectors import Vectors +from spacy.strings import StringStore +import numpy +import srsly + +from .util import make_key, split_key + + +class Sense2Vec(object): + def __init__( + self, + shape: tuple = (1000, 128), + strings: StringStore = None, + make_key: Callable[[str, str], str] = make_key, + split_key: Callable[[str], Tuple[str, str]] = split_key, + senses: List[str] = [], + ): + self.make_key = make_key + self.split_key = split_key + self.vectors = Vectors(shape=shape) + self.strings = StringStore() if strings is None else strings + self.cfg = {"senses": senses} + + @property + def senses(self) -> List[str]: + return self.cfg.get("senses", []) + + def __len__(self) -> int: + return len(self.vectors) + + def __contains__(self, key: Union[str, int]) -> bool: + key = key if isinstance(key, int) else self.strings[key] + return key in self.vectors + + def __getitem__(self, key: Union[str, int]) -> numpy.ndarray: + key = key if isinstance(key, int) else self.strings[key] + if key in self.vectors: + return self.vectors[key] + + def __iter__(self): + yield from self.items() + + def add(self, key: Union[str, int], vector: numpy.ndarray): + if not isinstance(key, int): + key = self.strings.add(key) + self.vectors.add(key, vector=vector) + + def items(self): + for key, value in self.vectors.items(): + yield self.strings[key], value + + def keys(self): + for key in self.vectors.keys(): + yield self.strings[key] + + def values(self): + yield from self.vectors.values() + + def most_similar( + self, keys: Iterable[str], n_similar: int = 10 + ) -> List[Tuple[str, float]]: + if not isinstance(keys, (list, tuple)): + raise ValueError(f"Expected iterable of keys. Got: {type(keys)}") + vecs = [self[key] for key in keys if key in self] + queries = numpy.asarray(vecs, dtype=numpy.float32) + result_keys, _, scores = self.vectors.most_similar(queries) + result = list(zip(result_keys, scores)) + result = [(self.strings[key], score) for key, score in result if key] + result = [(key, score) for key, score in result if key not in keys] + # TODO: handle this better? + return result[:n_similar] + + def get_other_senses(self, key: str) -> List[str]: + result = [] + word, orig_sense = self.split_key(key) + for sense in self.senses: + new_key = self.make_key(word, sense) + if sense != orig_sense and new_key in self: + result.append(new_key) + return result + + def get_best_sense(self, word: str) -> Tuple[str, Union[str, None]]: + # TODO: implement properly? 
+ if not self.senses: + return (word, None) + versions = [word, word.upper(), word.title()] if word.islower() else [word] + freqs = [] + for text in versions: + for sense in self.senses: + key = self.make_key(text, sense) + row = self.vectors.find(key=key) + freqs.append((row, (text, sense))) + return max(freqs)[1] if freqs else (word, None) + + def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + data = {"vectors": self.vectors.to_bytes(), "cfg": self.cfg} + if "strings" not in exclude: + data["strings"] = self.strings.to_bytes() + return srsly.msgpack_dumps(data) + + def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): + data = srsly.msgpack_loads(bytes_data) + self.vectors = Vectors().from_bytes(data["vectors"]) + if "strings" not in exclude and "strings" in data: + self.strings = StringStore().from_bytes(data["strings"]) + self.cfg = data["cfg"] + return self + + def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): + path = Path(path) + strings_path = path / "strings.json" + self.vectors = Vectors().from_disk(path) + if "strings" not in exclude and strings_path.exists(): + self.strings = StringStore().from_disk(strings_path) + self.cfg = srsly.read_json(path / "cfg") + return self + + def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): + path = Path(path) + self.vectors.to_disk(path) + if "strings" not in exclude: + self.strings.to_disk(path / "strings.json") + srsly.write_json(path / "cfg") diff --git a/sense2vec/util.py b/sense2vec/util.py index 7f31cf0..e459bbc 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -1,11 +1,13 @@ -# coding: utf8 -from __future__ import unicode_literals - -from spacy.tokens import Token, Span +from typing import Union, Callable, List, Tuple +import re +from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans -def transform_doc(doc): +DEFAULT_SENSE = "?" + + +def merge_phrases(doc: Doc) -> Doc: """ Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. 
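The new Sense2Vec class above is usable on its own, independently of a spaCy pipeline. A minimal sketch of the API it defines, based only on the methods shown in the diff (keys, senses and vector values below are made up for illustration):

import numpy
from sense2vec import Sense2Vec

s2v = Sense2Vec(shape=(4, 4), senses=["NOUN", "VERB"])
s2v.add("duck|NOUN", numpy.asarray([1, 2, 3, 4], dtype=numpy.float32))
s2v.add("duck|VERB", numpy.asarray([4, 3, 2, 1], dtype=numpy.float32))
assert "duck|NOUN" in s2v                       # looked up via the shared StringStore
vector = s2v["duck|NOUN"]                       # numpy.ndarray row from the Vectors table
assert s2v.get_other_senses("duck|NOUN") == ["duck|VERB"]

most_similar() works the same way, taking a list of keys and returning (key, score) tuples.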
@@ -20,33 +22,32 @@ def transform_doc(doc): return doc -def make_key(obj): - text = obj.text.replace(" ", "_") - if isinstance(obj, Token): - return text + "|" + obj.pos_ - elif isinstance(obj, Span): - if obj.label_: - return text + "|" + obj.label_ - return text + "|" + obj.root.pos_ - return text +def make_key(word: str, sense: str) -> str: + text = re.sub(r"\s", "_", word) + return text + "|" + sense -def split_key(key): - return tuple(key.replace("_", " ").rsplit("|", 1)) +def split_key(key: str) -> Tuple[str, str]: + word, sense = key.replace("_", " ").rsplit("|", 1) + return word, sense -def make_token_key(token): - return token.text.replace(" ", "_") + "|" + token.pos_ - - -def make_span_key(span): - text = span.text.replace(" ", "_") - if span.label_: - return text + "|" + span.label_ - return text + "|" + span.root.pos_ +def make_spacy_key( + obj: Union[Token, Span], make_key: Callable[[str, str], str] = make_key +) -> str: + text = obj.text + if isinstance(obj, Token): + if obj.like_url: + text = "%%URL" + sense = "X" + else: + sense = obj.pos_ + elif isinstance(obj, Span): + sense = obj.label_ or obj.root.pos_ + return make_key(text, sense or DEFAULT_SENSE) -def get_phrases(doc): +def get_phrases(doc: Doc) -> List[Span]: spans = list(doc.ents) if doc.is_parsed: for np in doc.noun_chunks: diff --git a/setup.py b/setup.py index 6429f75..6d113fd 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def setup_package(): version=about["__version__"], license=about["__license__"], packages=find_packages(), - install_requires=["spacy>=2.1.0", "numpy>=1.15.0", "srsly>=0.1.0,<1.1.0"], + install_requires=["spacy>=2.1.0", "numpy>=1.15.0", "srsly>=0.1.0"], python_requires=">=3.6", entry_points={ "spacy_factories": ["sense2vec = sense2vec:Sense2VecComponent.from_nlp"] diff --git a/tests/data/cfg b/tests/data/cfg new file mode 100644 index 0000000..0967ef4 --- /dev/null +++ b/tests/data/cfg @@ -0,0 +1 @@ +{} diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index d051e23..ce137a6 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -24,6 +24,19 @@ def test_sense2vec_object(): assert sorted(list(s2v.keys())) == ["test", "test2"] +def test_sense2vec_other_senses(): + s2v = Sense2Vec(shape=(6, 4)) + s2v.cfg["senses"] = ["A", "B", "C", "D"] + for key in ["a|A", "a|B", "a|C", "b|A", "b|C", "c|A"]: + s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) + others = s2v.get_other_senses("a|A") + assert sorted(others) == ["a|B", "a|C"] + others = s2v.get_other_senses("b|C") + assert others == ["b|A"] + others = s2v.get_other_senses("c|A") + assert others == [] + + def test_sense2vec_most_similar(): s2v = Sense2Vec(shape=(6, 4)) s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) From 03369a29caa3bd3e198c0a0b5f96335c7e939ca4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 01:45:17 +0200 Subject: [PATCH 062/297] Tidy up --- tests/test_component.py | 3 --- tests/test_model.py | 3 --- tests/test_sense2vec.py | 3 --- 3 files changed, 9 deletions(-) diff --git a/tests/test_component.py b/tests/test_component.py index dc9a982..7866c15 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from spacy.vocab import Vocab diff --git a/tests/test_model.py b/tests/test_model.py index 70621e7..9da7fb9 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - 
import pytest from pathlib import Path from sense2vec import Sense2Vec diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index ce137a6..1401e44 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -1,6 +1,3 @@ -# coding: utf8 -from __future__ import unicode_literals - import pytest import numpy from sense2vec import Sense2Vec From 5e772a5c8f69426723299cacd47f04c91b88fe86 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 01:45:25 +0200 Subject: [PATCH 063/297] Add frequencies --- sense2vec/sense2vec.py | 51 +++++++++++++++++++++++++++++------------ tests/test_sense2vec.py | 33 ++++++++++++++++++++++++-- 2 files changed, 67 insertions(+), 17 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 89074d8..6c98a66 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,4 +1,4 @@ -from typing import Callable, Tuple, List, Union, Iterable +from typing import Callable, Tuple, List, Union, Iterable, Dict from pathlib import Path from spacy.vectors import Vectors from spacy.strings import StringStore @@ -21,6 +21,7 @@ def __init__( self.split_key = split_key self.vectors = Vectors(shape=shape) self.strings = StringStore() if strings is None else strings + self.freqs: Dict[int, int] = {} self.cfg = {"senses": senses} @property @@ -31,21 +32,23 @@ def __len__(self) -> int: return len(self.vectors) def __contains__(self, key: Union[str, int]) -> bool: - key = key if isinstance(key, int) else self.strings[key] + key = self.ensure_int_key(key) return key in self.vectors def __getitem__(self, key: Union[str, int]) -> numpy.ndarray: - key = key if isinstance(key, int) else self.strings[key] + key = self.ensure_int_key(key) if key in self.vectors: return self.vectors[key] def __iter__(self): yield from self.items() - def add(self, key: Union[str, int], vector: numpy.ndarray): + def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): if not isinstance(key, int): key = self.strings.add(key) self.vectors.add(key, vector=vector) + if freq is not None: + self.set_freq(key, freq) def items(self): for key, value in self.vectors.items(): @@ -58,6 +61,17 @@ def keys(self): def values(self): yield from self.vectors.values() + def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]: + key = self.ensure_int_key(key) + return self.freqs.get(key, default) + + def set_freq(self, key: Union[str, int], value: int): + key = self.ensure_int_key(key) + self.freqs[key] = value + + def ensure_int_key(self, key: Union[str, int]) -> int: + return key if isinstance(key, int) else self.strings[key] + def most_similar( self, keys: Iterable[str], n_similar: int = 10 ) -> List[Tuple[str, float]]: @@ -81,21 +95,23 @@ def get_other_senses(self, key: str) -> List[str]: result.append(new_key) return result - def get_best_sense(self, word: str) -> Tuple[str, Union[str, None]]: - # TODO: implement properly? 
+ def get_best_sense(self, word: str, ignore_case: bool = True) -> Union[str, None]: if not self.senses: - return (word, None) - versions = [word, word.upper(), word.title()] if word.islower() else [word] + return None + versions = [word, word.upper(), word.title()] if ignore_case else [word] freqs = [] for text in versions: for sense in self.senses: key = self.make_key(text, sense) - row = self.vectors.find(key=key) - freqs.append((row, (text, sense))) - return max(freqs)[1] if freqs else (word, None) + if key in self: + freq = self.get_freq(key, -1) + freqs.append((freq, key)) + return max(freqs)[1] if freqs else None def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: - data = {"vectors": self.vectors.to_bytes(), "cfg": self.cfg} + vectors_bytes = self.vectors.to_bytes() + freqs = list(self.freqs.items()) + data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs} if "strings" not in exclude: data["strings"] = self.strings.to_bytes() return srsly.msgpack_dumps(data) @@ -103,23 +119,28 @@ def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) + self.freqs = dict(data.get("freqs", [])) + self.cfg = data.get("cfg", {}) if "strings" not in exclude and "strings" in data: self.strings = StringStore().from_bytes(data["strings"]) - self.cfg = data["cfg"] return self def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): path = Path(path) strings_path = path / "strings.json" + freqs_path = path / "freqs.json" self.vectors = Vectors().from_disk(path) + self.cfg = srsly.read_json(path / "cfg") + if freqs_path.exists(): + self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) - self.cfg = srsly.read_json(path / "cfg") return self def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): path = Path(path) self.vectors.to_disk(path) + srsly.write_json(path / "cfg", self.cfg) + srsly.write_json(path / "freqs.json", list(self.freqs.items())) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") - srsly.write_json(path / "cfg") diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 1401e44..7f8093b 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -21,6 +21,20 @@ def test_sense2vec_object(): assert sorted(list(s2v.keys())) == ["test", "test2"] +def test_sense2vec_freqs(): + s2v = Sense2Vec(shape=(10, 4)) + vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.add("test1", vector, 123) + s2v.add("test2", vector, 456) + assert len(s2v.freqs) == 2 + assert s2v.get_freq("test1") == 123 + assert s2v.get_freq("test2") == 456 + assert s2v.get_freq("test3") is None + assert s2v.get_freq("test3", 100) == 100 + s2v.set_freq("test3", 200) + assert s2v.get_freq("test3") == 200 + + def test_sense2vec_other_senses(): s2v = Sense2Vec(shape=(6, 4)) s2v.cfg["senses"] = ["A", "B", "C", "D"] @@ -34,6 +48,19 @@ def test_sense2vec_other_senses(): assert others == [] +def test_sense2vec_best_sense(): + s2v = Sense2Vec(shape=(5, 4)) + s2v.cfg["senses"] = ["A", "B", "C"] + for key, freq in [("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1), ("B|C", 2)]: + s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq) + assert s2v.get_best_sense("a") == "a|A" + assert s2v.get_best_sense("b") == "B|C" + assert 
s2v.get_best_sense("b", ignore_case=False) == "b|A" + assert s2v.get_best_sense("c") is None + s2v.cfg["senses"] = [] + assert s2v.get_best_sense("a") is None + + def test_sense2vec_most_similar(): s2v = Sense2Vec(shape=(6, 4)) s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) @@ -54,14 +81,16 @@ def test_sense2vec_to_from_bytes(): s2v = Sense2Vec(shape=(2, 4)) test_vector1 = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32) test_vector2 = numpy.asarray([5, 6, 7, 8], dtype=numpy.float32) - s2v.add("test1", test_vector1) - s2v.add("test2", test_vector2) + s2v.add("test1", test_vector1, 123) + s2v.add("test2", test_vector2, 456) s2v_bytes = s2v.to_bytes() new_s2v = Sense2Vec().from_bytes(s2v_bytes) assert len(new_s2v) == 2 assert new_s2v.vectors.shape == (2, 4) assert "test1" in new_s2v assert "test2" in new_s2v + assert new_s2v.get_freq("test1") == 123 + assert new_s2v.get_freq("test2") == 456 assert numpy.array_equal(new_s2v["test1"], test_vector1) assert numpy.array_equal(new_s2v["test2"], test_vector2) assert s2v_bytes == new_s2v.to_bytes() From 1b80c68d9096158f3254b9fed4704e4826291bfb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 02:27:21 +0200 Subject: [PATCH 064/297] Tidy up and add docstrings --- sense2vec/component.py | 113 +++++++++++++++++++++++++------- sense2vec/sense2vec.py | 144 ++++++++++++++++++++++++++++++++++------- sense2vec/util.py | 58 ++++++++++++----- 3 files changed, 254 insertions(+), 61 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index d7754b8..6a91f7c 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -18,6 +18,13 @@ def __init__( shape: Tuple[int, int] = (1000, 128), merge_phrases: bool = False, ): + """Initialize the pipeline component. + + vocab (Vocab): The shared vocab. Mostly used for the shared StringStore. + shape (tuple): The vector shape. + merge_phrases (bool): Merge sense2vec phrases into one token. + RETURNS (Sense2VecComponent): The newly constructed object. + """ strings = vocab.strings if vocab is not None else None self.s2v = Sense2Vec(shape=shape, strings=strings) self.first_run = True @@ -25,11 +32,22 @@ def __init__( @classmethod def from_nlp(cls, nlp: Language, **kwargs): + """Initialize the component from an nlp object. Mostly used as the + component factory for the entry point (see setup.py). + + nlp (Language): The nlp object. + RETURNS (Sense2VecComponent): The newly constructed object. + """ return cls(vocab=nlp.vocab) def __call__(self, doc: Doc) -> Doc: + """Process a Doc object with the component. + + doc (Doc): The document to process. + RETURNS (Doc): The processed document. + """ if self.first_run: - self.init_component(doc) + self.init_component() self.first_run = False # Store reference to s2v object on Doc to make sure it's right doc._._s2v = self.s2v @@ -37,56 +55,105 @@ def __call__(self, doc: Doc) -> Doc: doc = merge_phrases(doc) return doc - def init_component(self, doc: Doc): - # initialise the attributes here only if the component is added to the - # pipeline and used – otherwise, tokens will still get the attributes - # even if the component is only created and not added + def init_component(self): + """Register the component-specific extension attributes here and only + if the component is added to the pipeline and used – otherwise, tokens + will still get the attributes even if the component is only created and + not added. 
+ """ Doc.set_extension("_s2v", default=None) Doc.set_extension("s2v_phrases", getter=get_phrases) - Token.set_extension("s2v_key", getter=self.s2v_key) - Token.set_extension("in_s2v", getter=self.in_s2v) - Token.set_extension("s2v_vec", getter=self.s2v_vec) - Token.set_extension("s2v_other_senses", getter=self.s2v_other_senses) - Token.set_extension("s2v_most_similar", method=self.s2v_most_similar) - Span.set_extension("s2v_key", getter=self.s2v_key) - Span.set_extension("in_s2v", getter=self.in_s2v) - Span.set_extension("s2v_vec", getter=self.s2v_vec) - Span.set_extension("s2v_other_senses", getter=self.s2v_other_senses) - Span.set_extension("s2v_most_similar", method=self.s2v_most_similar) - - def get_key(self, obj: Union[Token, Span]) -> str: - return make_spacy_key(obj, self.s2v.make_key) + for obj in [Token, Span]: + obj.set_extension("s2v_key", getter=self.s2v_key) + obj.set_extension("in_s2v", getter=self.in_s2v) + obj.set_extension("s2v_vec", getter=self.s2v_vec) + obj.set_extension("s2v_freq", getter=self.s2v_freq) + obj.set_extension("s2v_other_senses", getter=self.s2v_other_senses) + obj.set_extension("s2v_most_similar", method=self.s2v_most_similar) def in_s2v(self, obj: Union[Token, Span]) -> bool: - return self.get_key(obj) in obj.doc._._s2v + """Extension attribute getter. Check if a token or span has a vector. + + obj (Token / Span): The object the attribute is called on. + RETURNS (bool): Whether the key of that object is in the table. + """ + return self.s2v_key(obj) in obj.doc._._s2v def s2v_vec(self, obj: Union[Token, Span]) -> numpy.ndarray: - return obj.doc._._s2v[self.get_key(obj)] + """Extension attribute getter. Get the vector for a given object. + + obj (Token / Span): The object the attribute is called on. + RETURNS (numpy.ndarray): The vector. + """ + return obj.doc._._s2v[self.s2v_key(obj)] + + def s2v_freq(self, obj: Union[Token, Span]) -> int: + """Extension attribute getter. Get the frequency for a given object. + + obj (Token / Span): The object the attribute is called on. + RETURNS (int): The frequency. + """ + return obj.doc._._s2v.get_freq(self.s2v_key(obj)) def s2v_key(self, obj: Union[Token, Span]) -> str: - return self.get_key(obj) + """Extension attribute getter and helper method. Create a Sense2Vec key + like "duck|NOUN" from a spaCy object. + + obj (Token / Span): The object to create the key for. + RETURNS (unicode): The key. + """ + return make_spacy_key(obj, obj.doc._._s2v.make_key) def s2v_most_similar( self, obj: Union[Token, Span], n_similar: int = 10 ) -> List[Tuple[Tuple[str, str], float]]: - key = self.get_key(obj) + """Extension attribute method. Get the most similar entries. + + n_similar (int): The number of similar entries to return. + RETURNS (list): The most similar entries as a list of + ((word, sense), score) tuples. + """ + key = self.s2v_key(obj) results = obj.doc._._s2v.most_similar([key], n_similar=n_similar) return [(self.s2v.split_key(result), score) for result, score in results] def s2v_other_senses(self, obj: Union[Token, Span]) -> List[str]: - key = self.get_key(obj) + """Extension attribute getter. Get other senses for an object. + + obj (Token / Span): The object the attribute is called on. + RETURNS (list): A list of other senses. + """ + key = self.s2v_key(obj) return obj._._s2v.get_other_senses(key) def to_bytes(self) -> bytes: + """Serialize the component to a bytestring. + + RETURNS (bytes): The serialized component. 
+ """ return self.s2v.to_bytes(exclude=["strings"]) def from_bytes(self, bytes_data: bytes): + """Load the component from a bytestring. + + bytes_data (bytes): The data to load. + RETURNS (Sense2VecComponent): The loaded object. + """ self.s2v = Sense2Vec().from_bytes(bytes_data, exclude=["strings"]) return self def to_disk(self, path: Union[str, Path]): + """Serialize the component to a directory. + + path (unicode / Path): The path to save to. + """ self.s2v.to_disk(path, exclude=["strings"]) def from_disk(self, path: Union[str, Path]): + """Load the component from a directory. + + path (unicode / Path): The path to load from. + RETURNS (Sense2VecComponent): The loaded object. + """ self.s2v = Sense2Vec().from_disk(path, exclude=["strings"]) return self diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 6c98a66..1fdea3b 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -17,6 +17,19 @@ def __init__( split_key: Callable[[str], Tuple[str, str]] = split_key, senses: List[str] = [], ): + """Initialize the Sense2Vec object. + + shape (tuple): The vector shape. + strings (StringStore): Optional string store. Will be created if it + doesn't exist. + make_key (callable): Optional custom function that takes a word and + sense string and creates the key (e.g. "word|sense"). + split_key (callable): Optional custom function that takes a key and + returns the word and sense (e.g. ("word", "sense")). + senses (list): Optional list of all available senses. Used in methods + that generate the best sense or other senses. + RETURNS (Sense2Vec): The newly constructed object. + """ self.make_key = make_key self.split_key = split_key self.vectors = Vectors(shape=shape) @@ -26,55 +39,99 @@ def __init__( @property def senses(self) -> List[str]: + """RETURNS (list): The available senses.""" return self.cfg.get("senses", []) def __len__(self) -> int: + """RETURNS (int): The number of rows in the vectors table.""" return len(self.vectors) def __contains__(self, key: Union[str, int]) -> bool: + """Check if a key is in the vectors table. + + key (unicode / int): The key to look up. + RETURNS (bool): Whether the key is in the table. + """ key = self.ensure_int_key(key) return key in self.vectors def __getitem__(self, key: Union[str, int]) -> numpy.ndarray: + """Retrieve a vector for a given key. + + key (unicode / int): The key to look up. + RETURNS (numpy.ndarray): The vector. + """ key = self.ensure_int_key(key) if key in self.vectors: return self.vectors[key] def __iter__(self): + """YIELDS (tuple): String key and vector pairs in the table.""" yield from self.items() - def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): - if not isinstance(key, int): - key = self.strings.add(key) - self.vectors.add(key, vector=vector) - if freq is not None: - self.set_freq(key, freq) - def items(self): + """YIELDS (tuple): String key and vector pairs in the table.""" for key, value in self.vectors.items(): yield self.strings[key], value def keys(self): + """YIELDS (unicode): The keys in the table.""" for key in self.vectors.keys(): yield self.strings[key] def values(self): + """YIELDS (numpy.ndarray): The vectors in the table.""" yield from self.vectors.values() + def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): + """Add a new vector to the table. + + key (unicode / int): The key to add. + vector (numpy.ndarray): The vector to add. + freq (int): Optional frequency count. 
+ """ + if not isinstance(key, int): + key = self.strings.add(key) + self.vectors.add(key, vector=vector) + if freq is not None: + self.set_freq(key, freq) + def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]: + """Get the frequency count for a given key. + + key (unicode / int): They key to look up. + default: Default value to return if no frequency is found. + RETURNS (int): The frequency count. + """ key = self.ensure_int_key(key) return self.freqs.get(key, default) - def set_freq(self, key: Union[str, int], value: int): + def set_freq(self, key: Union[str, int], freq: int): + """Set a frequency count for a given key. + + key (unicode / int): The key to set the count for. + freq (int): The frequency count. + """ key = self.ensure_int_key(key) - self.freqs[key] = value + self.freqs[key] = freq def ensure_int_key(self, key: Union[str, int]) -> int: + """Ensure that a key is an int by looking it up in the string store. + + key (unicode / int): The key. + RETURNS (int): The integer key. + """ return key if isinstance(key, int) else self.strings[key] def most_similar( - self, keys: Iterable[str], n_similar: int = 10 + self, keys: Iterable[Union[str, int]], n_similar: int = 10 ) -> List[Tuple[str, float]]: + """Get the most similar entries in the table. + + key (iterable): The string or integer keys to compare to. + n_similar (int): The number of similar keys to return. + RETURNS (list): The keys of the most similar vectors. + """ if not isinstance(keys, (list, tuple)): raise ValueError(f"Expected iterable of keys. Got: {type(keys)}") vecs = [self[key] for key in keys if key in self] @@ -86,16 +143,35 @@ def most_similar( # TODO: handle this better? return result[:n_similar] - def get_other_senses(self, key: str) -> List[str]: + def get_other_senses( + self, key: Union[str, int], ignore_case: bool = True + ) -> List[str]: + """Find other entries for the same word with a different sense, e.g. + "duck|VERB" for "duck|NOUN". + + key (unicode / int): The key to check. + ignore_case (bool): Check for uppercase, lowercase and titlecase. + RETURNS (list): Other entries with different senses. + """ result = [] + key = key if isinstance(key, str) else self.strings[key] word, orig_sense = self.split_key(key) - for sense in self.senses: - new_key = self.make_key(word, sense) - if sense != orig_sense and new_key in self: - result.append(new_key) + versions = [word, word.upper(), word.title()] if ignore_case else [word] + for text in versions: + for sense in self.senses: + new_key = self.make_key(text, sense) + if sense != orig_sense and new_key in self: + result.append(new_key) return result def get_best_sense(self, word: str, ignore_case: bool = True) -> Union[str, None]: + """Find the best-matching sense for a given word based on the available + senses and frequency counts. Returns None if no match is found. + + word (unicode): The word to check. + ignore_case (bool): Check for uppercase, lowercase and titlecase. + RETURNS (unicode): The best-matching sense or None. + """ if not self.senses: return None versions = [word, word.upper(), word.title()] if ignore_case else [word] @@ -109,6 +185,11 @@ def get_best_sense(self, word: str, ignore_case: bool = True) -> Union[str, None return max(freqs)[1] if freqs else None def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + """Serialize a Sense2Vec object to a bytestring. + + exclude (list): Names of serialization fields to exclude. + RETURNS (bytes): The serialized Sense2Vec object. 
+ """ vectors_bytes = self.vectors.to_bytes() freqs = list(self.freqs.items()) data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs} @@ -117,6 +198,12 @@ def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: return srsly.msgpack_dumps(data) def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): + """Load a Sense2Vec object from a bytestring. + + bytes_data (bytes): The data to load. + exclude (list): Names of serialization fields to exclude. + RETURNS (Sense2Vec): The loaded object. + """ data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) self.freqs = dict(data.get("freqs", [])) @@ -125,7 +212,26 @@ def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): self.strings = StringStore().from_bytes(data["strings"]) return self + def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): + """Serialize a Sense2Vec object to a directory. + + path (unicode / Path): The path. + exclude (list): Names of serialization fields to exclude. + """ + path = Path(path) + self.vectors.to_disk(path) + srsly.write_json(path / "cfg", self.cfg) + srsly.write_json(path / "freqs.json", list(self.freqs.items())) + if "strings" not in exclude: + self.strings.to_disk(path / "strings.json") + def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): + """Load a Sense2Vec object from a directory. + + path (unicode / Path): The path to load from. + exclude (list): Names of serialization fields to exclude. + RETURNS (Sense2Vec): The loaded object. + """ path = Path(path) strings_path = path / "strings.json" freqs_path = path / "freqs.json" @@ -136,11 +242,3 @@ def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) return self - - def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): - path = Path(path) - self.vectors.to_disk(path) - srsly.write_json(path / "cfg", self.cfg) - srsly.write_json(path / "freqs.json", list(self.freqs.items())) - if "strings" not in exclude: - self.strings.to_disk(path / "strings.json") diff --git a/sense2vec/util.py b/sense2vec/util.py index e459bbc..fb75f04 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -7,27 +7,23 @@ DEFAULT_SENSE = "?" -def merge_phrases(doc: Doc) -> Doc: - """ - Transform a spaCy Doc to match the sense2vec format: merge entities - into one token and merge noun chunks without determiners. - """ - spans = get_phrases(doc) - spans = filter_spans(spans) - with doc.retokenize() as retokenizer: - for span in spans: - root = span.root - attrs = {"tag": root.tag_, "lemma": root.lemma_, "ent_type": root.ent_type_} - retokenizer.merge(span, attrs=attrs) - return doc - - def make_key(word: str, sense: str) -> str: + """Create a key from a word and sense, e.g. "usage_example|NOUN". + + word (unicode): The word. + sense (unicode): The sense. + RETURNS (unicode): The key. + """ text = re.sub(r"\s", "_", word) return text + "|" + sense def split_key(key: str) -> Tuple[str, str]: + """Split a key into word and sense, e.g. ("usage example", "NOUN"). + + key (unicode): The key to split. + RETURNS (tuple): The split (word, sense) tuple. 
+ """ word, sense = key.replace("_", " ").rsplit("|", 1) return word, sense @@ -35,6 +31,17 @@ def split_key(key: str) -> Tuple[str, str]: def make_spacy_key( obj: Union[Token, Span], make_key: Callable[[str, str], str] = make_key ) -> str: + """Create a key from a spaCy object, i.e. a Token or Span. If the object + is a token, the part-of-speech tag (Token.pos_) is used for the sense + and a special string is created for URLs. If the object is a Span and + has a label (i.e. is an entity span), the label is used. Otherwise, the + span's root part-of-speech tag becomes the sense. + + obj (Token / Span): The spaCy object to create the key for. + make_key (callable): function that takes a word and sense string and + creates the key (e.g. "word|sense"). + RETURNS (unicode): The key. + """ text = obj.text if isinstance(obj, Token): if obj.like_url: @@ -48,6 +55,12 @@ def make_spacy_key( def get_phrases(doc: Doc) -> List[Span]: + """Compile a list of sense2vec phrases based on a processed Doc: named + entities and noun chunks without determiners. + + doc (Doc): The Doc to get phrases from. + RETURNS (list): The phrases as a list of Span objects. + """ spans = list(doc.ents) if doc.is_parsed: for np in doc.noun_chunks: @@ -55,3 +68,18 @@ def get_phrases(doc: Doc) -> List[Span]: np = np[1:] spans.append(np) return spans + + +def merge_phrases(doc: Doc) -> Doc: + """Transform a spaCy Doc to match the sense2vec format: merge entities + into one token and merge noun chunks without determiners. + + doc (Doc): The document to merge phrases in. + RETURNS (Doc): The Doc with merged tokens. + """ + spans = get_phrases(doc) + spans = filter_spans(spans) + with doc.retokenize() as retokenizer: + for span in spans: + retokenizer.merge(span) + return doc From 20d8cb0abe07e44e1a4672b0ea013286eb9d6f8f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 02:27:27 +0200 Subject: [PATCH 065/297] Add more tests --- tests/test_component.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_component.py b/tests/test_component.py index 7866c15..8978796 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -17,12 +17,14 @@ def doc(): def test_component_attributes(doc): s2v = Sense2VecComponent(doc.vocab, shape=(10, 4)) vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) - s2v.s2v.add("world|NOUN", vector) + s2v.s2v.add("world|NOUN", vector, 123) doc = s2v(doc) assert doc[0]._.s2v_key == "hello|INTJ" assert doc[1]._.s2v_key == "world|NOUN" assert doc[0]._.in_s2v is False assert doc[1]._.in_s2v is True + assert doc[0]._.s2v_freq is None + assert doc[1]._.s2v_freq == 123 assert numpy.array_equal(doc[1]._.s2v_vec, vector) From d97ea6f195186e1aed4c9ba57df85b8012531850 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 02:29:43 +0200 Subject: [PATCH 066/297] Update train.py --- bin/train.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/train.py b/bin/train.py index 4ac425c..cb97a80 100644 --- a/bin/train.py +++ b/bin/train.py @@ -1,3 +1,4 @@ +#!/usr/bin/env python from gensim.models import Word2Vec from gensim.models.word2vec import PathLineSentences from sense2vec import Sense2Vec From 688d1429ff878704d7e664f803948a724f78699c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 02:42:09 +0200 Subject: [PATCH 067/297] Update train.py --- bin/train.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/bin/train.py b/bin/train.py index cb97a80..420c9c6 100644 --- a/bin/train.py +++ 
b/bin/train.py @@ -4,28 +4,35 @@ from sense2vec import Sense2Vec from sense2vec.util import split_key import plac +import logging @plac.annotations( in_dir=("Location of input directory", "positional", None, str), - out_file=("Location of output file", "positional", None, str), + out_dir=("Location of output directory", "positional", None, str), n_workers=("Number of workers", "option", "n", int), size=("Dimension of the word vectors", "option", "d", int), window=("Context window size", "option", "w", int), min_count=("Min count", "option", "m", int), negative=("Number of negative samples", "option", "g", int), nr_iter=("Number of iterations", "option", "i", int), + verbose=("Log debugging info", "flag", "V", bool), ) def main( in_dir, - out_file, + out_dir, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2, + verbose=False, ): + if verbose: + logging.basicConfig( + format="%(asctime)s - %(message)s", datefmt="%H:%M:%S", level=logging.INFO + ) w2v_model = Word2Vec( size=size, window=window, @@ -55,11 +62,11 @@ def main( _, sense = split_key(string) all_senses.add(sense) s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses) - for string, _, vector in vectors: - s2v.add(string, vector) + for string, freq, vector in vectors: + s2v.add(string, vector, freq) print("Saving the model...") - s2v.to_disk(out_file) - print(f"Saved model to file: {out_file}") + s2v.to_disk(out_dir) + print(f"Saved model to directory: {out_dir}") if __name__ == "__main__": From cd10a5928da5f6a2097b26b837329d3d7f69fa4c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 02:53:20 +0200 Subject: [PATCH 068/297] Tidy up --- .gitignore | 1 + bin/push-tag.sh => push-tag.sh | 0 requirements.txt | 4 ++-- {bin => scripts}/preprocess.py | 0 scripts/requirements.txt | 3 +++ {bin => scripts}/train.py | 0 6 files changed, 6 insertions(+), 2 deletions(-) rename bin/push-tag.sh => push-tag.sh (100%) rename {bin => scripts}/preprocess.py (100%) create mode 100644 scripts/requirements.txt rename {bin => scripts}/train.py (100%) diff --git a/.gitignore b/.gitignore index fade098..ea09aa7 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ cythonize.dat .pytest_cache .vscode .mypy_cache +.prettierrc # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/bin/push-tag.sh b/push-tag.sh similarity index 100% rename from bin/push-tag.sh rename to push-tag.sh diff --git a/requirements.txt b/requirements.txt index 3c46fe2..8968295 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ -spacy>=2.1.0 +spacy>=2.1.0,<2.3.0 numpy>=1.15.0 srsly>=0.1.0 # Development requirements -pytest>=4.1.0 +pytest>=4.1.0,<4.2.0 diff --git a/bin/preprocess.py b/scripts/preprocess.py similarity index 100% rename from bin/preprocess.py rename to scripts/preprocess.py diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..88c05a9 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1,3 @@ +plac>=0.9.6,<1.0.0 +tqdm>=4.36.1,<5.0.0 +gensim>=3.8.1,<4.0.0 diff --git a/bin/train.py b/scripts/train.py similarity index 100% rename from bin/train.py rename to scripts/train.py From 37556a004542db6f4620fee5c1c839d1c31e5bce Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 03:16:54 +0200 Subject: [PATCH 069/297] Update setup.py --- setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 6d113fd..fe6e3d5 100644 --- a/setup.py +++ b/setup.py @@ -35,7 +35,11 @@ def setup_package(): 
install_requires=["spacy>=2.1.0", "numpy>=1.15.0", "srsly>=0.1.0"], python_requires=">=3.6", entry_points={ - "spacy_factories": ["sense2vec = sense2vec:Sense2VecComponent.from_nlp"] + "spacy_factories": ["sense2vec = sense2vec:Sense2VecComponent.from_nlp"], + "prodigy_recipes": [ + "sense2vec_teach = prodigy_recipes:teach", + "sens2vec_to_patterns = prodigy_recipes:to_patterns", + ], }, classifiers=[ "Development Status :: 5 - Production/Stable", From 67526aace74dba85d94fc05700e5f99102f97da4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 03:34:58 +0200 Subject: [PATCH 070/297] Update prodigy_recipes.py --- prodigy_recipes.py | 246 +++++++++++++++------------------------------ 1 file changed, 79 insertions(+), 167 deletions(-) diff --git a/prodigy_recipes.py b/prodigy_recipes.py index 0cab5f1..dc6d7c6 100644 --- a/prodigy_recipes.py +++ b/prodigy_recipes.py @@ -1,204 +1,116 @@ -# coding: utf8 -from __future__ import unicode_literals -import sys -from pathlib import Path - import prodigy -from prodigy.core import recipe_args from prodigy.components.db import connect -from prodigy.util import log, prints, split_string, set_hashes -import sense2vec -from spacy.lang.en import English +from prodigy.util import log, split_string, set_hashes +from sense2vec import Sense2Vec import srsly +import spacy -@prodigy.recipe('sense2vec.teach', - dataset=recipe_args["dataset"], - vectors_path=("Path to pretrained sense2vec vectors"), +@prodigy.recipe( + "sense2vec.teach", + dataset=("Dataset to save annotations to", "positional", None, str), + vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), seeds=("One or more comma-separated seed terms", "option", "se", split_string), threshold=("Similarity threshold for sense2vec", "option", "t", float), - top_n=("Only get the top n results for each accepted sense2vec term", "option", "n", int), + top_n=("Only get the top n results for each accepted term", "option", "n", int), batch_size=("Batch size for submitting annotations", "option", "bs", int), - resume=("Resume from existing phrases dataset", "flag", "R", bool) + resume=("Resume from existing phrases dataset", "flag", "R", bool), ) -def teach(dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5, resume=False): +def teach( + dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5, resume=False +): """ - Bootstrap a terminology list sense2vec. Prodigy - will suggest similar terms based on the the most similar - phrases from sense2vec + Bootstrap a terminology list sense2vec. Prodigy will suggest similar terms + based on the the most similar phrases from sense2vec. 
""" - SENSES = ["auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", - "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", - "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", - "WORK_OF_ART", "LANGUAGE"] - - log("RECIPE: Starting recipe phrases.to-patterns", locals()) - LEMMATIZER = English().vocab.morphology.lemmatizer - S2V = sense2vec.load(vectors_path) - log("RECIPE: Finished loading sense2vec", locals()) - - # Seems to be a bug in sense2vec which gets < n similar senses not <= n - batch_size = min(batch_size, top_n * len(seeds)) - top_n = top_n + 1 - + log("RECIPE: Starting recipe sense2vec.teach", locals()) + s2v = Sense2Vec().from_disk(vectors_path) + log("RECIPE: Loaded sense2vec", locals()) + seed_keys = [] + for seed in seeds: + best_word, best_sense = s2v.get_best_sense(seed) + if best_sense is None: + raise ValueError(f"Can't find seed term '{seed}' in vectors") + seed_keys.append(s2v.make_key(best_word, best_sense)) + print(f"Starting with seed keys: {seed_keys}") DB = connect() - seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds] + seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seed_keys] DB.add_examples(seed_tasks, datasets=[dataset]) - - accept_phrases = seeds - reject_phrases = [] - - seen = set(accept_phrases) - sensed = set() + accept_keys = seed_keys + reject_keys = [] + seen = set(accept_keys) if resume: prev = DB.get_dataset(dataset) prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"] prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"] - accept_phrases += prev_accept - reject_phrases += prev_reject - - seen.update(set(accept_phrases)) - seen.update(set(reject_phrases)) - log("RECIPE: Resuming from {} previous examples in dataset {}".format(len(prev), dataset)) - - def format_for_s2v(word, sense): - return word.replace(" ", "_") + "|" + sense - - def get_best(word, sense): - if sense != "auto": # if sense is specified, find respective entry - if format_for_s2v(word, sense) in S2V: - return (word, sense) - return (None, None) - freqs = [] - casings = [word, word.upper(), word.title()] if word.islower() else [word] - for text in casings: # try options - for tag in SENSES: - query = format_for_s2v(text, tag) - if query in S2V: - freqs.append((S2V[query][0], (text, tag))) - return max(freqs)[1] if freqs else (None, None) - - def get_similar(word, sense, n=100): - query = format_for_s2v(word, sense) - if query not in S2V: - return [] - freq, query_vector = S2V[query] - words, scores = S2V.most_similar(query_vector, n) - words = [word.rsplit("|", 1) for word in words] - # Don't know why we'd be getting unsensed entries, but fix. 
- words = [entry for entry in words if len(entry) == 2] - words = [(word.replace("_", " "), sense) for word, sense in words] - return zip(words, scores) - - def find_similar(word: str, sense: str = "auto", n_results: int = top_n): - """Find similar terms for a given term and optional sense.""" - best_word, best_sense = get_best(word, sense) - results = [] - if not word or not best_word: - return results - seen = set([best_word, min(LEMMATIZER(best_word, best_sense))]) - similar = get_similar(best_word, best_sense, n_results) - for (word_entry, sense_entry), score in similar: - head = min(LEMMATIZER(word_entry, sense_entry)) - if head not in seen and score > threshold: - freq, _ = S2V[format_for_s2v(word_entry, sense_entry)] - results.append((score, word_entry)) - seen.add(head) - if len(results) >= n_results: - break - return results + accept_keys += prev_accept + reject_keys += prev_reject + seen.update(set(accept_keys)) + seen.update(set(reject_keys)) + log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}") def update(answers): - """Updates accept_phrases so that the stream can find new phrases""" + """Updates accept_keys so that the stream can find new phrases.""" + log(f"RECIPE: Updating with {len(answers)} answers") for answer in answers: - if answer['answer'] == 'accept': - accept_phrases.append(answer['text']) - elif answer['answer'] == 'reject': - reject_phrases.append(answer['text']) - + phrase = answer["text"] + if answer["answer"] == "accept": + accept_keys.append(phrase) + elif answer["answer"] == "reject": + reject_keys.append(phrase) + def get_stream(): - """Continue querying sense2vec whenever we get a new phrase and presenting - examples to the user with a similarity above the threshold parameter""" + """Continue querying sense2vec whenever we get a new phrase and + presenting examples to the user with a similarity above the threshold + parameter.""" while True: - seen.update(set([rp.lower() for rp in reject_phrases])) - for p in accept_phrases: - if p.lower() not in sensed: - sensed.add(p.lower()) - for score, phrase in find_similar(p): - if phrase.lower() not in seen: - seen.add(phrase.lower()) - yield {"text": phrase, 'meta': {'score': score}} + log(f"RECIPE: Getting {top_n} similar phrases") + most_similar = s2v.most_similar(accept_keys, n_similar=top_n) + for key, score in most_similar: + if key not in seen and score > threshold: + seen.add(key) + word, sense = s2v.split_key(key) + yield { + "text": key, + "word": word, + "sense": sense, + "meta": {"score": score}, + } stream = get_stream() return { - 'view_id': 'text', - 'dataset': dataset, - 'stream': stream, - 'update': update, - 'config': { - 'batch_size': batch_size - } + "view_id": "html", + "dataset": dataset, + "stream": stream, + "update": update, + "config": {"batch_size": batch_size, "html_template": "{{word}} ({{sense}})"}, } @prodigy.recipe( "sense2vec.to-patterns", - dataset=recipe_args["dataset"], - label=recipe_args["label"], - output_file=recipe_args["output_file"], + dataset=("Dataset to save annotations to", "positional", None, str), + spacy_model=("spaCy model for tokenization", "positional", None, str), + label=("Label to apply to all patterns", "positional", None, str), + output_file=("Optional output file. Defaults to stdout", "option", "o", str), ) -def to_patterns(dataset=None, label=None, output_file=None): +def to_patterns(dataset, spacy_model, label, output_file="-"): """ Convert a list of seed phrases to a list of match patterns that can be used - with ner.match. 
If no output file is specified, each pattern is printed - so the recipe's output can be piped forward to ner.match. - - This is pretty much an exact copy of terms.to-patterns. - The pattern for each example is just split on whitespace so instead of: - - {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new balance"}]} - - - which won't match anything you'll get: - - {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]} + with ner.match. If no output file is specified, each pattern is printed. + The examples are tokenized to make sure that multi-token terms are + represented correctly, e.g.: + {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]} """ - if label is None: - prints( - "--label is a required argument", - "This is the label that will be assigned to all patterns " - "created from terms collected in this dataset. ", - exits=1, - error=True, - ) - + log("RECIPE: Starting recipe sense2vec.to-patterns", locals()) + nlp = spacy.load(spacy_model) + log(f"RECIPE: Loaded spaCy model '{spacy_model}'") DB = connect() - - def get_pattern(term, label): - return {"label": label, "pattern": [{"lower": t.lower()} for t in term["text"].split()]} - - log("RECIPE: Starting recipe phrases.to-patterns", locals()) - if dataset is None: - log("RECIPE: Reading input terms from sys.stdin") - terms = (srsly.json_loads(line) for line in sys.stdin) - else: - if dataset not in DB: - prints("Can't find dataset '{}'".format(dataset), exits=1, error=True) - terms = DB.get_dataset(dataset) - log( - "RECIPE: Reading {} input phrases from dataset {}".format(len(terms), dataset) - ) - if output_file: - patterns = [ - get_pattern(term, label) for term in terms if term["answer"] == "accept" - ] - log("RECIPE: Generated {} patterns".format(len(patterns))) - srsly.write_jsonl(output_file, patterns) - prints("Exported {} patterns".format(len(patterns)), output_file) - else: - log("RECIPE: Outputting patterns") - for term in terms: - if term["answer"] == "accept": - print(srsly.json_dumps(get_pattern(term, label))) + examples = DB.get_dataset(dataset) + terms = [eg["text"] for eg in examples if eg["answer"] == "accept"] + patterns = [{"lower": t.lower_ for t in nlp.make_doc(term)} for term in terms] + patterns = [{"label": label, "pattern": pattern} for pattern in patterns] + log(f"RECIPE: Generated {len(patterns)} patterns") + srsly.write_jsonl(output_file, patterns) From 4b7620328ade3e4f24f2697822f5bc7ae7337b88 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 03:37:22 +0200 Subject: [PATCH 071/297] Update prodigy_recipes.py --- prodigy_recipes.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/prodigy_recipes.py b/prodigy_recipes.py index dc6d7c6..3ad9b8e 100644 --- a/prodigy_recipes.py +++ b/prodigy_recipes.py @@ -27,14 +27,17 @@ def teach( s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec", locals()) seed_keys = [] + seed_tasks = [] for seed in seeds: best_word, best_sense = s2v.get_best_sense(seed) if best_sense is None: raise ValueError(f"Can't find seed term '{seed}' in vectors") - seed_keys.append(s2v.make_key(best_word, best_sense)) + key = s2v.make_key(best_word, best_sense) + seed_keys.append(key) + task = {"text": key, "word": best_word, "sense": best_sense, "answer": "accept"} + seed_tasks.append(set_hashes(task)) print(f"Starting with seed keys: {seed_keys}") DB = connect() - seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seed_keys] DB.add_examples(seed_tasks, 
datasets=[dataset]) accept_keys = seed_keys reject_keys = [] @@ -71,12 +74,8 @@ def get_stream(): if key not in seen and score > threshold: seen.add(key) word, sense = s2v.split_key(key) - yield { - "text": key, - "word": word, - "sense": sense, - "meta": {"score": score}, - } + meta = {"score": score} + yield {"text": key, "word": word, "sense": sense, "meta": meta} stream = get_stream() From 1993e85b345da9a66a6dea3d4e877e2f70336d38 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 28 Sep 2019 03:38:40 +0200 Subject: [PATCH 072/297] Fix recipes --- prodigy_recipes.py => sense2vec/prodigy_recipes.py | 0 setup.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename prodigy_recipes.py => sense2vec/prodigy_recipes.py (100%) diff --git a/prodigy_recipes.py b/sense2vec/prodigy_recipes.py similarity index 100% rename from prodigy_recipes.py rename to sense2vec/prodigy_recipes.py diff --git a/setup.py b/setup.py index fe6e3d5..27b87a6 100644 --- a/setup.py +++ b/setup.py @@ -37,8 +37,8 @@ def setup_package(): entry_points={ "spacy_factories": ["sense2vec = sense2vec:Sense2VecComponent.from_nlp"], "prodigy_recipes": [ - "sense2vec_teach = prodigy_recipes:teach", - "sens2vec_to_patterns = prodigy_recipes:to_patterns", + "sense2vec_teach = sense2vec:prodigy_recipes.teach", + "sens2vec_to_patterns = sense2vec:prodigy_recipes.to_patterns", ], }, classifiers=[ From 8a4706ec186894cfa9743b57c1fc9449380b9d0d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Oct 2019 18:01:03 +0200 Subject: [PATCH 073/297] Use setup.cfg --- .flake8 | 8 ------- sense2vec/about.py | 6 ------ setup.cfg | 52 ++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 41 +----------------------------------- 4 files changed, 53 insertions(+), 54 deletions(-) delete mode 100644 .flake8 create mode 100644 setup.cfg diff --git a/.flake8 b/.flake8 deleted file mode 100644 index 55b597f..0000000 --- a/.flake8 +++ /dev/null @@ -1,8 +0,0 @@ -[flake8] -ignore = E203, E266, E501, E731, W503 -max-line-length = 80 -select = B,C,E,F,W,T4,B9 -exclude = - .env, - .git, - __pycache__, diff --git a/sense2vec/about.py b/sense2vec/about.py index bae518e..6559039 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1,7 +1 @@ -__title__ = "sense2vec" __version__ = "1.0.0a2" -__summary__ = "Use NLP to go beyond vanilla word2vec" -__uri__ = "https://github.com/explosion/sense2vec" -__author__ = "Explosion" -__email__ = "contact@explosion.ai" -__license__ = "MIT" diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..610da40 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,52 @@ +[metadata] +description = Use NLP to go beyond vanilla word2vec +url = https://github.com/explosion/sense2vec +author = Explosion +author_email = contact@explosion.ai +license = MIT +long_description = file: README.md +long_description_content_type = text/markdown +classifiers = + Development Status :: 5 - Production/Stable + Environment :: Console + Intended Audience :: Developers + Intended Audience :: Science/Research + License :: OSI Approved :: MIT License + Operating System :: POSIX :: Linux + Operating System :: MacOS :: MacOS X + Operating System :: Microsoft :: Windows + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Topic :: Scientific/Engineering + +[options] +zip_safe = false +include_package_data = true +python_requires = >=3.6 +install_requires = + spacy>=2.1.0 + numpy>=1.15.0 + srsly>=0.1.0 + +[options.entry_points] +spacy_factories = + 
sense2vec = sense2vec:Sense2VecComponent.from_nlp +prodigy_recipes = + sense2vec_teach = sense2vec:prodigy_recipes.teach + sens2vec_to_patterns = sense2vec:prodigy_recipes.to_patterns + +[bdist_wheel] +universal = true + +[sdist] +formats = gztar + +[flake8] +ignore = E203, E266, E501, E731, W503 +max-line-length = 80 +select = B,C,E,F,W,T4,B9 +exclude = + .env, + .git, + __pycache__, diff --git a/setup.py b/setup.py index 27b87a6..eed8126 100644 --- a/setup.py +++ b/setup.py @@ -9,52 +9,13 @@ def setup_package(): package_name = "sense2vec" root = os.path.abspath(os.path.dirname(__file__)) - # Read in package meta from about.py about_path = os.path.join(root, package_name, "about.py") with io.open(about_path, encoding="utf8") as f: about = {} exec(f.read(), about) - # Get readme - readme_path = os.path.join(root, "README.md") - with io.open(readme_path, encoding="utf8") as f: - readme = f.read() - - setup( - name="sense2vec", - description=about["__summary__"], - long_description=readme, - long_description_content_type="text/markdown", - author=about["__author__"], - author_email=about["__email__"], - url=about["__uri__"], - version=about["__version__"], - license=about["__license__"], - packages=find_packages(), - install_requires=["spacy>=2.1.0", "numpy>=1.15.0", "srsly>=0.1.0"], - python_requires=">=3.6", - entry_points={ - "spacy_factories": ["sense2vec = sense2vec:Sense2VecComponent.from_nlp"], - "prodigy_recipes": [ - "sense2vec_teach = sense2vec:prodigy_recipes.teach", - "sens2vec_to_patterns = sense2vec:prodigy_recipes.to_patterns", - ], - }, - classifiers=[ - "Development Status :: 5 - Production/Stable", - "Intended Audience :: Developers", - "Intended Audience :: Science/Research", - "Topic :: Scientific/Engineering :: Artificial Intelligence", - "License :: OSI Approved :: MIT License", - "Operating System :: POSIX :: Linux", - "Operating System :: MacOS :: MacOS X", - "Operating System :: Microsoft :: Windows", - "Programming Language :: Python :: 3.6", - "Programming Language :: Python :: 3.7", - ], - zip_safe=False, - ) + setup(name="sense2vec", version=about["__version__"], packages=find_packages()) if __name__ == "__main__": From a7f1204ad12a8c07a6e8113409588310e433cde1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Oct 2019 18:02:21 +0200 Subject: [PATCH 074/297] Update README.md --- README.md | 148 ++++++++++++++++++++++++++---------------------------- 1 file changed, 72 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index 1838013..0a4d583 100644 --- a/README.md +++ b/README.md @@ -3,77 +3,75 @@ # sense2vec: Use NLP to go beyond vanilla word2vec sense2vec [Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice -twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you -learn more interesting, detailed and context-sensitive word vectors. For an +twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn +more interesting, detailed and context-sensitive word vectors. For an interactive example of the technology, see our [sense2vec demo](https://demos.explosion.ai/sense2vec) that lets you explore -semantic similarities across all Reddit comments of 2015. +semantic similarities across all Reddit comments of 2015. This library is a +simple Python implementation for loading and querying sense2vec models. -This library is a simple Python/Cython implementation for loading and querying -sense2vec models. 
While it's best used in combination with -[spaCy](https://spacy.io), the `sense2vec` library itself is very lightweight -and can also be used as a standalone module. See below for usage details. +🦆 **Version 1.0 out now!** +[Read the release notes here.](https://github.com/explosion/sense2vec/releases/) -🦆 **Version 1.0 alpha out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) - -[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-devops&style=flat-square)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) -[![Current Release Version](https://img.shields.io/github/v/release/explosion/sense2vec.svg?style=flat-square&include_prereleases)](https://github.com/explosion/sense2vec/releases) -[![pypi Version](https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square)](https://pypi.org/project/sense2vec/) +[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) +[![Current Release Version](https://img.shields.io/github/v/release/explosion/sense2vec.svg?style=flat-square&include_prereleases&logo=github)](https://github.com/explosion/sense2vec/releases) +[![pypi Version](https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/sense2vec/) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) ## Usage Examples -### Usage with spaCy +### Standalone usage + +```python +from sense2vec import Sense2Vec + +s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") +query = "natural_language_processing|NOUN" +assert query in s2v +vector = s2v[query] +freq = s2v.get_freq(query) +most_similar = s2v.most_similar(query, 3) +# [('natural_language_processing|NOUN', 1.0), +# ('machine_learning|NOUN', 0.8986966609954834), +# ('computer_vision|NOUN', 0.8636297583580017)] +``` + +### Usage as a spaCy pipeline component ```python import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent("/path/to/reddit_vectors-1.1.0") +s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/reddit_vectors-1.1.0") nlp.add_pipe(s2v) doc = nlp("A sentence about natural language processing.") -assert doc[3].text == "natural language processing" -freq = doc[3]._.s2v_freq -vector = doc[3]._.s2v_vec -most_similar = doc[3]._.s2v_most_similar(3) +assert doc[3:6].text == "natural language processing" +freq = doc[3:6]._.s2v_freq +vector = doc[3:6]._.s2v_vec +most_similar = doc[3:6]._.s2v_most_similar(3) # [(('natural language processing', 'NOUN'), 1.0), # (('machine learning', 'NOUN'), 0.8986966609954834), # (('computer vision', 'NOUN'), 0.8636297583580017)] ``` -### Standalone usage without spaCy - -```python -import sense2vec - -s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") -query = "natural_language_processing|NOUN" -assert query in s2v -freq, vector = s2v[query] -words, scores = s2v.most_similar(vector, 3) -most_similar = list(zip(words, scores)) -# [('natural_language_processing|NOUN', 1.0), -# ('machine_learning|NOUN', 0.8986966609954834), -# ('computer_vision|NOUN', 0.8636297583580017)] -``` - ## Installation & Setup sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a1 +pip install sense2vec ``` The Reddit vectors model is attached to the 
[latest release](https://github.com/explosion/sense2vec/releases). To load it -in, download the `.tar.gz` archive, unpack it and point `sense2vec.load` to -the extracted data directory: +in, download the `.tar.gz` archive, unpack it and point `from_disk` to the +extracted data directory: ```python -import sense2vec -s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") +from sense2vec import Sense2Vec +s2v = Sense2Vec.from_disk("/path/to/reddit_vectors-1.1.0") ``` ## Usage @@ -85,12 +83,12 @@ pipeline. Note that `sense2vec` doesn't depend on spaCy, so you'll have to install it separately and download the English model. ```bash -pip install -U spacy==2.0.0 +pip install -U spacy python -m spacy download en_core_web_sm ``` -The `sense2vec` package exposes a `Sense2VecComponent`, which can be -initialised with the data path and added to your spaCy pipeline as a +The `sense2vec` package exposes a `Sense2VecComponent`, which can be initialised +with the shared vocab and added to your spaCy pipeline as a [custom pipeline component](https://spacy.io/usage/processing-pipelines#custom-components). By default, components are added to the _end of the pipeline_, which is the recommended position for this component, since it needs access to the dependency @@ -101,29 +99,28 @@ import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent("/path/to/reddit_vectors-1.1.0") +s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/reddit_vectors-1.1.0") nlp.add_pipe(s2v) ``` -The pipeline component will **merge noun phrases and entities** according to -the same schema used when training the sense2vec models (e.g. noun chunks -without determiners like "the"). This ensures that you'll be able to retrieve -meaningful vectors for phrases in your text. The component will also add -serveral [extension attributes and methods](https://spacy.io/usage/processing-pipelines#custom-components-attributes) +The pipeline component will **merge noun phrases and entities** according to the +same schema used when training the sense2vec models (e.g. noun chunks without +determiners like "the"). This ensures that you'll be able to retrieve meaningful +vectors for phrases in your text. The component will also add serveral +[extension attributes and methods](https://spacy.io/usage/processing-pipelines#custom-components-attributes) to spaCy's `Token` and `Span` objects that let you retrieve vectors and frequencies, as well as most similar terms. ```python doc = nlp("A sentence about natural language processing.") -assert doc[3].text == "natural language processing" -assert doc[3]._.in_s2v -freq = doc[3]._.s2v_freq -vector = doc[3]._.s2v_vec -most_similar = doc[3]._.s2v_most_similar(10) +assert doc[3:6].text == "natural language processing" +freq = doc[3:6]._.s2v_freq +vector = doc[3:6]._.s2v_vec +most_similar = doc[3:6]._.s2v_most_similar(3) ``` -For entities, the entity labels are used as the "sense" (instead of the -token's part-of-speech tag): +For entities, the entity labels are used as the "sense" (instead of the token's +part-of-speech tag): ```python doc = nlp("A sentence about Facebook and Google.") @@ -147,10 +144,10 @@ The following attributes are available via the `._` property – for example > ⚠️ **A note on span attributes:** Under the hood, entities in `doc.ents` are > `Span` objects. This is why the pipeline component also adds attributes and > methods to spans and not just tokens. 
However, it's not recommended to use the -> sense2vec attributes on arbitrary slices of the document, since the model likely -> won't have a key for the respective text. `Span` objects also don't have a -> part-of-speech tag, so if no entity label is present, the "sense" defaults to -> the root's part-of-speech tag. +> sense2vec attributes on arbitrary slices of the document, since the model +> likely won't have a key for the respective text. `Span` objects also don't +> have a part-of-speech tag, so if no entity label is present, the "sense" +> defaults to the root's part-of-speech tag. ### Standalone usage @@ -162,12 +159,12 @@ import sense2vec s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") ``` -`sense2vec.load` returns an instance of the `VectorMap` class, which you -can interact with via the following methods. +`sense2vec.load` returns an instance of the `VectorMap` class, which you can +interact with via the following methods. > ⚠️ **Important note:** When interacting with the `VectorMap` directly, the -> keys need to follow the scheme of `phrase_text|SENSE` (note the `_` instead -> of spaces and the `|` before the tag or label) – for example, +> keys need to follow the scheme of `phrase_text|SENSE` (note the `_` instead of +> spaces and the `|` before the tag or label) – for example, > `machine_learning|NOUN`. Also note that the underlying vector table is > case-sensitive. @@ -186,11 +183,10 @@ assert len(s2v) == 1195261 #### method `VectorMap.__contains__` -Check whether the `VectorMap` has a given key. Keys consist of the word -string, a pipe and the "sense", i.e. the part-of-speech tag or entity label. -For example: `'duck|NOUN'` or `'duck|VERB'`. See the section on "Senses" -below for more details. Also note that the underlying vector table is -**case-sensitive**. +Check whether the `VectorMap` has a given key. Keys consist of the word string, +a pipe and the "sense", i.e. the part-of-speech tag or entity label. For +example: `'duck|NOUN'` or `'duck|VERB'`. See the section on "Senses" below for +more details. Also note that the underlying vector table is **case-sensitive**. | Argument | Type | Description | | ----------- | ------- | ----------------------------------- | @@ -205,9 +201,9 @@ assert "dkdksl|VERB" not in s2v #### method `VectorMap.__getitem__` -Retrieve a `(frequency, vector)` tuple from the vector map. The frequency is -an integer, the vector a `numpy.ndarray(dtype='float32')`. If the key is not -found, a `KeyError` is raised. +Retrieve a `(frequency, vector)` tuple from the vector map. The frequency is an +integer, the vector a `numpy.ndarray(dtype='float32')`. If the key is not found, +a `KeyError` is raised. | Argument | Type | Description | | ----------- | ------- | ------------------------------------------------- | @@ -220,8 +216,8 @@ freq, vector = s2v["duck|NOUN"] #### method `VectorMap.__setitem__` -Assign a `(frequency, vector)` tuple to the vector map. The frequency should -be an integer, the vector a `numpy.ndarray(dtype='float32')`. +Assign a `(frequency, vector)` tuple to the vector map. The frequency should be +an integer, the vector a `numpy.ndarray(dtype='float32')`. | Argument | Type | Description | | -------- | ------- | ---------------------------------------------- | @@ -263,9 +259,9 @@ integer, the vector a `numpy.ndarray(dtype='float32')` #### method `VectorMap.most_similar` -Find the keys of the `n` most similar entries, given a vector. 
Note that -the _most_ similar entry with a score of `1.0` will be the key of the query -vector itself. +Find the keys of the `n` most similar entries, given a vector. Note that the +_most_ similar entry with a score of `1.0` will be the key of the query vector +itself. | Argument | Type | Description | | ----------- | -------------------------------- | -------------------------------------------------- | From 4e7aceebe2a24ce70b0736ad50e0345533314550 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 4 Oct 2019 18:02:53 +0200 Subject: [PATCH 075/297] Update most_similar for new spaCy version --- sense2vec/component.py | 6 +++--- sense2vec/prodigy_recipes.py | 2 +- sense2vec/sense2vec.py | 37 +++++++++++++++++++++++++----------- setup.cfg | 2 +- tests/test_sense2vec.py | 20 ++++++++++++++----- 5 files changed, 46 insertions(+), 21 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 6a91f7c..93a6ee8 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -105,16 +105,16 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: return make_spacy_key(obj, obj.doc._._s2v.make_key) def s2v_most_similar( - self, obj: Union[Token, Span], n_similar: int = 10 + self, obj: Union[Token, Span], n: int = 10 ) -> List[Tuple[Tuple[str, str], float]]: """Extension attribute method. Get the most similar entries. - n_similar (int): The number of similar entries to return. + n (int): The number of similar entries to return. RETURNS (list): The most similar entries as a list of ((word, sense), score) tuples. """ key = self.s2v_key(obj) - results = obj.doc._._s2v.most_similar([key], n_similar=n_similar) + results = obj.doc._._s2v.most_similar([key], n=n) return [(self.s2v.split_key(result), score) for result, score in results] def s2v_other_senses(self, obj: Union[Token, Span]) -> List[str]: diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 3ad9b8e..7b1f913 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -69,7 +69,7 @@ def get_stream(): parameter.""" while True: log(f"RECIPE: Getting {top_n} similar phrases") - most_similar = s2v.most_similar(accept_keys, n_similar=top_n) + most_similar = s2v.most_similar(accept_keys, n=top_n) for key, score in most_similar: if key not in seen and score > threshold: seen.add(key) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 1fdea3b..eab4e2e 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,4 +1,5 @@ from typing import Callable, Tuple, List, Union, Iterable, Dict +from collections import OrderedDict from pathlib import Path from spacy.vectors import Vectors from spacy.strings import StringStore @@ -124,24 +125,38 @@ def ensure_int_key(self, key: Union[str, int]) -> int: return key if isinstance(key, int) else self.strings[key] def most_similar( - self, keys: Iterable[Union[str, int]], n_similar: int = 10 + self, + keys: Union[Iterable[Union[str, int]], str, int], + n: int = 10, + batch_size: int = 16, ) -> List[Tuple[str, float]]: """Get the most similar entries in the table. key (iterable): The string or integer keys to compare to. - n_similar (int): The number of similar keys to return. + n (int): The number of similar keys to return. + batch_size (int): The batch size to use. RETURNS (list): The keys of the most similar vectors. """ - if not isinstance(keys, (list, tuple)): - raise ValueError(f"Expected iterable of keys. 
Got: {type(keys)}") - vecs = [self[key] for key in keys if key in self] - queries = numpy.asarray(vecs, dtype=numpy.float32) - result_keys, _, scores = self.vectors.most_similar(queries) - result = list(zip(result_keys, scores)) - result = [(self.strings[key], score) for key, score in result if key] + if isinstance(keys, (str, int)): + keys = [keys] + # Always ask for more because we'll always get the keys themselves + n_similar = n + len(keys) + for key in keys: + if key not in self: + raise ValueError(f"Can't find key {key} in table") + if len(self.vectors) < n_similar: + raise ValueError( + f"Can't get {n} most similar out of {len(self.vectors)} total " + f"entries in the table while excluding the {len(keys)} keys" + ) + vecs = [self[key] for key in keys] + result_keys, _, scores = self.vectors.most_similar( + numpy.vstack(vecs), n=n_similar, batch_size=batch_size + ) + result = OrderedDict(zip(result_keys.flatten(), scores.flatten())) + result = [(self.strings[key], score) for key, score in result.items() if key] result = [(key, score) for key, score in result if key not in keys] - # TODO: handle this better? - return result[:n_similar] + return result def get_other_senses( self, key: Union[str, int], ignore_case: bool = True diff --git a/setup.cfg b/setup.cfg index 610da40..1c7917e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -25,7 +25,7 @@ zip_safe = false include_package_data = true python_requires = >=3.6 install_requires = - spacy>=2.1.0 + spacy>=2.2.0 numpy>=1.15.0 srsly>=0.1.0 diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 7f8093b..5a5f389 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -69,12 +69,22 @@ def test_sense2vec_most_similar(): s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32)) s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32)) - result1 = s2v.most_similar(["x"]) - assert len(result1) + result1 = s2v.most_similar(["x"], n=2) + assert len(result1) == 2 assert result1[0][0] == "a" - # assert result1[0][1] == 1.0 - result2 = s2v.most_similar(["y"]) - assert len(result2) == 0 + # TODO: assert result1[0][1] == 1.0 + assert result1[0][1] == pytest.approx(1.0) + assert result1[1][0] == "b" + result2 = s2v.most_similar(["a", "x"], n=2) + assert len(result2) == 2 + assert sorted([key for key, _ in result2]) == ["b", "d"] + result3 = s2v.most_similar(["a", "b"], n=3) + assert len(result3) == 3 + assert "y" not in [key for key, _ in result3] + with pytest.raises(ValueError): + s2v.most_similar(["a", "b"], n=10) # not enough keys left in the table + with pytest.raises(ValueError): + s2v.most_similar(["z"], n=1) # key not in table def test_sense2vec_to_from_bytes(): From 7e1d10188ea68039b173783858b9ea849ac9650b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 19 Oct 2019 17:50:47 +0200 Subject: [PATCH 076/297] Update sense2vec.py --- sense2vec/sense2vec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index eab4e2e..efdd61f 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -96,6 +96,7 @@ def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): self.vectors.add(key, vector=vector) if freq is not None: self.set_freq(key, freq) + # TODO: add sense to senses? def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]: """Get the frequency count for a given key. 
From 5e9df403f7fe232666a451145772394dd442ae7d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 19 Oct 2019 17:50:59 +0200 Subject: [PATCH 077/297] Update test data with cfg and freqs --- tests/data/cfg | 7 ++++++- tests/data/freqs.json | 22 ++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) create mode 100644 tests/data/freqs.json diff --git a/tests/data/cfg b/tests/data/cfg index 0967ef4..35aa2e9 100644 --- a/tests/data/cfg +++ b/tests/data/cfg @@ -1 +1,6 @@ -{} +{ + "senses":[ + "NOUN", + "VERB" + ] +} \ No newline at end of file diff --git a/tests/data/freqs.json b/tests/data/freqs.json new file mode 100644 index 0000000..cb6c225 --- /dev/null +++ b/tests/data/freqs.json @@ -0,0 +1,22 @@ +[ + [ + 1729617160722737612, + 498 + ], + [ + 5277779877049457024, + 6718 + ], + [ + 8106363108491243548, + 495 + ], + [ + 2803970341986411846, + 87 + ], + [ + 7493120824676996139, + 33985 + ] +] \ No newline at end of file From bdbfa9bfb4e3aa6012643d3d8e565ced486a27b6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 19 Oct 2019 17:51:09 +0200 Subject: [PATCH 078/297] Update model tests --- tests/test_model.py | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 9da7fb9..5f54748 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -9,6 +9,19 @@ def s2v(): return Sense2Vec().from_disk(data_path) -def test_most_similar(s2v): +def test_model_most_similar(s2v): assert "beekeepers|NOUN" in s2v - result = s2v.most_similar(["beekeepers|NOUN"]) + result = s2v.most_similar(["beekeepers|NOUN"], n=2) + assert result[0][0] == "honey_bees|NOUN" + assert result[1][0] == "Beekeepers|NOUN" + + +def test_model_other_senses(s2v): + others = s2v.get_other_senses("duck|NOUN") + assert len(others) == 1 + assert others[0] == "duck|VERB" + + +def test_model_best_sense(s2v): + assert s2v.get_best_sense("duck") == "duck|NOUN" + assert s2v.get_best_sense("honey bees") == "honey_bees|NOUN" From a69d70be8d0f06eb0dbeecd999a8089e794c1b92 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 14:15:35 +0200 Subject: [PATCH 079/297] Make sure most_similar doesn't return scores > 1.0 --- sense2vec/sense2vec.py | 2 ++ tests/test_sense2vec.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index efdd61f..11faebb 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -157,6 +157,8 @@ def most_similar( result = OrderedDict(zip(result_keys.flatten(), scores.flatten())) result = [(self.strings[key], score) for key, score in result.items() if key] result = [(key, score) for key, score in result if key not in keys] + # TODO: normalize scores properly + result = [(key, 1.0 if score > 1.0 else score) for key, score in result] return result def get_other_senses( diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 5a5f389..19f0acf 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -72,7 +72,7 @@ def test_sense2vec_most_similar(): result1 = s2v.most_similar(["x"], n=2) assert len(result1) == 2 assert result1[0][0] == "a" - # TODO: assert result1[0][1] == 1.0 + assert result1[0][1] == 1.0 assert result1[0][1] == pytest.approx(1.0) assert result1[1][0] == "b" result2 = s2v.most_similar(["a", "x"], n=2) From ec20318fe02be4efbd8a32118bc7444737944408 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 14:15:44 +0200 Subject: [PATCH 080/297] Tidy up --- sense2vec/__init__.py | 10 ---------- 
sense2vec/sense2vec.py | 1 - setup.cfg | 1 + setup.py | 2 +- 4 files changed, 2 insertions(+), 12 deletions(-) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 6365ea1..9db2b5a 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,13 +1,3 @@ -from typing import Union -from pathlib import Path - from .about import __version__ # noqa: F401 from .sense2vec import Sense2Vec # noqa: F401 from .component import Sense2VecComponent # noqa: F401 - - -def load(vectors_path: Union[Path, str]) -> Sense2Vec: - # TODO: remove this? - if not Path(vectors_path).exists(): - raise IOError(f"Can't find vectors: {vectors_path}") - return Sense2Vec().from_disk(vectors_path) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 11faebb..8692da4 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -96,7 +96,6 @@ def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): self.vectors.add(key, vector=vector) if freq is not None: self.set_freq(key, freq) - # TODO: add sense to senses? def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]: """Get the frequency count for a given key. diff --git a/setup.cfg b/setup.cfg index 1c7917e..5b31dea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -18,6 +18,7 @@ classifiers = Programming Language :: Python :: 3 Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 Topic :: Scientific/Engineering [options] diff --git a/setup.py b/setup.py index eed8126..fa6192a 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ def setup_package(): about = {} exec(f.read(), about) - setup(name="sense2vec", version=about["__version__"], packages=find_packages()) + setup(name=package_name, version=about["__version__"], packages=find_packages()) if __name__ == "__main__": From 5f3a997a9bea97b26ee16228c9cbadc92ed994d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 16:04:11 +0200 Subject: [PATCH 081/297] WIP: add hack for 0 division issue --- sense2vec/sense2vec.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 8692da4..a77b4e5 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -253,6 +253,9 @@ def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): strings_path = path / "strings.json" freqs_path = path / "freqs.json" self.vectors = Vectors().from_disk(path) + # TODO: this is a hack preventing division by 0 errors when getting + # the most similar vectors + self.vectors.data[self.vectors.data == 0] = 1e-10 self.cfg = srsly.read_json(path / "cfg") if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) From 65564e3bf68cce40f1d82d96ea4220893d33324d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 16:04:22 +0200 Subject: [PATCH 082/297] Update Prodigy recipes --- sense2vec/prodigy_recipes.py | 74 ++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 20 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 7b1f913..9c3af8d 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -1,11 +1,17 @@ import prodigy from prodigy.components.db import connect -from prodigy.util import log, split_string, set_hashes +from prodigy.util import log, split_string, set_hashes, TASK_HASH_ATTR from sense2vec import Sense2Vec import srsly import spacy +HTML_TEMPLATE = """ +{{word}} +{{sense}} +""" + + @prodigy.recipe( "sense2vec.teach", 
dataset=("Dataset to save annotations to", "positional", None, str), @@ -17,7 +23,7 @@ resume=("Resume from existing phrases dataset", "flag", "R", bool), ) def teach( - dataset, vectors_path, seeds, threshold=0.85, top_n=200, batch_size=5, resume=False + dataset, vectors_path, seeds, threshold=0.85, top_n=20, batch_size=5, resume=False ): """ Bootstrap a terminology list sense2vec. Prodigy will suggest similar terms @@ -25,23 +31,34 @@ def teach( """ log("RECIPE: Starting recipe sense2vec.teach", locals()) s2v = Sense2Vec().from_disk(vectors_path) - log("RECIPE: Loaded sense2vec", locals()) - seed_keys = [] + log("RECIPE: Loaded sense2vec vectors", vectors_path) + accept_keys = [] + reject_keys = [] + seen = set(accept_keys) seed_tasks = [] for seed in seeds: - best_word, best_sense = s2v.get_best_sense(seed) - if best_sense is None: + key = s2v.get_best_sense(seed) + if key is None: raise ValueError(f"Can't find seed term '{seed}' in vectors") - key = s2v.make_key(best_word, best_sense) - seed_keys.append(key) - task = {"text": key, "word": best_word, "sense": best_sense, "answer": "accept"} + accept_keys.append(key) + best_word, best_sense = s2v.split_key(key) + task = { + "text": key, + "word": best_word, + "sense": best_sense, + "meta": {"score": 1.0}, + "answer": "accept", + } seed_tasks.append(set_hashes(task)) - print(f"Starting with seed keys: {seed_keys}") + print(f"Starting with seed keys: {accept_keys}") DB = connect() - DB.add_examples(seed_tasks, datasets=[dataset]) - accept_keys = seed_keys - reject_keys = [] - seen = set(accept_keys) + if dataset not in DB: + DB.add_dataset(dataset) + dataset_hashes = DB.get_task_hashes(dataset) + DB.add_examples( + [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes], + datasets=[dataset], + ) if resume: prev = DB.get_dataset(dataset) @@ -68,13 +85,19 @@ def get_stream(): presenting examples to the user with a similarity above the threshold parameter.""" while True: - log(f"RECIPE: Getting {top_n} similar phrases") + log( + f"RECIPE: Looking for {top_n} phrases most similar to " + f"{len(accept_keys)} accepted keys" + ) most_similar = s2v.most_similar(accept_keys, n=top_n) + log(f"RECIPE: Found {len(most_similar)} most similar phrases") for key, score in most_similar: if key not in seen and score > threshold: seen.add(key) word, sense = s2v.split_key(key) - meta = {"score": score} + # Make sure the score is a regular float, otherwise server + # may fail when trying to serialize it to/from JSON + meta = {"score": float(score)} yield {"text": key, "word": word, "sense": sense, "meta": meta} stream = get_stream() @@ -84,7 +107,7 @@ def get_stream(): "dataset": dataset, "stream": stream, "update": update, - "config": {"batch_size": batch_size, "html_template": "{{word}} ({{sense}})"}, + "config": {"batch_size": batch_size, "html_template": HTML_TEMPLATE}, } @@ -94,8 +117,12 @@ def get_stream(): spacy_model=("spaCy model for tokenization", "positional", None, str), label=("Label to apply to all patterns", "positional", None, str), output_file=("Optional output file. Defaults to stdout", "option", "o", str), + case_sensitive=("Make patterns case-sensitive", "flag", "CS", bool), + dry=("Perform a dry run and don't output anything", "flag", "D", bool), ) -def to_patterns(dataset, spacy_model, label, output_file="-"): +def to_patterns( + dataset, spacy_model, label, output_file="-", case_sensitive=False, dry=False +): """ Convert a list of seed phrases to a list of match patterns that can be used with ner.match. 
If no output file is specified, each pattern is printed. @@ -107,9 +134,16 @@ def to_patterns(dataset, spacy_model, label, output_file="-"): nlp = spacy.load(spacy_model) log(f"RECIPE: Loaded spaCy model '{spacy_model}'") DB = connect() + if dataset not in DB: + raise ValueError(f"Can't find dataset '{dataset}'") examples = DB.get_dataset(dataset) terms = [eg["text"] for eg in examples if eg["answer"] == "accept"] - patterns = [{"lower": t.lower_ for t in nlp.make_doc(term)} for term in terms] + if case_sensitive: + patterns = [{"text": t.text for t in nlp.make_doc(term)} for term in terms] + else: + patterns = [{"lower": t.lower_ for t in nlp.make_doc(term)} for term in terms] patterns = [{"label": label, "pattern": pattern} for pattern in patterns] log(f"RECIPE: Generated {len(patterns)} patterns") - srsly.write_jsonl(output_file, patterns) + if not dry: + srsly.write_jsonl(output_file, patterns) + return patterns From 584b4331e9979b752e2446f82f7b52c9f5a78119 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 17:42:13 +0200 Subject: [PATCH 083/297] Add __setitem__ for consistency --- sense2vec/sense2vec.py | 12 ++++++++++++ tests/test_sense2vec.py | 3 +++ 2 files changed, 15 insertions(+) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index a77b4e5..e9a6b95 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -66,6 +66,18 @@ def __getitem__(self, key: Union[str, int]) -> numpy.ndarray: if key in self.vectors: return self.vectors[key] + def __setitem__(self, key: Union[str, int], vector: numpy.ndarray): + """Set a vector for a given key. Will raise an error if the key + doesn't exist. + + key (unicode / int): The key. + vector (numpy.ndarray): The vector to set. + """ + key = self.ensure_int_key(key) + if key not in self.vectors: + raise ValueError(f"Can't find key {key} in table") + self.vectors[key] = vector + def __iter__(self): """YIELDS (tuple): String key and vector pairs in the table.""" yield from self.items() diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 19f0acf..6aaf5ca 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -19,6 +19,9 @@ def test_sense2vec_object(): s2v.add("test2", test_vector) assert "test2" in s2v assert sorted(list(s2v.keys())) == ["test", "test2"] + with pytest.raises(ValueError): + s2v["test3"] = test_vector + s2v["test2"] = test_vector def test_sense2vec_freqs(): From 682517e9f8de1652ffb7030aabb1ce06d175ac2c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 17:42:20 +0200 Subject: [PATCH 084/297] Update docstrings and docs --- README.md | 377 ++++++++++++++++++++++++++++------------- sense2vec/sense2vec.py | 15 +- 2 files changed, 271 insertions(+), 121 deletions(-) diff --git a/README.md b/README.md index 0a4d583..686eaf6 100644 --- a/README.md +++ b/README.md @@ -30,10 +30,10 @@ query = "natural_language_processing|NOUN" assert query in s2v vector = s2v[query] freq = s2v.get_freq(query) -most_similar = s2v.most_similar(query, 3) -# [('natural_language_processing|NOUN', 1.0), -# ('machine_learning|NOUN', 0.8986966609954834), -# ('computer_vision|NOUN', 0.8636297583580017)] +most_similar = s2v.most_similar(query, n=3) +# [('machine_learning|NOUN', 0.8986967), +# ('computer_vision|NOUN', 0.8636297), +# ('deep_learning|NOUN', 0.8573361)] ``` ### Usage as a spaCy pipeline component @@ -51,9 +51,9 @@ assert doc[3:6].text == "natural language processing" freq = doc[3:6]._.s2v_freq vector = doc[3:6]._.s2v_vec most_similar = doc[3:6]._.s2v_most_similar(3) -# 
[(('natural language processing', 'NOUN'), 1.0), -# (('machine learning', 'NOUN'), 0.8986966609954834), -# (('computer vision', 'NOUN'), 0.8636297583580017)] +# [(('machine learning', 'NOUN'), 0.8986967), +# (('computer vision', 'NOUN'), 0.8636297), +# (('deep learning', 'NOUN'), 0.8573361)] ``` ## Installation & Setup @@ -71,7 +71,7 @@ extracted data directory: ```python from sense2vec import Sense2Vec -s2v = Sense2Vec.from_disk("/path/to/reddit_vectors-1.1.0") +s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") ``` ## Usage @@ -131,14 +131,23 @@ for ent in doc.ents: ### Available attributes -The following attributes are available via the `._` property – for example -`token._.in_s2v`: +The following extension attributes are exposed on the `Doc` object via the `._` +property: + +| Name | Attribute Type | Type | Description | +| ------------- | -------------- | ---- | ----------------------------------------------------------------------------------- | +| `s2v_phrases` | property | list | All sense2vec-compatible phrases in the given `Doc` (noun phrases, named entities). | + +The following attributes are available via the `._` property of `Token` and +`Span` objects – for example `token._.in_s2v`: | Name | Attribute Type | Type | Description | | ------------------ | -------------- | ------------------ | ---------------------------------------------------------------------------------- | | `in_s2v` | property | bool | Whether a key exists in the vector map. | -| `s2v_freq` | property | int | The frequency of the given key. | +| `s2v_key` | property | unicode | The sense2vec key of the given object, e.g. `"duck|NOUN"`. | | `s2v_vec` | property | `ndarray[float32]` | The vector of the given key. | +| `s2v_freq` | property | int | The frequency of the given key. | +| `s2v_other_senses` | property | list | Available other senses, e.g. `"duck|VERB"` for `"duck|NOUN"`. | | `s2v_most_similar` | method | list | Get the `n` most similar terms. Returns a list of `((word, sense), score)` tuples. | > ⚠️ **A note on span attributes:** Under the hood, entities in `doc.ents` are @@ -151,155 +160,300 @@ The following attributes are available via the `._` property – for example ### Standalone usage -To use only the `sense2vec` library, you can import the package and then call -its `load()` method to load in the vectors. +You can also use the underlying `Sense2Vec` class directly and load in the +vectors using the `from_disk` method. See below for the available API methods. + +```python +from sense2vec import Sense2Vec +s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") +most_similar = s2v.most_similar("natural_language_processing|NOUN", n=10) +``` + +> ⚠️ **Important note:** To look up entries in the vectors table, the keys need +> to follow the scheme of `phrase_text|SENSE` (note the `_` instead of spaces +> and the `|` before the tag or label) – for example, `machine_learning|NOUN`. +> Also note that the underlying vector table is case-sensitive. + +## 🎛 API + +### method `Sense2Vec.__init__` + +Initialize the `Sense2Vec` object. + +| Argument | Type | Description | +| ----------- | --------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | +| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. 
| +| `make_key` | callable | Optional custom function that takes a word and sense string and creates the key (e.g. `"some_word|sense"`). | +| `split_key` | callable | Optional custom function that takes a key and returns the word and sense (e.g. `("some word", "sense")`). | +| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | +| **RETURNS** | `Sense2Vec` | The newly constructed object. | + +```python +s2v = Sense2Vec(shape=(300, 128), senses=["VERB", "NOUN"]) +``` + +### method `Sense2Vec.__len__` + +The number of rows in the vectors table. + +| Argument | Type | Description | +| ----------- | ---- | ---------------------------------------- | +| **RETURNS** | int | The number of rows in the vectors table. | + +```python +s2v = Sense2Vec(shape=(300, 128)) +assert len(s2v) == 300 +``` + +### method `Sense2Vec.__contains__` + +Check if a key is in the vectors table. + +| Argument | Type | Description | +| ----------- | ------------- | -------------------------------- | +| `key` | unicode / int | The key to look up. | +| **RETURNS** | bool | Whether the key is in the table. | ```python -import sense2vec -s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") +s2v = Sense2Vec(shape=(10, 4)) +s2v.add("avocado|NOUN", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) +assert "avocado|NOUN" in s2v +assert "avocado|VERB" not in s2v ``` -`sense2vec.load` returns an instance of the `VectorMap` class, which you can -interact with via the following methods. +### method `Sense2Vec.__getitem__` + +Retrieve a vector for a given key. -> ⚠️ **Important note:** When interacting with the `VectorMap` directly, the -> keys need to follow the scheme of `phrase_text|SENSE` (note the `_` instead of -> spaces and the `|` before the tag or label) – for example, -> `machine_learning|NOUN`. Also note that the underlying vector table is -> case-sensitive. +| Argument | Type | Description | +| ----------- | --------------- | ------------------- | +| `key` | unicode / int | The key to look up. | +| **RETURNS** | `numpy.ndarray` | The vector. | + +```python +vec = s2v["avocado|NOUN"] +``` -#### method `VectorMap.__len__` +### method `Sense2Vec.__setitem__` -The total number of entries in the map. +Set a vector for a given key. Will raise an error if the key doesn't exist. To +add a new entry, use `Sense2Vec.add`. -| Argument | Type | Description | -| ----------- | ---- | --------------------------------- | -| **RETURNS** | int | The number of entries in the map. | +| Argument | Type | Description | +| -------- | --------------- | ------------------ | +| `key` | unicode / int | The key. | +| `vector` | `numpy.ndarray` | The vector to set. | ```python -s2v = sense2vec.load("/path/to/reddit_vectors-1.1.0") -assert len(s2v) == 1195261 +vec = s2v["avocado|NOUN"] +s2v["avacado|NOUN"] = vec ``` -#### method `VectorMap.__contains__` +### method `Sense2Vec.add` -Check whether the `VectorMap` has a given key. Keys consist of the word string, -a pipe and the "sense", i.e. the part-of-speech tag or entity label. For -example: `'duck|NOUN'` or `'duck|VERB'`. See the section on "Senses" below for -more details. Also note that the underlying vector table is **case-sensitive**. +Add a new vector to the table. -| Argument | Type | Description | -| ----------- | ------- | ----------------------------------- | -| `string` | unicode | The key to check. | -| **RETURNS** | bool | Whether the key is part of the map. 
| +| Argument | Type | Description | +| -------- | --------------- | ------------------------------------------------------------ | +| `key` | unicode / int | The key to add. | +| `vector` | `numpy.ndarray` | The vector to add. | +| `freq` | int | Optional frequency count. Used to find best matching senses. | ```python -assert "duck|NOUN" in s2v -assert "duck|VERB" in s2v -assert "dkdksl|VERB" not in s2v +vec = s2v["avocado|NOUN"] +s2v.add("🥑|NOUN", vec, 1234) ``` -#### method `VectorMap.__getitem__` +### method `Sense2Vec.get_freq` -Retrieve a `(frequency, vector)` tuple from the vector map. The frequency is an -integer, the vector a `numpy.ndarray(dtype='float32')`. If the key is not found, -a `KeyError` is raised. +Get the frequency count for a given key. -| Argument | Type | Description | -| ----------- | ------- | ------------------------------------------------- | -| `string` | unicode | The key to retrieve the frequency and vector for. | -| **RETURNS** | tuple | The `(frequency, vector)` tuple. | +| Argument | Type | Description | +| ----------- | ------------- | ------------------------------------------------- | +| `key` | unicode / int | The key to look up. | +| `default` | - | Default value to return if no frequency is found. | +| **RETURNS** | int | The frequency count. | ```python -freq, vector = s2v["duck|NOUN"] +vec = s2v["avocado|NOUN"] +s2v.add("🥑|NOUN", vec, 1234) +assert s2v.get_freq("🥑|NOUN") == 1234 ``` -#### method `VectorMap.__setitem__` +### method `Sense2Vec.set_freq` -Assign a `(frequency, vector)` tuple to the vector map. The frequency should be -an integer, the vector a `numpy.ndarray(dtype='float32')`. +Set a frequency count for a given key. -| Argument | Type | Description | -| -------- | ------- | ---------------------------------------------- | -| `key` | unicode | The key to assign the frequency and vector to. | -| `value` | tuple | The `(frequency, vector)` tuple to assign. | +| Argument | Type | Description | +| -------- | ------------- | ----------------------------- | +| `key` | unicode / int | The key to set the count for. | +| `freq` | int | The frequency count. | ```python -freq, vector = s2v["avocado|NOUN"] -s2v["🥑|NOUN"] = (freq, vector) +s2v.set_freq("avocado|NOUN", 104294) ``` -#### method `VectorMap.__iter__`, `VectorMap.keys` +### method `Sense2Vec.__iter__`, `Sense2Vec.items` -Iterate over the keys in the map, in order of insertion. +Iterate over the entries in the vectors table. -| Argument | Type | Description | -| ---------- | ------- | -------------------- | -| **YIELDS** | unicode | The keys in the map. | +| Argument | Type | Description | +| ---------- | ----- | ----------------------------------------- | +| **YIELDS** | tuple | String key and vector pairs in the table. | -#### method `VectorMap.values` +```python +for key, vec in s2v: + print(key, vec) -Iterate over the values in the map, in order of insertion and yield -`(frequency, vector)` tuples from the vector map. The frequency is an integer, -the vector a `numpy.ndarray(dtype='float32')` +for key, vec in s2v.items(): + print(key, vec) +``` -| Argument | Type | Description | -| ---------- | ----- | ---------------------- | -| **YIELDS** | tuple | The values in the map. | +### method `Sense2Vec.keys` -#### method `VectorMap.items` +Iterate over the keys in the table. -Iterate over the items in the map, in order of insertion and yield -`(key, (frequency, vector))` tuples from the vector map. 
The frequency is an -integer, the vector a `numpy.ndarray(dtype='float32')` +| Argument | Type | Description | +| ---------- | ------- | ----------------------------- | +| **YIELDS** | unicode | The string keys in the table. | -| Argument | Type | Description | -| ---------- | ----- | --------------------- | -| **YIELDS** | tuple | The items in the map. | +```python +all_keys = list(s2v.keys()) +``` -#### method `VectorMap.most_similar` +### method `Sense2Vec.values` -Find the keys of the `n` most similar entries, given a vector. Note that the -_most_ similar entry with a score of `1.0` will be the key of the query vector -itself. +Iterate over the vectors in the table. -| Argument | Type | Description | -| ----------- | -------------------------------- | -------------------------------------------------- | -| `vector` | `numpy.ndarray(dtype='float32')` | The vector to compare to. | -| `n` | int | The number of entries to return. Defaults to `10`. | -| **RETURNS** | tuple | A `(words, scores)` tuple. | +| Argument | Type | Description | +| ---------- | --------------- | ------------------------- | +| **YIELDS** | `numpy.ndarray` | The vectors in the table. | ```python -freq, vector = s2v["avocado|NOUN"] -words, scores = s2v.most_similar(vector, n=3) -for word, score in zip(words, scores): - print(word, score) -# avocado|NOUN 1.0 -# avacado|NOUN 0.970944344997406 -# spinach|NOUN 0.962776780128479 +all_vecs = list(s2v.values()) ``` -#### method `VectorMap.save` +### property `Sense2Vec.senses` -Serialize the model to a directory. This will export three files to the output -directory: a `strings.json` containing the keys in insertion order, a -`freqs.json` containing the frequencies and a `vectors.bin` containing the -vectors. +The available senses in the table, e.g. `"NOUN"` or `"VERB"` (added at +initialization). -| Argument | Type | Description | -| ---------- | ------- | --------------------------------- | -| `data_dir` | unicode | The path to the output directory. | +| Argument | Type | Description | +| ----------- | ---- | --------------------- | +| **RETURNS** | list | The available senses. | -#### method `VectorMap.load` +```python +s2v = Sense2Vec(senses=["VERB", "NOUN"]) +assert "VERB" in s2v.senses +``` + +### method `Sense2Vec.most_similar` -Load a model from a directory. Expects three files in the directory (see -`VectorMap.save` for details). +Get the most similar entries in the table. -| Argument | Type | Description | -| ---------- | ------- | -------------------------------- | -| `data_dir` | unicode | The path to load the model from. | +| Argument | Type | Description | +| ------------ | ------------------------- | ------------------------------------------------------- | +| `keys` | unicode / int / iterable  | The string or integer key(s) to compare to. | +| `n` | int | The number of similar keys to return. Defaults to `10`. | +| `batch_size` | int | The batch size to use. Defaults to `16`. | +| **RETURNS** | list | The `(key, score)` tuples of the most similar vectors. | -## Senses +```python +most_similar = s2v.most_similar("natural_language_processing|NOUN", n=3) +# [('machine_learning|NOUN', 0.8986967), +# ('computer_vision|NOUN', 0.8636297), +# ('deep_learning|NOUN', 0.8573361)] +``` + +### method `Sense2Vec.get_other_senses` + +Find other entries for the same word with a different sense, e.g. `"duck|VERB"` +for `"duck|NOUN"`. 
+ +| Argument | Type | Description | +| ------------- | ------------- | ----------------------------------------------------------------- | +| `key` | unicode / int | The key to check. | +| `ignore_case` | bool | Check for uppercase, lowercase and titlecase. Defaults to `True`. | +| **RETURNS** | list | The string keys of other entries with different senses. | + +```python +other_senses = s2v.get_other_senses("duck|NOUN") +# ['duck|VERB', 'Duck|ORG', 'Duck|VERB', 'Duck|PERSON', 'Duck|ADJ'] +``` + +### method `Sense2Vec.get_best_sense` + +Find the best-matching sense for a given word based on the available senses and +frequency counts. Returns `None` if no match is found. + +| Argument | Type | Description | +| ------------- | ------- | ----------------------------------------------------------------- | +| `word` | unicode | The word to check. | +| `ignore_case` | bool | Check for uppercase, lowercase and titlecase. Defaults to `True`. | +| **RETURNS** | unicode | The best-matching key or None. | + +```python +assert s2v.get_best_sense("duck") == "duck|NOUN" +``` + +### method `Sense2Vec.to_bytes` + +Serialize a `Sense2Vec` object to a bytestring. + +| Argument | Type | Description | +| ----------- | ----- | ----------------------------------------- | +| `exclude` | list | Names of serialization fields to exclude. | +| **RETURNS** | bytes | The serialized `Sense2Vec` object. | + +```python +s2v_bytes = s2v.to_bytes() +``` + +### method `Sense2Vec.from_bytes` + +Load a `Sense2Vec` object from a bytestring. + +| Argument | Type | Description | +| ------------ | ----------- | ----------------------------------------- | +| `bytes_data` | bytes | The data to load. | +| `exclude` | list | Names of serialization fields to exclude. | +| **RETURNS** | `Sense2Vec` | The loaded object. | + +```python +s2v_bytes = s2v.to_bytes() +new_s2v = Sense2Vec().from_bytes(s2v_bytes) +``` + +### method `Sense2Vec.to_disk` + +Serialize a `Sense2Vec` object to a directory. + +| Argument | Type | Description | +| --------- | ---------------- | ----------------------------------------- | +| `path` | unicode / `Path` | The path. | +| `exclude` | list | Names of serialization fields to exclude. | + +```python +s2v.to_disk("/path/to/sense2vec") +``` + +### method `Sense2Vec.from_disk` + +Load a `Sense2Vec` object from a directory. + +| Argument | Type | Description | +| ----------- | ---------------- | ----------------------------------------- | +| `path` | unicode / `Path` | The path. to load from | +| `exclude` | list | Names of serialization fields to exclude. | +| **RETURNS** | `Sense2Vec` | The loaded object. | + +```python +s2v.to_disk("/path/to/sense2vec") +new_s2v = Sense2Vec().from_disk("/path/to/sense2vec") +``` + +## Pre-trained vectors The pre-trained Reddit vectors support the following "senses", either part-of-speech tags or entity labels. For more details, see spaCy's @@ -336,8 +490,3 @@ part-of-speech tags or entity labels. For more details, see spaCy's | `EVENT` | Named hurricanes, battles, wars, sports events, etc. | | `WORK_OF_ART` | Titles of books, songs, etc. | | `LANGUAGE` | Any named language. | - -## Training a sense2vec model - -> **🚧 Under construction:** We're currently updating the training scripts for -> spaCy v2.x. diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index e9a6b95..5076778 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -24,9 +24,9 @@ def __init__( strings (StringStore): Optional string store. Will be created if it doesn't exist. 
make_key (callable): Optional custom function that takes a word and - sense string and creates the key (e.g. "word|sense"). + sense string and creates the key (e.g. "some_word|sense"). split_key (callable): Optional custom function that takes a key and - returns the word and sense (e.g. ("word", "sense")). + returns the word and sense (e.g. ("some word", "sense")). senses (list): Optional list of all available senses. Used in methods that generate the best sense or other senses. RETURNS (Sense2Vec): The newly constructed object. @@ -88,7 +88,7 @@ def items(self): yield self.strings[key], value def keys(self): - """YIELDS (unicode): The keys in the table.""" + """YIELDS (unicode): The string keys in the table.""" for key in self.vectors.keys(): yield self.strings[key] @@ -144,11 +144,12 @@ def most_similar( ) -> List[Tuple[str, float]]: """Get the most similar entries in the table. - key (iterable): The string or integer keys to compare to. + keys (unicode / int / iterable): The string or integer key(s) to compare to. n (int): The number of similar keys to return. batch_size (int): The batch size to use. - RETURNS (list): The keys of the most similar vectors. + RETURNS (list): The (key, score) tuples of the most similar vectors. """ + # TODO: this isn't always returning the correct number? if isinstance(keys, (str, int)): keys = [keys] # Always ask for more because we'll always get the keys themselves @@ -180,7 +181,7 @@ def get_other_senses( key (unicode / int): The key to check. ignore_case (bool): Check for uppercase, lowercase and titlecase. - RETURNS (list): Other entries with different senses. + RETURNS (list): The string keys of other entries with different senses. """ result = [] key = key if isinstance(key, str) else self.strings[key] @@ -199,7 +200,7 @@ def get_best_sense(self, word: str, ignore_case: bool = True) -> Union[str, None word (unicode): The word to check. ignore_case (bool): Check for uppercase, lowercase and titlecase. - RETURNS (unicode): The best-matching sense or None. + RETURNS (unicode): The best-matching key or None. """ if not self.senses: return None From 9ae10d34b21dc26d8d0889307248f73a8045e6d4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 17:51:10 +0200 Subject: [PATCH 085/297] Update README.md [ci skip] --- README.md | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 686eaf6..2d39e1f 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,20 @@ simple Python implementation for loading and querying sense2vec models. [![pypi Version](https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/sense2vec/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) -## Usage Examples +## ✨ Features + +- Query **context-sensitive vectors** for **multi-word phrases** based on + part-of-speech tags and entity labels. +- spaCy **pipeline component** and **extension attributes**. +- Fully **serializable** so you can easily ship your sense2vec vectors with your + spaCy model packages. +- **Train your own vectors** using a pre-trained spaCy model and raw text of + your choice. +- [Prodigy](https://prodi.gy) annotation recipes for creating lists of similar + multi-word phrases and converting them to match patterns, e.g. for rule-based + NER or to boostrap NER annotation ([details & examples](#prodigy-recipes)). 
+ +## 🚀 Usage Examples ### Standalone usage @@ -56,7 +69,7 @@ most_similar = doc[3:6]._.s2v_most_similar(3) # (('deep learning', 'NOUN'), 0.8573361)] ``` -## Installation & Setup +## ⏳ Installation & Setup sense2vec releases are available on pip: @@ -74,9 +87,9 @@ from sense2vec import Sense2Vec s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") ``` -## Usage +## 👩‍💻 Usage -## Usage with spaCy v2.x +### Usage with spaCy v2.x The easiest way to use the library and vectors is to plug it into your spaCy pipeline. Note that `sense2vec` doesn't depend on spaCy, so you'll have to @@ -129,7 +142,7 @@ for ent in doc.ents: most_similar = ent._.s2v_most_similar(3) ``` -### Available attributes +#### Available attributes The following extension attributes are exposed on the `Doc` object via the `._` property: @@ -453,6 +466,10 @@ s2v.to_disk("/path/to/sense2vec") new_s2v = Sense2Vec().from_disk("/path/to/sense2vec") ``` +## 🍳 Prodigy recipes + +TODO + ## Pre-trained vectors The pre-trained Reddit vectors support the following "senses", either From f247a8d76c933ce7f38f94533bc73142268df2b3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 18:08:46 +0200 Subject: [PATCH 086/297] Update recipes --- sense2vec/prodigy_recipes.py | 39 +++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 9c3af8d..dc79111 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -16,24 +16,32 @@ "sense2vec.teach", dataset=("Dataset to save annotations to", "positional", None, str), vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), - seeds=("One or more comma-separated seed terms", "option", "se", split_string), + seeds=("One or more comma-separated seed phrases", "option", "se", split_string), threshold=("Similarity threshold for sense2vec", "option", "t", float), - top_n=("Only get the top n results for each accepted term", "option", "n", int), + n_similar=("Number of similar items to get at once", "option", "n", int), batch_size=("Batch size for submitting annotations", "option", "bs", int), resume=("Resume from existing phrases dataset", "flag", "R", bool), ) def teach( - dataset, vectors_path, seeds, threshold=0.85, top_n=20, batch_size=5, resume=False + dataset, + vectors_path, + seeds, + threshold=0.85, + n_similar=20, + batch_size=5, + resume=False, ): """ - Bootstrap a terminology list sense2vec. Prodigy will suggest similar terms - based on the the most similar phrases from sense2vec. + Bootstrap a terminology list using sense2vec. Prodigy will suggest similar + terms based on the the most similar phrases from sense2vec, and the + suggestions will be adjusted as you annotate and accept similar phrases. For + each seed term, the best matching sense according to the sense2vec vectors + will be used. 
""" log("RECIPE: Starting recipe sense2vec.teach", locals()) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) accept_keys = [] - reject_keys = [] seen = set(accept_keys) seed_tasks = [] for seed in seeds: @@ -63,11 +71,8 @@ def teach( if resume: prev = DB.get_dataset(dataset) prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"] - prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"] accept_keys += prev_accept - reject_keys += prev_reject seen.update(set(accept_keys)) - seen.update(set(reject_keys)) log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}") def update(answers): @@ -77,8 +82,6 @@ def update(answers): phrase = answer["text"] if answer["answer"] == "accept": accept_keys.append(phrase) - elif answer["answer"] == "reject": - reject_keys.append(phrase) def get_stream(): """Continue querying sense2vec whenever we get a new phrase and @@ -86,10 +89,10 @@ def get_stream(): parameter.""" while True: log( - f"RECIPE: Looking for {top_n} phrases most similar to " + f"RECIPE: Looking for {n_similar} phrases most similar to " f"{len(accept_keys)} accepted keys" ) - most_similar = s2v.most_similar(accept_keys, n=top_n) + most_similar = s2v.most_similar(accept_keys, n=n_similar) log(f"RECIPE: Found {len(most_similar)} most similar phrases") for key, score in most_similar: if key not in seen and score > threshold: @@ -113,7 +116,7 @@ def get_stream(): @prodigy.recipe( "sense2vec.to-patterns", - dataset=("Dataset to save annotations to", "positional", None, str), + dataset=("Phrase dataset to convert", "positional", None, str), spacy_model=("spaCy model for tokenization", "positional", None, str), label=("Label to apply to all patterns", "positional", None, str), output_file=("Optional output file. Defaults to stdout", "option", "o", str), @@ -124,10 +127,10 @@ def to_patterns( dataset, spacy_model, label, output_file="-", case_sensitive=False, dry=False ): """ - Convert a list of seed phrases to a list of match patterns that can be used - with ner.match. If no output file is specified, each pattern is printed. - The examples are tokenized to make sure that multi-token terms are - represented correctly, e.g.: + Convert a list of seed phrases to a list of token-based match patterns that + can be used with spaCy's EntityRuler or recipes like ner.match. If no output + file is specified, the patterns are written to stdout. The examples are + tokenized so that multi-token terms are represented correctly, e.g.: {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]} """ log("RECIPE: Starting recipe sense2vec.to-patterns", locals()) From f6c319db64374b7a128da72b8bf0b4205ea5eef4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 20 Oct 2019 18:08:54 +0200 Subject: [PATCH 087/297] Update README.md [ci skip] --- README.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 69 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2d39e1f..4fd1d40 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ simple Python implementation for loading and querying sense2vec models. - spaCy **pipeline component** and **extension attributes**. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. -- **Train your own vectors** using a pre-trained spaCy model and raw text of - your choice. +- **Train your own vectors** using a pretrained spaCy model and raw text of your + choice. 
- [Prodigy](https://prodi.gy) annotation recipes for creating lists of similar multi-word phrases and converting them to match patterns, e.g. for rule-based NER or to boostrap NER annotation ([details & examples](#prodigy-recipes)). @@ -468,11 +468,75 @@ new_s2v = Sense2Vec().from_disk("/path/to/sense2vec") ## 🍳 Prodigy recipes -TODO +This package also seamlessly integrates with the [Prodigy](https://prodi.gy) +annotation tool and exposes recipes for using sense2vec vectors to quickly +generate lists of multi-word phrases and bootstrap NER annotations. To use a +recipe, `sense2vec` needs to be installed in the same environment as Prodigy. +The following recipes are available: -## Pre-trained vectors +### recipe `sense2vec.teach` -The pre-trained Reddit vectors support the following "senses", either +Bootstrap a terminology list using sense2vec. Prodigy will suggest similar terms +based on the the most similar phrases from sense2vec, and the suggestions will +be adjusted as you annotate and accept similar phrases. For each seed term, the +best matching sense according to the sense2vec vectors will be used. + +```bash +prodigy sense2vec.teach [dataset] [vectors_path] [--seeds] [--threshold] +[--n-similar] [--batch-size] [--resume] +``` + +| Argument | Type | Description | +| -------------------- | ---------- | ----------------------------------------- | +| `dataset` | positional | Dataset to save annotations to. | +| `vectors_path` | positional | Path to pretrained sense2vec vectors. | +| `--seeds`, `-s` | option | One or more comma-separated seed phrases. | +| `--threshold`, `-t` | option | Similarity threshold. Defaults to `0.85`. | +| `--n-similar`, `-n` | option | Number of similar items to get at once. | +| `--batch-size`, `-b` | option | Batch size for submitting annotations. | +| `--resume`, `-R` | flag | Resume from an existing phrases dataset. | + +#### Example + +```bash +prodigy sense2vec.teach tech_phrases /path/to/reddit_vectors-1.1.0 +--seeds "natural language processing, machine learning, artificial intelligence" +``` + +### recipe `sense2vec.to-patterns` + +Convert a list of seed phrases to a list of token-based match patterns that can +be used with +[spaCy's `EntityRuler`](https://spacy.io/usage/rule-based-matching#entityruler) +or recipes like `ner.match`. If no output file is specified, the patterns are +written to stdout. The examples are tokenized so that multi-token terms are +represented correctly, e.g.: +`{"label": "SHOE_BRAND", "pattern": [{ "LOWER": "new" }, { "LOWER": "balance" }]}`. + +```bash +prodigy sense2vec.to-patterns [dataset] [spacy_model] [label] [--output-file] +[--case-sensitive] [--dry] +``` + +| Argument | Type | Description | +| ------------------------- | ---------- | -------------------------------------------- | +| `dataset` | positional | Phrase dataset to convert. | +| `spacy_model` | positional | spaCy model for tokenization. | +| `label` | positional | Label to apply to all patterns. | +| `--output-file`, `-o` | option | Optional output file. Defaults to stdout. | +| `--case-sensitive`, `-CS` | flag | Make patterns case-sensitive. | +| `--dry`, `-D` | flag | Perform a dry run and don't output anything. | + +#### Example + +```bash +prodigy sense2vec.to-patterns tech_phrases en_core_web_sm TECHNOLOGY +--output-file /path/to/patterns.jsonl +``` + +## Pretrained vectors + +The pretrained Reddit vectors support the following "senses", either part-of-speech tags or entity labels. 
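The patterns file written by `sense2vec.to-patterns` is newline-delimited JSON, so it can be loaded straight into spaCy's `EntityRuler`. A sketch, assuming the hypothetical `/path/to/patterns.jsonl` produced in the example above:

```python
import spacy
import srsly
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp)
ruler.add_patterns(list(srsly.read_jsonl("/path/to/patterns.jsonl")))
# Add the ruler before the statistical NER so its matches take precedence
nlp.add_pipe(ruler, before="ner")

doc = nlp("I just bought a pair of New Balance sneakers.")
print([(ent.text, ent.label_) for ent in doc.ents])
```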
For more details, see spaCy's [annotation scheme overview](https://spacy.io/api/annotation). From 888d4a9f032eea791d5648065e143130b61a1618 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 21 Oct 2019 15:24:05 +0200 Subject: [PATCH 088/297] Update pytest requirement --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 8968295..800e178 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,4 +2,4 @@ spacy>=2.1.0,<2.3.0 numpy>=1.15.0 srsly>=0.1.0 # Development requirements -pytest>=4.1.0,<4.2.0 +pytest>=5.2.0,<6.0.0 From e878d77f498bb515aea46cfc06e94a43b64578ef Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Oct 2019 17:23:25 +0200 Subject: [PATCH 089/297] Add vectors_name --- README.md | 17 +++++++++-------- sense2vec/sense2vec.py | 4 +++- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 4fd1d40..0e6cbea 100644 --- a/README.md +++ b/README.md @@ -193,14 +193,15 @@ most_similar = s2v.most_similar("natural_language_processing|NOUN", n=10) Initialize the `Sense2Vec` object. -| Argument | Type | Description | -| ----------- | --------------------------- | ----------------------------------------------------------------------------------------------------------- | -| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | -| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. | -| `make_key` | callable | Optional custom function that takes a word and sense string and creates the key (e.g. `"some_word|sense"`). | -| `split_key` | callable | Optional custom function that takes a key and returns the word and sense (e.g. `("some word", "sense")`). | -| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | -| **RETURNS** | `Sense2Vec` | The newly constructed object. | +| Argument | Type | Description | +| -------------- | --------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | +| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. | +| `make_key` | callable | Optional custom function that takes a word and sense string and creates the key (e.g. `"some_word|sense"`). | +| `split_key` | callable | Optional custom function that takes a key and returns the word and sense (e.g. `("some word", "sense")`). | +| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | +| `vectors_name` | unicode | Optional name to assign to the `Vectors` table, to prevent clashes. Defaults to `"sense2vec"`. | +| **RETURNS** | `Sense2Vec` | The newly constructed object. | ```python s2v = Sense2Vec(shape=(300, 128), senses=["VERB", "NOUN"]) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 5076778..5e92ad2 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -17,6 +17,7 @@ def __init__( make_key: Callable[[str, str], str] = make_key, split_key: Callable[[str], Tuple[str, str]] = split_key, senses: List[str] = [], + vectors_name: str = "sense2vec", ): """Initialize the Sense2Vec object. @@ -29,11 +30,12 @@ def __init__( returns the word and sense (e.g. ("some word", "sense")). senses (list): Optional list of all available senses. 
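The `make_key` / `split_key` callables and the new `vectors_name` argument shown above are all set at construction time. A sketch of a custom key scheme that uses `#` instead of `|` (hypothetical; the two callables just have to stay consistent with each other):

```python
from sense2vec import Sense2Vec

def my_make_key(word, sense):
    return word.replace(" ", "_") + "#" + sense

def my_split_key(key):
    word, sense = key.rsplit("#", 1)
    return word.replace("_", " "), sense

s2v = Sense2Vec(
    shape=(1000, 128),
    senses=["NOUN", "VERB"],
    make_key=my_make_key,
    split_key=my_split_key,
    vectors_name="my_sense2vec",
)
```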
Used in methods that generate the best sense or other senses. + vectors_name (unicode): Optional name to assign to the Vectors object. RETURNS (Sense2Vec): The newly constructed object. """ self.make_key = make_key self.split_key = split_key - self.vectors = Vectors(shape=shape) + self.vectors = Vectors(shape=shape, name=vectors_name) self.strings = StringStore() if strings is None else strings self.freqs: Dict[int, int] = {} self.cfg = {"senses": senses} From d57a415b139c46a551b52be4b587e94731f2e344 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Oct 2019 17:27:56 +0200 Subject: [PATCH 090/297] Make None return value more explicit --- README.md | 10 +++++----- sense2vec/sense2vec.py | 6 ++++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 0e6cbea..5a6d2a3 100644 --- a/README.md +++ b/README.md @@ -238,12 +238,12 @@ assert "avocado|VERB" not in s2v ### method `Sense2Vec.__getitem__` -Retrieve a vector for a given key. +Retrieve a vector for a given key. Returns None if the key is not in the table. -| Argument | Type | Description | -| ----------- | --------------- | ------------------- | -| `key` | unicode / int | The key to look up. | -| **RETURNS** | `numpy.ndarray` | The vector. | +| Argument | Type | Description | +| ----------- | --------------- | --------------------- | +| `key` | unicode / int | The key to look up. | +| **RETURNS** | `numpy.ndarray` | The vector or `None`. | ```python vec = s2v["avocado|NOUN"] diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 5e92ad2..a5c7910 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -58,8 +58,9 @@ def __contains__(self, key: Union[str, int]) -> bool: key = self.ensure_int_key(key) return key in self.vectors - def __getitem__(self, key: Union[str, int]) -> numpy.ndarray: - """Retrieve a vector for a given key. + def __getitem__(self, key: Union[str, int]) -> Union[numpy.ndarray, None]: + """Retrieve a vector for a given key. Returns None if the key is not + in the table. key (unicode / int): The key to look up. RETURNS (numpy.ndarray): The vector. @@ -67,6 +68,7 @@ def __getitem__(self, key: Union[str, int]) -> numpy.ndarray: key = self.ensure_int_key(key) if key in self.vectors: return self.vectors[key] + return None def __setitem__(self, key: Union[str, int], vector: numpy.ndarray): """Set a vector for a given key. Will raise an error if the key From e35dd33b3daced41a95c8eae55338a0247933435 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Oct 2019 17:41:21 +0200 Subject: [PATCH 091/297] Average over vectors in most_similar --- README.md | 3 ++- sense2vec/sense2vec.py | 9 +++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 5a6d2a3..0e12b24 100644 --- a/README.md +++ b/README.md @@ -364,7 +364,8 @@ assert "VERB" in s2v.senses ### method `Sense2Vec.most_similar` -Get the most similar entries in the table. +Get the most similar entries in the table. If more than one key is provided, the +average of the vectors is used. | Argument | Type | Description | | ------------ | ------------------------- | ------------------------------------------------------- | diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index a5c7910..be3d16c 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -146,14 +146,14 @@ def most_similar( n: int = 10, batch_size: int = 16, ) -> List[Tuple[str, float]]: - """Get the most similar entries in the table. + """Get the most similar entries in the table. 
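With the change above, `__getitem__` returns `None` for unknown keys instead of raising, so lookups can be guarded explicitly. A minimal sketch, assuming a loaded `s2v` (the key is deliberately made up):

```python
vec = s2v["definitely_not_in_the_table|NOUN"]
if vec is None:
    print("Key not in the table")
else:
    print(vec.shape)
```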
If more than one key is + provided, the average of the vectors is used. keys (unicode / int / iterable): The string or integer key(s) to compare to. n (int): The number of similar keys to return. batch_size (int): The batch size to use. RETURNS (list): The (key, score) tuples of the most similar vectors. """ - # TODO: this isn't always returning the correct number? if isinstance(keys, (str, int)): keys = [keys] # Always ask for more because we'll always get the keys themselves @@ -166,9 +166,10 @@ def most_similar( f"Can't get {n} most similar out of {len(self.vectors)} total " f"entries in the table while excluding the {len(keys)} keys" ) - vecs = [self[key] for key in keys] + vecs = numpy.vstack([self[key] for key in keys]) + average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( - numpy.vstack(vecs), n=n_similar, batch_size=batch_size + average, n=n_similar, batch_size=batch_size ) result = OrderedDict(zip(result_keys.flatten(), scores.flatten())) result = [(self.strings[key], score) for key, score in result.items() if key] From 10dd4a659b0ca36245a37c015455160d804bdab6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Oct 2019 17:41:27 +0200 Subject: [PATCH 092/297] Pass kwargs through --- sense2vec/component.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 93a6ee8..fc8c71d 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -17,6 +17,7 @@ def __init__( vocab: Vocab = None, shape: Tuple[int, int] = (1000, 128), merge_phrases: bool = False, + **kwargs ): """Initialize the pipeline component. @@ -38,7 +39,7 @@ def from_nlp(cls, nlp: Language, **kwargs): nlp (Language): The nlp object. RETURNS (Sense2VecComponent): The newly constructed object. """ - return cls(vocab=nlp.vocab) + return cls(vocab=nlp.vocab, **kwargs) def __call__(self, doc: Doc) -> Doc: """Process a Doc object with the component. 
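The averaging behaviour added above means `most_similar` can take several keys at once and use their mean vector as the query. A sketch with hypothetical keys, assuming a loaded `s2v`:

```python
query = ["machine_learning|NOUN", "deep_learning|NOUN"]
for key, score in s2v.most_similar(query, n=5):
    print(key, round(score, 3))
```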
From 6040aacab299e57575fbc3241d108c918bd8917d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 22 Oct 2019 18:27:42 +0200 Subject: [PATCH 093/297] Remove hack (resolved in spaCy v2.2.2) --- sense2vec/sense2vec.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index be3d16c..ff98226 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -271,9 +271,6 @@ def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): strings_path = path / "strings.json" freqs_path = path / "freqs.json" self.vectors = Vectors().from_disk(path) - # TODO: this is a hack preventing division by 0 errors when getting - # the most similar vectors - self.vectors.data[self.vectors.data == 0] = 1e-10 self.cfg = srsly.read_json(path / "cfg") if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) From 15f7b797e123324fd862da63d000be056ff250b2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Oct 2019 18:28:15 +0100 Subject: [PATCH 094/297] Pin to spaCy dev version --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 800e178..b5c4346 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -spacy>=2.1.0,<2.3.0 +spacy==2.2.2.dev3 numpy>=1.15.0 srsly>=0.1.0 # Development requirements diff --git a/setup.cfg b/setup.cfg index 5b31dea..9b2f885 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ zip_safe = false include_package_data = true python_requires = >=3.6 install_requires = - spacy>=2.2.0 + spacy==2.2.2.dev3 numpy>=1.15.0 srsly>=0.1.0 From 8be5107422626a34d7afda89df1c0471b373fa4c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Oct 2019 18:28:29 +0100 Subject: [PATCH 095/297] Remove scores hack Now supported within spaCy --- sense2vec/sense2vec.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index ff98226..6c1c93c 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -174,8 +174,6 @@ def most_similar( result = OrderedDict(zip(result_keys.flatten(), scores.flatten())) result = [(self.strings[key], score) for key, score in result.items() if key] result = [(key, score) for key, score in result if key not in keys] - # TODO: normalize scores properly - result = [(key, 1.0 if score > 1.0 else score) for key, score in result] return result def get_other_senses( From 9a938add968455275781ce823d0500add2c645c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 29 Oct 2019 18:29:20 +0100 Subject: [PATCH 096/297] Update version --- sense2vec/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/about.py b/sense2vec/about.py index 6559039..e3ac18c 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1 +1 @@ -__version__ = "1.0.0a2" +__version__ = "1.0.0.dev0" From 73596fc7a2149e11fad5c68d6cac96e63fe9b928 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 14:15:39 +0100 Subject: [PATCH 097/297] Update scripts --- scripts/preprocess.py | 14 +++++---- scripts/requirements.txt | 3 +- scripts/train.py | 67 ++++++++++++++++++++++++---------------- 3 files changed, 50 insertions(+), 34 deletions(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index 76dd6e3..e88102a 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -2,8 +2,9 @@ from sense2vec.util import merge_phrases, make_spacy_key import spacy from pathlib import Path -from tqdm import tqdm import plac 
+import tqdm +from wasabi import Printer def represent_doc(doc): @@ -40,23 +41,24 @@ def main(in_file, out_file, spacy_model="en_core_web_sm", n_workers=4): efficiency yet and doesn't paralellize or batch up any of the work, so you might have to add this functionality yourself for now. """ + msg = Printer() input_path = Path(in_file) output_path = Path(out_file) if not input_path.exists(): - raise IOError(f"Can't find input file: {in_file}") + msg.fail("Can't find input file", in_file, exits=1) nlp = spacy.load(spacy_model) - print(f"Using spaCy model {spacy_model}") + msg.info(f"Using spaCy model {spacy_model}") nlp.add_pipe(merge_phrases, name="merge_sense2vec_phrases") lines_count = 0 + msg.text("Preprocessing text...") with input_path.open("r", encoding="utf8") as texts: docs = nlp.pipe(texts, n_threads=n_workers) lines = (represent_doc(doc) for doc in docs) with output_path.open("w", encoding="utf8") as f: - for line in tqdm(lines, desc="Lines", unit=""): + for line in tqdm.tqdm(lines, desc="Lines", unit=""): lines_count += 1 f.write(line) - print(f"Successfully preprocessed {lines_count} lines") - print(output_path.resolve()) + msg.good(f"Successfully preprocessed {lines_count} lines", output_path.resolve()) if __name__ == "__main__": diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 88c05a9..b3b680b 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,3 +1,4 @@ -plac>=0.9.6,<1.0.0 +plac>=0.9.6,<1.2.0 tqdm>=4.36.1,<5.0.0 gensim>=3.8.1,<4.0.0 +wasabi>=0.2.0,<1.1.0 diff --git a/scripts/train.py b/scripts/train.py index 420c9c6..31a61f1 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -3,13 +3,15 @@ from gensim.models.word2vec import PathLineSentences from sense2vec import Sense2Vec from sense2vec.util import split_key +from pathlib import Path import plac import logging +from wasabi import Printer @plac.annotations( - in_dir=("Location of input directory", "positional", None, str), - out_dir=("Location of output directory", "positional", None, str), + input_data=("Location of input directory or text file", "positional", None, str), + output_dir=("Location of output directory", "positional", None, str), n_workers=("Number of workers", "option", "n", int), size=("Dimension of the word vectors", "option", "d", int), window=("Context window size", "option", "w", int), @@ -19,8 +21,8 @@ verbose=("Log debugging info", "flag", "V", bool), ) def main( - in_dir, - out_dir, + input_data, + output_dir, negative=5, n_workers=4, window=5, @@ -29,6 +31,9 @@ def main( nr_iter=2, verbose=False, ): + msg = Printer(hide_animation=verbose) + if not Path(input_data).exists(): + msg.fail("Can't find input data (file or directory)", input_data, exits=1) if verbose: logging.basicConfig( format="%(asctime)s - %(message)s", datefmt="%H:%M:%S", level=logging.INFO @@ -42,31 +47,39 @@ def main( negative=negative, iter=nr_iter, ) - sentences = PathLineSentences(in_dir) - print("Building the vocabulary...") - w2v_model.build_vocab(sentences) - print("Training the model...") - w2v_model.train( - sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter - ) - print("Creating the sense2vec model...") + sentences = PathLineSentences(input_data) + msg.info(f"Using input data from {len(sentences.input_files)} file(s)") + with msg.loading("Building the vocabulary..."): + w2v_model.build_vocab(sentences) + msg.good("Built the vocabulary") + with msg.loading("Training the model..."): + w2v_model.train( + sentences, total_examples=w2v_model.corpus_count, 
epochs=w2v_model.iter + ) + msg.good("Trained the model") vectors = [] all_senses = set() - for string in w2v_model.wv.vocab: - vocab = w2v_model.wv.vocab[string] - freq, idx = vocab.count, vocab.index - if freq < min_count: - continue - vector = w2v_model.wv.vectors[idx] - vectors.append((string, freq, vector)) - _, sense = split_key(string) - all_senses.add(sense) - s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses) - for string, freq, vector in vectors: - s2v.add(string, vector, freq) - print("Saving the model...") - s2v.to_disk(out_dir) - print(f"Saved model to directory: {out_dir}") + with msg.loading("Creating the sense2vec model..."): + for string in w2v_model.wv.vocab: + vocab = w2v_model.wv.vocab[string] + freq, idx = vocab.count, vocab.index + if freq < min_count: + continue + vector = w2v_model.wv.vectors[idx] + vectors.append((string, freq, vector)) + _, sense = split_key(string) + all_senses.add(sense) + s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses) + for string, freq, vector in vectors: + s2v.add(string, vector, freq) + msg.good("Created the sense2vec model") + msg.info(f"{len(vectors)} vectors, {len(all_senses)} total senses") + with msg.loading("Saving the model..."): + output_path = Path(output_dir) + if not output_path.exists(): + output_path.mkdir(parents=True) + s2v.to_disk(output_path) + msg.good("Saved model to directory", output_dir) if __name__ == "__main__": From 11393069e38d797ed86237df3abccf2ebb587404 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 30 Oct 2019 14:49:18 +0100 Subject: [PATCH 098/297] Add code to merge phrasal verbs --- sense2vec/util.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sense2vec/util.py b/sense2vec/util.py index fb75f04..0184cf0 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -51,6 +51,9 @@ def make_spacy_key( sense = obj.pos_ elif isinstance(obj, Span): sense = obj.label_ or obj.root.pos_ + if obj.doc.is_parsed and isinstance(obj, Token) and sense == "VERB": + particles = [child.text for child in obj.children if is_particle(child)] + text = "_".join([obj.text] + particles) return make_key(text, sense or DEFAULT_SENSE) @@ -70,6 +73,11 @@ def get_phrases(doc: Doc) -> List[Span]: return spans +def is_particle(word, tags=("RP",), deps=("prt",)): + """Determine whether a word is a 'particle', for phrasal verb detection.""" + return (word.tag_ in tags or word.dep_ in deps) + + def merge_phrases(doc: Doc) -> Doc: """Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. From 6c59835cf40913e7c3001a8dbccb767e2181f2bd Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 30 Oct 2019 14:55:04 +0100 Subject: [PATCH 099/297] Fix entity senses --- sense2vec/util.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index fb75f04..9b9a391 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -47,6 +47,8 @@ def make_spacy_key( if obj.like_url: text = "%%URL" sense = "X" + elif obj.ent_type_: + sense = obj.ent_type_ else: sense = obj.pos_ elif isinstance(obj, Span): @@ -62,11 +64,16 @@ def get_phrases(doc: Doc) -> List[Span]: RETURNS (list): The phrases as a list of Span objects. 
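The conversion loop in `train.py` above can also be run on its own against an existing gensim model. A condensed sketch, assuming a gensim 3.x `Word2Vec` model trained on text in the `word|sense` format (paths and the minimum count are hypothetical):

```python
from gensim.models import Word2Vec
from sense2vec import Sense2Vec
from sense2vec.util import split_key

w2v = Word2Vec.load("/path/to/gensim_model")
rows = []
all_senses = set()
for string, vocab in w2v.wv.vocab.items():
    if vocab.count < 10:  # same min_count idea as in the script
        continue
    rows.append((string, vocab.count, w2v.wv.vectors[vocab.index]))
    all_senses.add(split_key(string)[1])

s2v = Sense2Vec(shape=(len(rows), w2v.vector_size), senses=list(all_senses))
for string, freq, vector in rows:
    s2v.add(string, vector, freq)
s2v.to_disk("/path/to/output_vectors")
```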
""" spans = list(doc.ents) + ent_words = set() + for span in spans: + ent_words.update(token.i for token in span) if doc.is_parsed: for np in doc.noun_chunks: - while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): - np = np[1:] - spans.append(np) + # Prefer entities over noun chunks if there's overlap. + if not any(w.i in ent_words for w in np): + while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): + np = np[1:] + spans.append(np) return spans From d03a4e0c9af0303b25bb45c6ace6a0b6ef70d3a9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 14:57:11 +0100 Subject: [PATCH 100/297] Update scripts and document [ci skip] --- README.md | 71 +++++++++++++++++++++++++++++++++++++++++++ scripts/preprocess.py | 17 ++++------- scripts/train.py | 16 ++++++---- 3 files changed, 87 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 0e12b24..8255bc1 100644 --- a/README.md +++ b/README.md @@ -468,6 +468,77 @@ s2v.to_disk("/path/to/sense2vec") new_s2v = Sense2Vec().from_disk("/path/to/sense2vec") ``` +## 🚂 Training your own sense2vec vectors + +The [`/scripts`](/scripts) directory contains command line utilities for +preprocessing text and training your own vectors. To train your own sense2vec +vectors, you'll need the following: + +- A **very large** source of raw text (ideally more than you'd use for word2vec, + since the senses make the vocabulary more sparse). We recommend at least 1 + billion words. +- A [pretrained spaCy model](https://spacy.io/models) that assigns + part-of-speech tags, dependencies and named entities, and populates the + `doc.noun_chunks`. If the language you need doesn't provide a built in + [syntax iterator for noun phrases](https://spacy.io/usage/adding-languages#syntax-iterators), + you'll need to write your own. (The `doc.noun_chunks` and `doc.ents` are what + sense2vec uses to determine what's a phrase.) + +### script `preprocess.py` + +Preprocess a corpus for training a sense2vec model. It takes a text file with +one sentence per line, and outputs a text file with one sentence per line in the +expected sense2vec format (merged noun phrases, concatenated phrases with +underscores and added "senses"). + +```bash +python preprocess.py [in_file] [out_file] [spacy_model] [--n-process] +``` + +| Argument | Type | Description | +| ------------------- | ---------- | ------------------------------------------------------------------------------------ | +| `in_file` | positional | Path to input file. | +| `out_file` | positional | Path to output file. | +| `spacy_model` | positional | Name of [spaCy model](https://spacy.io/models) to use. Defaults to `en_core_web_sm`. | +| `--n-process`, `-n` | option | Number of processes (multiprocessing). Defaults to `1`. | + +#### Example input + +``` +Rats, mould and broken furniture: the scandal of the UK's refugee housing +``` + +#### Example output + +``` +Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT +the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN +``` + +### script `train.py` + +Train a sense2vec model using [Gensim](https://radimrehurek.com/gensim/). +Accepts a text file or a directory of text files in the format created by the +preprocessing script. Saves out a sense2vec model component that can be loaded +via `Sense2Vec.from_disk`. 
+ +```bash +python train.py [input_data] [output_dir] [--n-workers] [--size] [--window] +[--min-count] [--negative] [--n-iter] [--verbose] +``` + +| Argument | Type | Description | +| ------------------- | ---------- | ------------------------------------------------------------------- | +| `input_data` | positional | Location of input directory or text file. | +| `output_dir` | positional | Location of output directory. Will be created if it doesn't exist. | +| `--n-workers`, `-n` | option | Number of workers. Defaults to `4`. | +| `--size`, `-s` | option | Dimension of the vectors. Defaults to `128`. | +| `--window`, `-w` | option | Context window size. Defaults to `5`. | +| `--min-count`, `-m` | option | The minimum frequency of the term to be included. Defaults to `10`. | +| `--negative`, `-g` | option | Number of negative examples for Word2Vec. Defaults to `5`. | +| `--n-iter`, `-i` | option | Number of iterations. | +| `--verbose`, `-V` | flag | Log debugging info. | + ## 🍳 Prodigy recipes This package also seamlessly integrates with the [Prodigy](https://prodi.gy) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index e88102a..2dd2fe6 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -20,14 +20,14 @@ def represent_doc(doc): in_file=("Path to input file", "positional", None, str), out_file=("Path to output file", "positional", None, str), spacy_model=("Name of spaCy model to use", "positional", None, str), - n_workers=("Number of workers", "option", "n", int), + n_process=("Number of processes (multiprocessing)", "option", "n", int), ) -def main(in_file, out_file, spacy_model="en_core_web_sm", n_workers=4): +def main(in_file, out_file, spacy_model="en_core_web_sm", n_process=1): """ This script can be used to preprocess a corpus for training a sense2vec - model. It take text file with one sentence per line, and outputs a text file - with one sentence per line in the expected sense2vec format (merged noun - phrases, concatenated phrases with underscores and added "senses"). + model. It takes a text file with one sentence per line, and outputs a text + file with one sentence per line in the expected sense2vec format (merged + noun phrases, concatenated phrases with underscores and added "senses"). Example input: Rats, mould and broken furniture: the scandal of the UK's refugee housing @@ -35,11 +35,6 @@ def main(in_file, out_file, spacy_model="en_core_web_sm", n_workers=4): Example output: Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN - - DISCLAIMER: The sense2vec training and preprocessing tools are still a work - in progress. Please note that this script hasn't been optimised for - efficiency yet and doesn't paralellize or batch up any of the work, so you - might have to add this functionality yourself for now. 
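Putting the two scripts documented above together, a typical end-to-end run looks roughly like this (file names and paths are hypothetical):

```bash
python preprocess.py raw_text.txt corpus_s2v.txt en_core_web_sm --n-process 4
python train.py corpus_s2v.txt /path/to/output_vectors --size 128 --min-count 10
```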
""" msg = Printer() input_path = Path(in_file) @@ -52,7 +47,7 @@ def main(in_file, out_file, spacy_model="en_core_web_sm", n_workers=4): lines_count = 0 msg.text("Preprocessing text...") with input_path.open("r", encoding="utf8") as texts: - docs = nlp.pipe(texts, n_threads=n_workers) + docs = nlp.pipe(texts, n_process=n_process) lines = (represent_doc(doc) for doc in docs) with output_path.open("w", encoding="utf8") as f: for line in tqdm.tqdm(lines, desc="Lines", unit=""): diff --git a/scripts/train.py b/scripts/train.py index 31a61f1..e36abe9 100644 --- a/scripts/train.py +++ b/scripts/train.py @@ -13,11 +13,11 @@ input_data=("Location of input directory or text file", "positional", None, str), output_dir=("Location of output directory", "positional", None, str), n_workers=("Number of workers", "option", "n", int), - size=("Dimension of the word vectors", "option", "d", int), + size=("Dimension of the vectors", "option", "s", int), window=("Context window size", "option", "w", int), - min_count=("Min count", "option", "m", int), - negative=("Number of negative samples", "option", "g", int), - nr_iter=("Number of iterations", "option", "i", int), + min_count=("Minimum frequency of terms to be included", "option", "m", int), + negative=("Number of negative examples for Word2Vec", "option", "g", int), + n_iter=("Number of iterations", "option", "i", int), verbose=("Log debugging info", "flag", "V", bool), ) def main( @@ -28,9 +28,13 @@ def main( window=5, size=128, min_count=10, - nr_iter=2, + n_iter=2, verbose=False, ): + """Train a sense2vec model using Gensim. Accepts a text file or a directory + of text files in the format created by the preprocessing script. Saves out + a sense2vec model component that can be loaded via Sense2Vec.from_disk. + """ msg = Printer(hide_animation=verbose) if not Path(input_data).exists(): msg.fail("Can't find input data (file or directory)", input_data, exits=1) @@ -45,7 +49,7 @@ def main( workers=n_workers, sample=1e-5, negative=negative, - iter=nr_iter, + iter=n_iter, ) sentences = PathLineSentences(input_data) msg.info(f"Using input data from {len(sentences.input_files)} file(s)") From 8334c441c48f077977cf68762f75556012021eeb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 14:58:39 +0100 Subject: [PATCH 101/297] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8255bc1..934c188 100644 --- a/README.md +++ b/README.md @@ -26,10 +26,10 @@ simple Python implementation for loading and querying sense2vec models. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. - **Train your own vectors** using a pretrained spaCy model and raw text of your - choice. + choice ([details](#-training-your-own-sense2vec-vectors)). - [Prodigy](https://prodi.gy) annotation recipes for creating lists of similar multi-word phrases and converting them to match patterns, e.g. for rule-based - NER or to boostrap NER annotation ([details & examples](#prodigy-recipes)). + NER or to boostrap NER annotation ([details & examples](#-prodigy-recipes)). 
## 🚀 Usage Examples From fdbf52116b57899a04fd3c489cbfb26de0e89459 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 15:14:06 +0100 Subject: [PATCH 102/297] Add prefer_ents setting to make_spacy_key --- scripts/preprocess.py | 4 +++- sense2vec/component.py | 4 +++- sense2vec/util.py | 10 ++++++++-- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/scripts/preprocess.py b/scripts/preprocess.py index 76dd6e3..023afbe 100644 --- a/scripts/preprocess.py +++ b/scripts/preprocess.py @@ -10,7 +10,9 @@ def represent_doc(doc): strings = [] for sent in doc.sents: if sent.text.strip(): - words = " ".join(make_spacy_key(w) for w in sent if not w.is_space) + words = " ".join( + make_spacy_key(w, prefer_ents=True) for w in sent if not w.is_space + ) strings.append(words) return "\n".join(strings) + "\n" if strings else "" diff --git a/sense2vec/component.py b/sense2vec/component.py index fc8c71d..3e9112a 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -103,7 +103,9 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: obj (Token / Span): The object to create the key for. RETURNS (unicode): The key. """ - return make_spacy_key(obj, obj.doc._._s2v.make_key) + return make_spacy_key( + obj, obj.doc._._s2v.make_key, prefer_ents=self.merge_phrases + ) def s2v_most_similar( self, obj: Union[Token, Span], n: int = 10 diff --git a/sense2vec/util.py b/sense2vec/util.py index 9b9a391..2f55516 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -29,7 +29,9 @@ def split_key(key: str) -> Tuple[str, str]: def make_spacy_key( - obj: Union[Token, Span], make_key: Callable[[str, str], str] = make_key + obj: Union[Token, Span], + make_key: Callable[[str, str], str] = make_key, + prefer_ents: bool = False, ) -> str: """Create a key from a spaCy object, i.e. a Token or Span. If the object is a token, the part-of-speech tag (Token.pos_) is used for the sense @@ -40,6 +42,10 @@ def make_spacy_key( obj (Token / Span): The spaCy object to create the key for. make_key (callable): function that takes a word and sense string and creates the key (e.g. "word|sense"). + prefer_ents (bool): Prefer entity types for single tokens (i.e. + token.ent_type instead of tokens.pos_). Should be enabled if phrases + are merged into single tokens, because otherwise the entity sense would + never be used. RETURNS (unicode): The key. """ text = obj.text @@ -47,7 +53,7 @@ def make_spacy_key( if obj.like_url: text = "%%URL" sense = "X" - elif obj.ent_type_: + elif obj.ent_type_ and prefer_ents: sense = obj.ent_type_ else: sense = obj.pos_ From b327e087a0a1f5c4118bfef536561127f67c35c4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 15:22:15 +0100 Subject: [PATCH 103/297] Update docstring --- sense2vec/util.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 0184cf0..5309401 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -73,9 +73,14 @@ def get_phrases(doc: Doc) -> List[Span]: return spans -def is_particle(word, tags=("RP",), deps=("prt",)): - """Determine whether a word is a 'particle', for phrasal verb detection.""" - return (word.tag_ in tags or word.dep_ in deps) +def is_particle(token: Token, pos: Tuple[str] = ("PART",), deps: Tuple[str] = ("prt",)): + """Determine whether a word is a particle, for phrasal verb detection. + + token (Token): The token to check. + pos (tuple): The universal POS tags to check (Token.pos_). + deps (tuple): The dependency labels to check (Token.dep_). 
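The new `prefer_ents` flag above only changes which sense is chosen for single tokens that are part of an entity. A sketch, assuming an English pipeline where "Google" is tagged `PROPN` and labelled `ORG` (the exact predictions depend on the model):

```python
import spacy
from sense2vec.util import make_spacy_key

nlp = spacy.load("en_core_web_sm")
doc = nlp("I work at Google.")
token = doc[3]  # "Google"
print(make_spacy_key(token))                    # likely "Google|PROPN"
print(make_spacy_key(token, prefer_ents=True))  # likely "Google|ORG"
```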
+ """ + return token.pos_ in pos or token.dep_ in deps def merge_phrases(doc: Doc) -> Doc: From c126716554c8b84e0025f205c01dfdb2aa74491b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 15:23:15 +0100 Subject: [PATCH 104/297] Remove phrasal verbs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Can't support it like this: logic should live in merge_phrases and not in the function that generates the key – otherwise all attributes are out-of-sync and s2v secretly produces more objects that are not on record anywhere. Need to find another way to manage non-spans. --- sense2vec/util.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 5309401..36679c2 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -51,9 +51,6 @@ def make_spacy_key( sense = obj.pos_ elif isinstance(obj, Span): sense = obj.label_ or obj.root.pos_ - if obj.doc.is_parsed and isinstance(obj, Token) and sense == "VERB": - particles = [child.text for child in obj.children if is_particle(child)] - text = "_".join([obj.text] + particles) return make_key(text, sense or DEFAULT_SENSE) From c9bfa77654980b2018bddd47de56cbe6cd57fe2a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 15:48:35 +0100 Subject: [PATCH 105/297] Add return type annotation --- sense2vec/util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 8ddd489..240d27e 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -83,7 +83,9 @@ def get_phrases(doc: Doc) -> List[Span]: return spans -def is_particle(token: Token, pos: Tuple[str] = ("PART",), deps: Tuple[str] = ("prt",)): +def is_particle( + token: Token, pos: Tuple[str] = ("PART",), deps: Tuple[str] = ("prt",) +) -> bool: """Determine whether a word is a particle, for phrasal verb detection. token (Token): The token to check. From f98128f4135e13a4d7e991228c82e795c4f30bda Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 15:52:28 +0100 Subject: [PATCH 106/297] Move get_noun_chunks to own function --- sense2vec/util.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 240d27e..a107347 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -62,6 +62,25 @@ def make_spacy_key( return make_key(text, sense or DEFAULT_SENSE) +def get_noun_phrases(doc: Doc) -> List[Span]: + """Compile a list of noun phrases in sense2vec's format (without + determiners). Separated out to make it easier to customize, e.g. for + languages that don't implement a noun_chunks iterator out-of-the-box, or + use different label schemes. + + doc (Doc): The Doc to get noun phrases from. + RETURNS (list): The noun phrases as a list of Span objects. + """ + trim_labels = ("advmod", "amod", "compound") + spans = [] + if doc.is_parsed: + for np in doc.noun_chunks: + while len(np) > 1 and np[0].dep_ not in trim_labels: + np = np[1:] + spans.append(np) + return spans + + def get_phrases(doc: Doc) -> List[Span]: """Compile a list of sense2vec phrases based on a processed Doc: named entities and noun chunks without determiners. @@ -73,13 +92,10 @@ def get_phrases(doc: Doc) -> List[Span]: ent_words = set() for span in spans: ent_words.update(token.i for token in span) - if doc.is_parsed: - for np in doc.noun_chunks: - # Prefer entities over noun chunks if there's overlap. 
- if not any(w.i in ent_words for w in np): - while len(np) > 1 and np[0].dep_ not in ("advmod", "amod", "compound"): - np = np[1:] - spans.append(np) + for np in get_noun_phrases(doc): + # Prefer entities over noun chunks if there's overlap + if not any(w.i in ent_words for w in np): + spans.append(np) return spans From 2521b9dae98a19abe4f1e65f5cccc7feb74c3df3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 16:48:23 +0100 Subject: [PATCH 107/297] Update README --- README.md | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 934c188..8f7f32d 100644 --- a/README.md +++ b/README.md @@ -92,16 +92,8 @@ s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") ### Usage with spaCy v2.x The easiest way to use the library and vectors is to plug it into your spaCy -pipeline. Note that `sense2vec` doesn't depend on spaCy, so you'll have to -install it separately and download the English model. - -```bash -pip install -U spacy -python -m spacy download en_core_web_sm -``` - -The `sense2vec` package exposes a `Sense2VecComponent`, which can be initialised -with the shared vocab and added to your spaCy pipeline as a +pipeline. The `sense2vec` package exposes a `Sense2VecComponent`, which can be +initialised with the shared vocab and added to your spaCy pipeline as a [custom pipeline component](https://spacy.io/usage/processing-pipelines#custom-components). By default, components are added to the _end of the pipeline_, which is the recommended position for this component, since it needs access to the dependency @@ -112,14 +104,11 @@ import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/reddit_vectors-1.1.0") +s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/sense2vec_vectors") nlp.add_pipe(s2v) ``` -The pipeline component will **merge noun phrases and entities** according to the -same schema used when training the sense2vec models (e.g. noun chunks without -determiners like "the"). This ensures that you'll be able to retrieve meaningful -vectors for phrases in your text. The component will also add serveral +The component will add serveral [extension attributes and methods](https://spacy.io/usage/processing-pipelines#custom-components-attributes) to spaCy's `Token` and `Span` objects that let you retrieve vectors and frequencies, as well as most similar terms. From ac59f8ea6428c8610ffcb89b7d3a9cd9661eff76 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 16:48:31 +0100 Subject: [PATCH 108/297] Update version --- sense2vec/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/about.py b/sense2vec/about.py index e3ac18c..6559039 100644 --- a/sense2vec/about.py +++ b/sense2vec/about.py @@ -1 +1 @@ -__version__ = "1.0.0.dev0" +__version__ = "1.0.0a2" From c01c79ebb8801824b5da9158a9e792979a62460d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 16:48:48 +0100 Subject: [PATCH 109/297] Auto-format --- sense2vec/component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 3e9112a..ed6269a 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -17,7 +17,7 @@ def __init__( vocab: Vocab = None, shape: Tuple[int, int] = (1000, 128), merge_phrases: bool = False, - **kwargs + **kwargs, ): """Initialize the pipeline component. 
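Because the component now passes keyword arguments through (see the `from_nlp` change earlier) and documents `merge_phrases`, options can be set when constructing it. A sketch of the intended usage, with a hypothetical vectors path:

```python
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")
# merge_phrases=True is meant to merge entities and noun phrases in the doc,
# so single tokens can carry phrase keys like "natural_language_processing|NOUN"
s2v = Sense2VecComponent(nlp.vocab, merge_phrases=True).from_disk("/path/to/sense2vec_vectors")
nlp.add_pipe(s2v)

doc = nlp("A sentence about natural language processing.")
for token in doc:
    if token._.in_s2v:  # only query entries that are actually in the table
        print(token.text, token._.s2v_key, token._.s2v_freq)
```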
From 4789503b5d10568668ac28f6e5527791054ac92c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 16:49:11 +0100 Subject: [PATCH 110/297] Use new component decorator --- sense2vec/component.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index ed6269a..742e32a 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -1,4 +1,5 @@ from typing import Tuple, Union, List +from spacy import component from spacy.tokens import Doc, Token, Span from spacy.vocab import Vocab from spacy.language import Language @@ -9,9 +10,28 @@ from .util import merge_phrases, get_phrases, make_spacy_key +@component( + "sense2vec", + requires=["token.pos", "token.dep", "token.ent_type", "token.ent_iob", "doc.ents"], + assigns=[ + "doc._._s2v", + "doc._.s2v_phrases", + "token._.in_s2v", + "token._.s2v_key", + "token._.s2v_vec", + "token._.s2v_freq", + "token._.s2v_other_senses", + "token._.s2v_most_similar", + # TODO: requires https://github.com/explosion/spaCy/pull/4555 + # "span._.in_s2v", + # "span._.s2v_key", + # "span._.s2v_vec", + # "span._.s2v_freq", + # "span._.s2v_other_senses", + # "span._.s2v_most_similar", + ], +) class Sense2VecComponent(object): - name = "sense2vec" - def __init__( self, vocab: Vocab = None, From da1cd77dca8f8d49e7abeaafc7f33af56dbbf119 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 17:54:12 +0100 Subject: [PATCH 111/297] Mark as zip-safe --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 9b2f885..f046011 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,7 +22,7 @@ classifiers = Topic :: Scientific/Engineering [options] -zip_safe = false +zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = From 6b5b4cfed33ee2dfadb6e7efaf32fc47543cb2e1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 18:00:11 +0100 Subject: [PATCH 112/297] Update README.md [ci skip] --- README.md | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 8f7f32d..115024d 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ interactive example of the technology, see our semantic similarities across all Reddit comments of 2015. This library is a simple Python implementation for loading and querying sense2vec models. -🦆 **Version 1.0 out now!** +🦆 **Version 1.0 alpha out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) @@ -38,7 +38,7 @@ simple Python implementation for loading and querying sense2vec models. 
```python from sense2vec import Sense2Vec -s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") +s2v = Sense2Vec().from_disk("/path/to/sense2vec_vectors") query = "natural_language_processing|NOUN" assert query in s2v vector = s2v[query] @@ -56,7 +56,7 @@ import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/reddit_vectors-1.1.0") +s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/sense2vec_vectors") nlp.add_pipe(s2v) doc = nlp("A sentence about natural language processing.") @@ -71,10 +71,14 @@ most_similar = doc[3:6]._.s2v_most_similar(3) ## ⏳ Installation & Setup +> ️🚨 **This is an alpha release so you need to specify the explicit version +> during installation. The pre-packaged vectors are just a converted version of +> the old model and will be updated for the stable release.** + sense2vec releases are available on pip: ```bash -pip install sense2vec +pip install sense2vec==1.0.0a2 ``` The Reddit vectors model is attached to the @@ -84,12 +88,12 @@ extracted data directory: ```python from sense2vec import Sense2Vec -s2v = Sense2Vec().from_disk("/path/to/reddit_vectors-1.1.0") +s2v = Sense2Vec().from_disk("/path/to/sense2vec_vectors") ``` ## 👩‍💻 Usage -### Usage with spaCy v2.x +### Usage with spaCy v2.2+ The easiest way to use the library and vectors is to plug it into your spaCy pipeline. The `sense2vec` package exposes a `Sense2VecComponent`, which can be From 1a513dcbf2f4b26b264b074b00ca24484818c8ab Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 19:12:07 +0100 Subject: [PATCH 113/297] Update spaCy pin and component decorator --- requirements.txt | 2 +- sense2vec/component.py | 13 ++++++------- setup.cfg | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/requirements.txt b/requirements.txt index b5c4346..03f27f3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -spacy==2.2.2.dev3 +spacy==2.2.2.dev4 numpy>=1.15.0 srsly>=0.1.0 # Development requirements diff --git a/sense2vec/component.py b/sense2vec/component.py index 742e32a..4fe0179 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -22,13 +22,12 @@ "token._.s2v_freq", "token._.s2v_other_senses", "token._.s2v_most_similar", - # TODO: requires https://github.com/explosion/spaCy/pull/4555 - # "span._.in_s2v", - # "span._.s2v_key", - # "span._.s2v_vec", - # "span._.s2v_freq", - # "span._.s2v_other_senses", - # "span._.s2v_most_similar", + "span._.in_s2v", + "span._.s2v_key", + "span._.s2v_vec", + "span._.s2v_freq", + "span._.s2v_other_senses", + "span._.s2v_most_similar", ], ) class Sense2VecComponent(object): diff --git a/setup.cfg b/setup.cfg index f046011..b85b922 100644 --- a/setup.cfg +++ b/setup.cfg @@ -26,7 +26,7 @@ zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = - spacy==2.2.2.dev3 + spacy==2.2.2.dev4 numpy>=1.15.0 srsly>=0.1.0 From 6e7cfe706a3b435f2a52b9ebcabb2c178fc3fee7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 30 Oct 2019 19:20:37 +0100 Subject: [PATCH 114/297] Update .gitignore [ci skip] --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ea09aa7..a1eb014 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ cythonize.dat .vscode .mypy_cache .prettierrc +.python-version # Byte-compiled / optimized / DLL files __pycache__/ From f5c064451a4648d55430ebc2fc39d21ee0e54780 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 13:35:41 +0100 
Subject: [PATCH 115/297] Update version pin for 3.8 compat --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 03f27f3..ea3534a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ spacy==2.2.2.dev4 numpy>=1.15.0 -srsly>=0.1.0 +srsly>=0.2.0 # Development requirements pytest>=5.2.0,<6.0.0 diff --git a/setup.cfg b/setup.cfg index b85b922..b0c814d 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,7 +28,7 @@ python_requires = >=3.6 install_requires = spacy==2.2.2.dev4 numpy>=1.15.0 - srsly>=0.1.0 + srsly>=0.2.0 [options.entry_points] spacy_factories = From 6f8da90ce64cf1bdf213ab535c9d5acff3c65fe5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 13:36:12 +0100 Subject: [PATCH 116/297] Replace about.py with importlib_metadata --- push-tag.sh | 8 ++------ requirements.txt | 1 + sense2vec/__init__.py | 4 +++- sense2vec/about.py | 1 - sense2vec/util.py | 5 +++++ setup.cfg | 2 ++ setup.py | 14 +------------- 7 files changed, 14 insertions(+), 21 deletions(-) delete mode 100644 sense2vec/about.py diff --git a/push-tag.sh b/push-tag.sh index a5a29c9..ae2dced 100755 --- a/push-tag.sh +++ b/push-tag.sh @@ -9,11 +9,7 @@ git checkout $1 git pull origin $1 git push origin $1 -version=$(grep "__version__ = " sense2vec/about.py) -version=${version/__version__ = } -version=${version/\'/} -version=${version/\'/} -version=${version/\"/} -version=${version/\"/} +version=$(grep "version = " setup.cfg) +version=${version/version = } git tag "v$version" git push origin "v$version" diff --git a/requirements.txt b/requirements.txt index ea3534a..f4549c8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ spacy==2.2.2.dev4 numpy>=1.15.0 srsly>=0.2.0 +importlib_metadata>=0.20; python_version < "3.8" # Development requirements pytest>=5.2.0,<6.0.0 diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 9db2b5a..e62af15 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,3 +1,5 @@ -from .about import __version__ # noqa: F401 from .sense2vec import Sense2Vec # noqa: F401 from .component import Sense2VecComponent # noqa: F401 +from .util import importlib_metadata + +__version__ = importlib_metadata.version(__name__) diff --git a/sense2vec/about.py b/sense2vec/about.py deleted file mode 100644 index 6559039..0000000 --- a/sense2vec/about.py +++ /dev/null @@ -1 +0,0 @@ -__version__ = "1.0.0a2" diff --git a/sense2vec/util.py b/sense2vec/util.py index a107347..1688328 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -3,6 +3,11 @@ from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans +try: + import importlib.metadata as importlib_metadata # Python 3.8 +except ImportError: + import importlib_metadata # noqa: F401 + DEFAULT_SENSE = "?" 
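The net effect of dropping `about.py` is that the version is resolved from package metadata at import time — a sketch of the lookup, with a placeholder version string:

```python
# On Python 3.8+ the stdlib module is available; older interpreters fall back
# to the importlib_metadata backport pinned in requirements.txt.
try:
    import importlib.metadata as importlib_metadata
except ImportError:
    import importlib_metadata

print(importlib_metadata.version("sense2vec"))  # e.g. "1.0.0a2"
```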
diff --git a/setup.cfg b/setup.cfg index b0c814d..ee09a70 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,4 +1,5 @@ [metadata] +version = 1.0.0a2 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion @@ -29,6 +30,7 @@ install_requires = spacy==2.2.2.dev4 numpy>=1.15.0 srsly>=0.2.0 + importlib_metadata>=0.20; python_version < "3.8" [options.entry_points] spacy_factories = diff --git a/setup.py b/setup.py index fa6192a..2a5cf30 100644 --- a/setup.py +++ b/setup.py @@ -1,21 +1,9 @@ #!/usr/bin/env python -from __future__ import unicode_literals - -import os -import io from setuptools import setup, find_packages def setup_package(): - package_name = "sense2vec" - root = os.path.abspath(os.path.dirname(__file__)) - # Read in package meta from about.py - about_path = os.path.join(root, package_name, "about.py") - with io.open(about_path, encoding="utf8") as f: - about = {} - exec(f.read(), about) - - setup(name=package_name, version=about["__version__"], packages=find_packages()) + setup(name="sense2vec", packages=find_packages()) if __name__ == "__main__": From 95e228cd6aa7b12c08d1697760ea6ee2364205ff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 13:39:27 +0100 Subject: [PATCH 117/297] Move push-tag script to bin for consistency --- push-tag.sh => bin/push-tag.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename push-tag.sh => bin/push-tag.sh (100%) diff --git a/push-tag.sh b/bin/push-tag.sh similarity index 100% rename from push-tag.sh rename to bin/push-tag.sh From 2ea9215ef0813ca2b191ee5ecb344c7161a4c11d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 14:23:48 +0100 Subject: [PATCH 118/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 115024d..51d50a3 100644 --- a/README.md +++ b/README.md @@ -147,7 +147,7 @@ property: The following attributes are available via the `._` property of `Token` and `Span` objects – for example `token._.in_s2v`: -| Name | Attribute Type | Type | Description | +| Name | Attribute Type | Return Type | Description | | ------------------ | -------------- | ------------------ | ---------------------------------------------------------------------------------- | | `in_s2v` | property | bool | Whether a key exists in the vector map. | | `s2v_key` | property | unicode | The sense2vec key of the given object, e.g. `"duck|NOUN"`. | From dbbe969b0b679bab645a72feb384cae8a1fc8394 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 14:24:14 +0100 Subject: [PATCH 119/297] Add Sense2vec.similarity --- README.md | 19 +++++++++++++++++++ sense2vec/component.py | 16 ++++++++++++++++ sense2vec/sense2vec.py | 22 ++++++++++++++++++++++ tests/test_component.py | 10 ++++++++++ tests/test_sense2vec.py | 15 +++++++++++++++ 5 files changed, 82 insertions(+) diff --git a/README.md b/README.md index 51d50a3..8a84b0d 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,7 @@ The following attributes are available via the `._` property of `Token` and | `s2v_freq` | property | int | The frequency of the given key. | | `s2v_other_senses` | property | list | Available other senses, e.g. `"duck|VERB"` for `"duck|NOUN"`. | | `s2v_most_similar` | method | list | Get the `n` most similar terms. Returns a list of `((word, sense), score)` tuples. | +| `s2v_similarity` | method | float | Get the similarity to another `Token` or `Span`. 
| > ⚠️ **A note on span attributes:** Under the hood, entities in `doc.ents` are > `Span` objects. This is why the pipeline component also adds attributes and @@ -355,6 +356,24 @@ s2v = Sense2Vec(senses=["VERB", "NOUN"]) assert "VERB" in s2v.senses ``` +### method `Sense2vec.similarity` + +Make a semantic similarity estimate of two keys or two sets of keys. The default +estimate is cosine similarity using an average of vectors. + +| Argument | Type | Description | +| ----------- | ------------------------ | ----------------------------------- | +| `keys_a` | unicode / int / iterable | The string or integer key(s). | +| `keys_b` | unicode / int / iterable | The other string or integer key(s). | +| **RETURNS** | float | The similarity score. | + +```python +keys_a = ["machine_learning|NOUN", "natural_language_processing|NOUN"] +keys_b = ["computer_vision|NOUN", "object_detection|NOUN"] +print(s2v.similarity(keys_a, keys_b)) +assert s2v.similarity("machine_learning|NOUN", "machine_learning|NOUN") == 1.0 +``` + ### method `Sense2Vec.most_similar` Get the most similar entries in the table. If more than one key is provided, the diff --git a/sense2vec/component.py b/sense2vec/component.py index 4fe0179..3315955 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -22,12 +22,14 @@ "token._.s2v_freq", "token._.s2v_other_senses", "token._.s2v_most_similar", + "token._.s2v_similarity", "span._.in_s2v", "span._.s2v_key", "span._.s2v_vec", "span._.s2v_freq", "span._.s2v_other_senses", "span._.s2v_most_similar", + "span._.s2v_similarity", ], ) class Sense2VecComponent(object): @@ -90,6 +92,7 @@ def init_component(self): obj.set_extension("s2v_freq", getter=self.s2v_freq) obj.set_extension("s2v_other_senses", getter=self.s2v_other_senses) obj.set_extension("s2v_most_similar", method=self.s2v_most_similar) + obj.set_extension("s2v_similarity", method=self.s2v_similarity) def in_s2v(self, obj: Union[Token, Span]) -> bool: """Extension attribute getter. Check if a token or span has a vector. @@ -126,11 +129,24 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: obj, obj.doc._._s2v.make_key, prefer_ents=self.merge_phrases ) + def s2v_similarity(self, obj: Union[Token, Span], other: Union[Token, Span]) -> str: + """Extension attribute method. Estimate the similarity of two objects. + + obj (Token / Span): The object the attribute is called on. + other (Token / Span): The object to compare it to. + RETURNS (float): The similarity score. + """ + if not isinstance(other, (Token, Span)): + msg = f"Can only get similarity of Token or Span, not {type(other)}" + raise ValueError(msg) + return obj.doc._._s2v.similarity(self.s2v_key(obj), self.s2v_key(other)) + def s2v_most_similar( self, obj: Union[Token, Span], n: int = 10 ) -> List[Tuple[Tuple[str, str], float]]: """Extension attribute method. Get the most similar entries. + obj (Token / Span): The object the attribute is called on. n (int): The number of similar entries to return. RETURNS (list): The most similar entries as a list of ((word, sense), score) tuples. diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 6c1c93c..8e8729c 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -140,6 +140,28 @@ def ensure_int_key(self, key: Union[str, int]) -> int: """ return key if isinstance(key, int) else self.strings[key] + def similarity(self, keys_a, keys_b): + """Make a semantic similarity estimate of two keys or two sets of keys. + The default estimate is cosine similarity using an average of vectors. 
+ + keys_a (unicode / int / iterable): The string or integer key(s). + keys_b (unicode / int / iterable): The other string or integer key(s). + RETURNS (float): The similarity score. + """ + if isinstance(keys_a, (str, int)): + keys_a = [keys_a] + if isinstance(keys_b, (str, int)): + keys_b = [keys_b] + average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0) + average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0) + if average_a.all() == 0 or average_b.all() == 0: + return 0.0 + norm_a = numpy.linalg.norm(average_a) + norm_b = numpy.linalg.norm(average_b) + if norm_a == norm_b: + return 1.0 + return numpy.dot(average_a, average_b) / (norm_a * norm_b) + def most_similar( self, keys: Union[Iterable[Union[str, int]], str, int], diff --git a/tests/test_component.py b/tests/test_component.py index 8978796..0886943 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -45,6 +45,16 @@ def test_component_attributes_ents(doc): assert phrase[0]._.in_s2v is True +def test_component_similarity(doc): + s2v = Sense2VecComponent(doc.vocab, shape=(4, 4)) + s2v.first_run = False + vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.s2v.add("hello|INTJ", vector) + s2v.s2v.add("world|NOUN", vector) + doc = s2v(doc) + assert doc[0]._.s2v_similarity(doc[1]) == 1.0 + + def test_component_to_from_bytes(doc): s2v = Sense2VecComponent(doc.vocab, shape=(1, 4)) s2v.first_run = False diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 6aaf5ca..63d00f9 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -64,6 +64,21 @@ def test_sense2vec_best_sense(): assert s2v.get_best_sense("a") is None +def test_sense2vec_similarity(): + s2v = Sense2Vec(shape=(5, 4)) + s2v.add("a", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32)) + s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32)) + s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32)) + s2v.add("d", numpy.asarray([0.1, 0.2, 0.3, 0.4], dtype=numpy.float32)) + s2v.add("e", numpy.asarray([0, 0, 0, 0], dtype=numpy.float32)) + assert s2v.similarity("a", "b") == 1.0 + assert 1.0 > s2v.similarity("b", "c") > 0.9 + assert 1.0 > s2v.similarity(["a", "b"], "c") > 0.9 + assert s2v.similarity("b", "c") == s2v.similarity(["a", "b"], "c") + assert s2v.similarity("a", "d") < 0.8 + assert s2v.similarity("a", "e") == 0.0 + + def test_sense2vec_most_similar(): s2v = Sense2Vec(shape=(6, 4)) s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)) From 1e1916b2e86c786d58975570c0822565d0a053dd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 14:25:44 +0100 Subject: [PATCH 120/297] Add span similarity test --- tests/test_component.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_component.py b/tests/test_component.py index 0886943..3b2e745 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -53,6 +53,7 @@ def test_component_similarity(doc): s2v.s2v.add("world|NOUN", vector) doc = s2v(doc) assert doc[0]._.s2v_similarity(doc[1]) == 1.0 + assert doc[1:3]._.s2v_similarity(doc[1:3]) == 1.0 def test_component_to_from_bytes(doc): From 3342c3b5562be435087e702687f0765364203a12 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 14:39:15 +0100 Subject: [PATCH 121/297] Fix types [ci skip] --- sense2vec/component.py | 4 +++- sense2vec/sense2vec.py | 6 +++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 3315955..9c5301d 100644 --- a/sense2vec/component.py +++ 
b/sense2vec/component.py @@ -129,7 +129,9 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: obj, obj.doc._._s2v.make_key, prefer_ents=self.merge_phrases ) - def s2v_similarity(self, obj: Union[Token, Span], other: Union[Token, Span]) -> str: + def s2v_similarity( + self, obj: Union[Token, Span], other: Union[Token, Span] + ) -> float: """Extension attribute method. Estimate the similarity of two objects. obj (Token / Span): The object the attribute is called on. diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 8e8729c..4e099c9 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -140,7 +140,11 @@ def ensure_int_key(self, key: Union[str, int]) -> int: """ return key if isinstance(key, int) else self.strings[key] - def similarity(self, keys_a, keys_b): + def similarity( + self, + keys_a: Union[Iterable[Union[str, int]], str, int], + keys_b: Union[Iterable[Union[str, int]], str, int], + ) -> float: """Make a semantic similarity estimate of two keys or two sets of keys. The default estimate is cosine similarity using an average of vectors. From 64b0d30d4ed06ccab69ecd2d6beb6e6c936c1802 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 14:46:21 +0100 Subject: [PATCH 122/297] Fix types and mypy config --- sense2vec/sense2vec.py | 21 ++++++++++----------- sense2vec/util.py | 4 ++-- setup.cfg | 3 +++ 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 4e099c9..88753d0 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,5 +1,4 @@ -from typing import Callable, Tuple, List, Union, Iterable, Dict -from collections import OrderedDict +from typing import Callable, Tuple, List, Union, Sequence, Dict from pathlib import Path from spacy.vectors import Vectors from spacy.strings import StringStore @@ -142,8 +141,8 @@ def ensure_int_key(self, key: Union[str, int]) -> int: def similarity( self, - keys_a: Union[Iterable[Union[str, int]], str, int], - keys_b: Union[Iterable[Union[str, int]], str, int], + keys_a: Union[Sequence[Union[str, int]], str, int], + keys_b: Union[Sequence[Union[str, int]], str, int], ) -> float: """Make a semantic similarity estimate of two keys or two sets of keys. The default estimate is cosine similarity using an average of vectors. @@ -168,7 +167,7 @@ def similarity( def most_similar( self, - keys: Union[Iterable[Union[str, int]], str, int], + keys: Union[Sequence[Union[str, int]], str, int], n: int = 10, batch_size: int = 16, ) -> List[Tuple[str, float]]: @@ -197,8 +196,8 @@ def most_similar( result_keys, _, scores = self.vectors.most_similar( average, n=n_similar, batch_size=batch_size ) - result = OrderedDict(zip(result_keys.flatten(), scores.flatten())) - result = [(self.strings[key], score) for key, score in result.items() if key] + result = list(zip(result_keys.flatten(), scores.flatten())) + result = [(self.strings[key], score) for key, score in result if key] result = [(key, score) for key, score in result if key not in keys] return result @@ -243,7 +242,7 @@ def get_best_sense(self, word: str, ignore_case: bool = True) -> Union[str, None freqs.append((freq, key)) return max(freqs)[1] if freqs else None - def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: + def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: """Serialize a Sense2Vec object to a bytestring. exclude (list): Names of serialization fields to exclude. 
@@ -256,7 +255,7 @@ def to_bytes(self, exclude: Iterable[str] = tuple()) -> bytes: data["strings"] = self.strings.to_bytes() return srsly.msgpack_dumps(data) - def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): + def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a bytestring. bytes_data (bytes): The data to load. @@ -271,7 +270,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Iterable[str] = tuple()): self.strings = StringStore().from_bytes(data["strings"]) return self - def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): + def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """Serialize a Sense2Vec object to a directory. path (unicode / Path): The path. @@ -284,7 +283,7 @@ def to_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): if "strings" not in exclude: self.strings.to_disk(path / "strings.json") - def from_disk(self, path: Union[Path, str], exclude: Iterable[str] = tuple()): + def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a directory. path (unicode / Path): The path to load from. diff --git a/sense2vec/util.py b/sense2vec/util.py index 1688328..e4dcb54 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -1,4 +1,4 @@ -from typing import Union, Callable, List, Tuple +from typing import Union, Callable, List, Tuple, Set import re from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans @@ -94,7 +94,7 @@ def get_phrases(doc: Doc) -> List[Span]: RETURNS (list): The phrases as a list of Span objects. """ spans = list(doc.ents) - ent_words = set() + ent_words: Set[str] = set() for span in spans: ent_words.update(token.i for token in span) for np in get_noun_phrases(doc): diff --git a/setup.cfg b/setup.cfg index ee09a70..619157f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -53,3 +53,6 @@ exclude = .env, .git, __pycache__, + +[mypy] +ignore_missing_imports = True From 4b933e747f0ff3f11fe98a64683d4ce0eef17756 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 15:19:43 +0100 Subject: [PATCH 123/297] Tidy up [ci skip] --- setup.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 2a5cf30..0468093 100644 --- a/setup.py +++ b/setup.py @@ -1,10 +1,6 @@ #!/usr/bin/env python -from setuptools import setup, find_packages +if __name__ == "__main__": + from setuptools import setup, find_packages -def setup_package(): setup(name="sense2vec", packages=find_packages()) - - -if __name__ == "__main__": - setup_package() From 4b84c2a93fbeb8347d0127c16869305fe1d81e47 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 17:47:47 +0100 Subject: [PATCH 124/297] Update spaCy pin --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index f4549c8..529dcf0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -spacy==2.2.2.dev4 +spacy>=2.2.2,<3.0.0 numpy>=1.15.0 srsly>=0.2.0 importlib_metadata>=0.20; python_version < "3.8" diff --git a/setup.cfg b/setup.cfg index 619157f..daaa5dc 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,7 +27,7 @@ zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = - spacy==2.2.2.dev4 + spacy>=2.2.2,<3.0.0 numpy>=1.15.0 srsly>=0.2.0 importlib_metadata>=0.20; python_version < "3.8" From b20a19623d5b507d9975920ccde4cca3abc9f80b Mon Sep 17 
00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 21:44:22 +0100 Subject: [PATCH 125/297] Raise error for invalid keys --- sense2vec/util.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sense2vec/util.py b/sense2vec/util.py index e4dcb54..8413163 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -29,6 +29,8 @@ def split_key(key: str) -> Tuple[str, str]: key (unicode): The key to split. RETURNS (tuple): The split (word, sense) tuple. """ + if "|" not in key: + raise ValueError(f"Invalid key: {key}") word, sense = key.replace("_", " ").rsplit("|", 1) return word, sense From d8c2dfb4f22602de5abae2d9d05e1b3b06874fff Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 21:46:02 +0100 Subject: [PATCH 126/297] Update preprocessing and training scripts [ci skip] --- README.md | 83 ++++++++++------------------ scripts/01_parse.py | 48 +++++++++++++++++ scripts/02_preprocess.py | 65 ++++++++++++++++++++++ scripts/03_glove_build_counts.py | 86 +++++++++++++++++++++++++++++ scripts/04_glove_train_vectors.py | 67 +++++++++++++++++++++++ scripts/05_export.py | 63 ++++++++++++++++++++++ scripts/preprocess.py | 62 --------------------- scripts/requirements.txt | 1 - scripts/train.py | 90 ------------------------------- 9 files changed, 356 insertions(+), 209 deletions(-) create mode 100644 scripts/01_parse.py create mode 100644 scripts/02_preprocess.py create mode 100644 scripts/03_glove_build_counts.py create mode 100644 scripts/04_glove_train_vectors.py create mode 100644 scripts/05_export.py delete mode 100644 scripts/preprocess.py delete mode 100644 scripts/train.py diff --git a/README.md b/README.md index 8a84b0d..25b2b2a 100644 --- a/README.md +++ b/README.md @@ -483,8 +483,11 @@ new_s2v = Sense2Vec().from_disk("/path/to/sense2vec") ## 🚂 Training your own sense2vec vectors The [`/scripts`](/scripts) directory contains command line utilities for -preprocessing text and training your own vectors. To train your own sense2vec -vectors, you'll need the following: +preprocessing text and training your own vectors. + +### Requirements + +To train your own sense2vec vectors, you'll need the following: - A **very large** source of raw text (ideally more than you'd use for word2vec, since the senses make the vocabulary more sparse). We recommend at least 1 @@ -496,60 +499,28 @@ vectors, you'll need the following: you'll need to write your own. (The `doc.noun_chunks` and `doc.ents` are what sense2vec uses to determine what's a phrase.) -### script `preprocess.py` - -Preprocess a corpus for training a sense2vec model. It takes a text file with -one sentence per line, and outputs a text file with one sentence per line in the -expected sense2vec format (merged noun phrases, concatenated phrases with -underscores and added "senses"). - -```bash -python preprocess.py [in_file] [out_file] [spacy_model] [--n-process] -``` - -| Argument | Type | Description | -| ------------------- | ---------- | ------------------------------------------------------------------------------------ | -| `in_file` | positional | Path to input file. | -| `out_file` | positional | Path to output file. | -| `spacy_model` | positional | Name of [spaCy model](https://spacy.io/models) to use. Defaults to `en_core_web_sm`. | -| `--n-process`, `-n` | option | Number of processes (multiprocessing). Defaults to `1`. 
| - -#### Example input - -``` -Rats, mould and broken furniture: the scandal of the UK's refugee housing -``` - -#### Example output - -``` -Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT -the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN -``` - -### script `train.py` - -Train a sense2vec model using [Gensim](https://radimrehurek.com/gensim/). -Accepts a text file or a directory of text files in the format created by the -preprocessing script. Saves out a sense2vec model component that can be loaded -via `Sense2Vec.from_disk`. - -```bash -python train.py [input_data] [output_dir] [--n-workers] [--size] [--window] -[--min-count] [--negative] [--n-iter] [--verbose] -``` - -| Argument | Type | Description | -| ------------------- | ---------- | ------------------------------------------------------------------- | -| `input_data` | positional | Location of input directory or text file. | -| `output_dir` | positional | Location of output directory. Will be created if it doesn't exist. | -| `--n-workers`, `-n` | option | Number of workers. Defaults to `4`. | -| `--size`, `-s` | option | Dimension of the vectors. Defaults to `128`. | -| `--window`, `-w` | option | Context window size. Defaults to `5`. | -| `--min-count`, `-m` | option | The minimum frequency of the term to be included. Defaults to `10`. | -| `--negative`, `-g` | option | Number of negative examples for Word2Vec. Defaults to `5`. | -| `--n-iter`, `-i` | option | Number of iterations. | -| `--verbose`, `-V` | flag | Log debugging info. | +### Step-by-step process + +The training process is split up into several steps to allow you to resume at +any given point. Processing scripts are designed to operate on single files, +making it easy to paralellize the work. + +1. [`01_parse.py`](scripts/01_parse.py): Use spaCy to parse the raw text and + output binary collections of `Doc` objects (see + [DocBin](https://spacy.io/api/docbin)). +2. [`02_preprocess.py`](scripts/02_preprocess.py): Load a collection of parsed + `Doc` objects produced in the previous step and output text files in the + sense2vec format (one sentence per line and merged phrases with senses). +3. [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py): Use + [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and + counts. +4. [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py): Use + [GloVe](https://github.com/stanfordnlp/GloVe) to train vectors. +5. [`05_export.py`](scripts/05_export.py): Load the vectors and frequencies and + output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. + +For more detailed documentation of the scripts, check out the source or run them +with `--help`. For example, `python scripts/01_parse.py --help`. 
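To make the five steps concrete, here is a rough end-to-end sketch of driving them from Python — all paths, directory names and the GloVe build location are placeholders:

```python
# Assumes GloVe has been cloned and built (binaries in GloVe/build) and that
# corpus.txt contains one sentence per line.
import subprocess

steps = [
    ["python", "scripts/01_parse.py", "corpus.txt", "parsed", "en_core_web_sm"],
    ["python", "scripts/02_preprocess.py", "parsed/corpus.spacy", "s2v_text", "en_core_web_sm"],
    ["python", "scripts/03_glove_build_counts.py", "GloVe/build", "s2v_text", "counts"],
    ["python", "scripts/04_glove_train_vectors.py", "GloVe/build",
     "counts/cooccurrence.shuf.bin", "counts/vocab.txt", "vectors"],
    ["python", "scripts/05_export.py", "vectors/vectors.txt", "counts/vocab.txt", "s2v_component"],
]
for cmd in steps:
    subprocess.run(cmd, check=True)  # each step writes the input for the next
```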
## 🍳 Prodigy recipes diff --git a/scripts/01_parse.py b/scripts/01_parse.py new file mode 100644 index 0000000..4bbaed4 --- /dev/null +++ b/scripts/01_parse.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +import spacy +from spacy.tokens import DocBin +import plac +from wasabi import Printer +from pathlib import Path +import tqdm + + +@plac.annotations( + in_file=("Path to input file", "positional", None, str), + out_dir=("Path to output directory", "positional", None, str), + spacy_model=("Name of spaCy model to use", "positional", None, str), + n_process=("Number of processes (multiprocessing)", "option", "n", int), +) +def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): + """ + Step 1: Parse raw text with spaCy + + Expects an input file with one sentence per line and will output a .spacy + file of the parsed collection of Doc objects (DocBin). + """ + msg = Printer() + input_path = Path(in_file) + output_path = Path(out_dir) + if not input_path.exists(): + msg.fail("Can't find input file", in_file, exits=1) + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory {out_dir}") + nlp = spacy.load(spacy_model) + msg.info(f"Using spaCy model {spacy_model}") + doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]) + msg.text("Preprocessing text...") + with input_path.open("r", encoding="utf8") as texts: + docs = nlp.pipe(texts, n_process=n_process) + for doc in tqdm.tqdm(docs, desc="Docs", unit=""): + doc_bin.add(doc) + msg.good(f"Processed {len(doc_bin)} docs") + doc_bin_bytes = doc_bin.to_bytes() + output_file = output_path / f"{input_path.stem}.spacy" + with output_file.open("wb") as f: + f.write(doc_bin_bytes) + msg.good(f"Saved parsed docs to file", output_file.resolve()) + + +if __name__ == "__main__": + plac.call(main) diff --git a/scripts/02_preprocess.py b/scripts/02_preprocess.py new file mode 100644 index 0000000..b29f126 --- /dev/null +++ b/scripts/02_preprocess.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python +from sense2vec.util import merge_phrases, make_spacy_key +import spacy +from spacy.tokens import DocBin +import plac +from wasabi import Printer +from pathlib import Path +import tqdm + + +@plac.annotations( + in_file=("Path to input file", "positional", None, str), + out_dir=("Path to output directory", "positional", None, str), + spacy_model=("Name of spaCy model to use", "positional", None, str), + n_process=("Number of processes (multiprocessing)", "option", "n", int), +) +def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): + """ + Step 2: Preprocess text in sense2vec's format + + Expects a binary .spacy input file consisting of the parsed Docs (DocBin) + and outputs a text file with one sentence per line in the expected sense2vec + format (merged noun phrases, concatenated phrases with underscores and + added "senses"). 
+ + Example input: + Rats, mould and broken furniture: the scandal of the UK's refugee housing + + Example output: + Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT + the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN + """ + msg = Printer() + input_path = Path(in_file) + output_path = Path(out_dir) + if not input_path.exists(): + msg.fail("Can't find input file", in_file, exits=1) + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory {out_dir}") + nlp = spacy.load(spacy_model) + msg.info(f"Using spaCy model {spacy_model}") + with input_path.open("rb") as f: + doc_bin_bytes = f.read() + doc_bin = DocBin().from_bytes(doc_bin_bytes) + msg.good(f"Loaded {len(doc_bin)} parsed docs") + docs = doc_bin.get_docs(nlp.vocab) + output_file = output_path / f"{input_path.stem}.s2v" + lines_count = 0 + words_count = 0 + with output_file.open("w", encoding="utf8") as f: + for doc in tqdm.tqdm(docs, desc="Docs", unit=""): + doc = merge_phrases(doc) + words = [make_spacy_key(w, prefer_ents=True) for w in doc if not w.is_space] + f.write(" ".join(words) + "\n") + lines_count += 1 + words_count += len(words) + msg.good( + f"Successfully preprocessed {lines_count} docs ({words_count} words)", + output_file.resolve(), + ) + + +if __name__ == "__main__": + plac.call(main) diff --git a/scripts/03_glove_build_counts.py b/scripts/03_glove_build_counts.py new file mode 100644 index 0000000..f53bf3c --- /dev/null +++ b/scripts/03_glove_build_counts.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +import plac +import os +from pathlib import Path +from wasabi import Printer + + +@plac.annotations( + glove_dir=("Directory containing the GloVe build", "positional", None, str), + in_dir=("Directory with preprocessed .s2v files", "positional", None, str), + out_dir=("Path to output directory", "positional", None, str), + min_count=("Minimum count for inclusion in vocab", "option", "c", int), + memory=("Soft limit for memory consumption, in GB", "option", "m", float), + window_size=("Number of context words on either side", "option", "w", int), + verbose=("Set verbosity: 0, 1, or 2", "option", "v", int), +) +def main( + glove_dir, in_dir, out_dir, min_count=5, memory=4.0, window_size=15, verbose=2 +): + """ + Step 3: Build vocabulary and frequency counts + + Expects a directory of preprocessed .s2v input files and will use GloVe to + collect unigram counts and construct and shuffle cooccurrence data. See here + for installation instructions: https://github.com/stanfordnlp/GloVe + + Note that this script will call into GloVe in a subprocess and expects you + to pass in the GloVe build directory (/build if you run the Makefile). The + commands will also be printed if you want to run them separately. 
+    """
+    msg = Printer()
+    input_path = Path(in_dir)
+    output_path = Path(out_dir)
+    if not Path(glove_dir).exists():
+        msg.fail("Can't find GloVe build directory", glove_dir, exits=1)
+    if not input_path.exists() or not input_path.is_dir():
+        msg.fail("Not a valid input directory", in_dir, exits=1)
+    input_files = [str(fp) for fp in input_path.iterdir() if fp.suffix == ".s2v"]
+    if not input_files:
+        msg.fail("No .s2v files found in input directory", in_dir, exits=1)
+    msg.info(f"Using {len(input_files)} input files")
+    if not output_path.exists():
+        output_path.mkdir(parents=True)
+        msg.good(f"Created output directory {out_dir}")
+
+    vocab_file = output_path / f"vocab.txt"
+    cooc_file = output_path / f"cooccurrence.bin"
+    cooc_shuffle_file = output_path / f"cooccurrence.shuf.bin"
+
+    msg.info("Creating vocabulary counts")
+    cmd = (
+        f"cat {' '.join(input_files)} | {glove_dir}/vocab_count "
+        f"-min-count {min_count} -verbose {verbose} > {vocab_file}"
+    )
+    print(cmd)
+    vocab_cmd = os.system(cmd)
+    if vocab_cmd == 1 or not Path(vocab_file).exists():
+        msg.fail("Failed creating vocab counts", exits=1)
+    msg.good("Created vocab counts", vocab_file)
+
+    msg.info("Creating cooccurrence statistics")
+    cmd = (
+        f"cat {' '.join(input_files)} | {glove_dir}/cooccur -memory {memory} "
+        f"-vocab-file {vocab_file} -verbose {verbose} "
+        f"-window-size {window_size} > {cooc_file}"
+    )
+    print(cmd)
+    cooccur_cmd = os.system(cmd)
+    if cooccur_cmd == 1 or not Path(cooc_file).exists():
+        msg.fail("Failed creating cooccurrence statistics", exits=1)
+    msg.good("Created cooccurrence statistics", cooc_file)
+
+    msg.info("Shuffling cooccurrence file")
+    cmd = (
+        f"{glove_dir}/shuffle -memory {memory} -verbose {verbose} "
+        f"< {cooc_file} > {cooc_shuffle_file}"
+    )
+    print(cmd)
+    shuffle_cmd = os.system(cmd)
+    if shuffle_cmd == 1 or not Path(cooc_shuffle_file).exists():
+        msg.fail("Failed to shuffle cooccurrence file", exits=1)
+    msg.good("Shuffled cooccurrence file", cooc_shuffle_file)
+
+
+if __name__ == "__main__":
+    plac.call(main)
diff --git a/scripts/04_glove_train_vectors.py b/scripts/04_glove_train_vectors.py
new file mode 100644
index 0000000..d20d517
--- /dev/null
+++ b/scripts/04_glove_train_vectors.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python
+import plac
+import os
+from pathlib import Path
+from wasabi import Printer
+
+
+@plac.annotations(
+    glove_dir=("Directory containing the GloVe build", "positional", None, str),
+    in_file=("Input file (shuffled cooccurrences)", "positional", None, str),
+    vocab_file=("Vocabulary file", "positional", None, str),
+    out_dir=("Path to output directory", "positional", None, str),
+    n_threads=("Number of threads", "option", "t", int),
+    n_iter=("Number of iterations", "option", "n", int),
+    x_max=("Parameter specifying cutoff in weighting function", "option", "x", int),
+    vector_size=("Dimension of word vector representations", "option", "s", int),
+    verbose=("Set verbosity: 0, 1, or 2", "option", "v", int),
+)
+def main(
+    glove_dir,
+    in_file,
+    vocab_file,
+    out_dir,
+    n_threads=8,
+    n_iter=15,
+    x_max=10,
+    vector_size=128,
+    verbose=2,
+):
+    """
+    Step 4: Train the vectors
+
+    Expects a file containing the shuffled cooccurrences and a vocab file and
+    will output a plain-text vectors file.
+
+    Note that this script will call into GloVe in a subprocess and expects you
+    to pass in the GloVe build directory (/build if you run the Makefile). The
+    commands will also be printed if you want to run them separately.
+ """ + msg = Printer() + output_path = Path(out_dir) + if not Path(glove_dir).exists(): + msg.fail("Can't find GloVe build directory", glove_dir, exits=1) + if not Path(in_file).exists(): + msg.fail("Can't find input file", in_file, exits=1) + if not Path(vocab_file).exists(): + msg.fail("Can't find vocab file", vocab_file, exits=1) + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory {out_dir}") + output_file = output_path / "vectors" + msg.info("Training vectors") + cmd = ( + f"{glove_dir}/glove -save-file {output_file} -threads {n_threads} " + f"-input-file {in_file} -x-max {x_max} -iter {n_iter} " + f"-vector-size {vector_size} -binary 0 -vocab-file {vocab_file} " + f"-verbose {verbose}" + ) + print(cmd) + train_cmd = os.system(cmd) + if train_cmd == 1: + msg.fail("Failed training vectors", exits=1) + msg.good("Successfully trained vectors") + + +if __name__ == "__main__": + plac.call(main) diff --git a/scripts/05_export.py b/scripts/05_export.py new file mode 100644 index 0000000..93d41fc --- /dev/null +++ b/scripts/05_export.py @@ -0,0 +1,63 @@ +#!/usr/bin/env python +from sense2vec import Sense2Vec +from sense2vec.util import split_key +from pathlib import Path +import plac +from wasabi import Printer +import numpy + + +@plac.annotations( + in_file=("Vectors file", "positional", None, str), + vocab_file=("Vocabulary file", "positional", None, str), + out_dir=("Path to output directory", "positional", None, str), + vector_size=("Dimension of word vector representations", "option", "s", int), +) +def main(in_file, vocab_file, out_dir, vector_size=128): + """ + Step 5: Export a sense2vec component + + Expects a vectors.txt and a vocab file trained with GloVe and exports + a component that can be loaded with Sense2vec.from_disk. 
+ """ + msg = Printer() + input_path = Path(in_file) + vocab_path = Path(vocab_file) + output_path = Path(out_dir) + if not input_path.exists(): + msg.fail("Can't find input file", in_file, exits=1) + if not vocab_path.exists(): + msg.fail("Can't find vocab file", vocab_file, exits=1) + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory {out_dir}") + with input_path.open("r", encoding="utf8") as f: + vectors_data = f.readlines() + with vocab_path.open("r", encoding="utf8") as f: + vocab_data = f.readlines() + data = [] + all_senses = set() + for item in vectors_data: + item = item.rstrip().rsplit(" ", vector_size) + key = item[0] + if key == "": + continue + vec = item[1:] + if len(vec) != vector_size: + msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1) + all_senses.add(split_key(key)[1]) + data.append((key, numpy.asarray(vec, dtype=numpy.float32))) + s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses) + for key, vector in data: + s2v.add(key, vector) + for item in vocab_data: + key, freq = item.rstrip().rsplit(" ", 1) + s2v.set_freq(key, freq) + msg.good("Created the sense2vec model") + msg.info(f"{len(data)} vectors, {len(all_senses)} total senses") + s2v.to_disk(output_path) + msg.good("Saved model to directory", out_dir) + + +if __name__ == "__main__": + plac.call(main) diff --git a/scripts/preprocess.py b/scripts/preprocess.py deleted file mode 100644 index c6a9a90..0000000 --- a/scripts/preprocess.py +++ /dev/null @@ -1,62 +0,0 @@ -#!/usr/bin/env python -from sense2vec.util import merge_phrases, make_spacy_key -import spacy -from pathlib import Path -import plac -import tqdm -from wasabi import Printer - - -def represent_doc(doc): - strings = [] - for sent in doc.sents: - if sent.text.strip(): - words = " ".join( - make_spacy_key(w, prefer_ents=True) for w in sent if not w.is_space - ) - strings.append(words) - return "\n".join(strings) + "\n" if strings else "" - - -@plac.annotations( - in_file=("Path to input file", "positional", None, str), - out_file=("Path to output file", "positional", None, str), - spacy_model=("Name of spaCy model to use", "positional", None, str), - n_process=("Number of processes (multiprocessing)", "option", "n", int), -) -def main(in_file, out_file, spacy_model="en_core_web_sm", n_process=1): - """ - This script can be used to preprocess a corpus for training a sense2vec - model. It takes a text file with one sentence per line, and outputs a text - file with one sentence per line in the expected sense2vec format (merged - noun phrases, concatenated phrases with underscores and added "senses"). 
- - Example input: - Rats, mould and broken furniture: the scandal of the UK's refugee housing - - Example output: - Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT - the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN - """ - msg = Printer() - input_path = Path(in_file) - output_path = Path(out_file) - if not input_path.exists(): - msg.fail("Can't find input file", in_file, exits=1) - nlp = spacy.load(spacy_model) - msg.info(f"Using spaCy model {spacy_model}") - nlp.add_pipe(merge_phrases, name="merge_sense2vec_phrases") - lines_count = 0 - msg.text("Preprocessing text...") - with input_path.open("r", encoding="utf8") as texts: - docs = nlp.pipe(texts, n_process=n_process) - lines = (represent_doc(doc) for doc in docs) - with output_path.open("w", encoding="utf8") as f: - for line in tqdm.tqdm(lines, desc="Lines", unit=""): - lines_count += 1 - f.write(line) - msg.good(f"Successfully preprocessed {lines_count} lines", output_path.resolve()) - - -if __name__ == "__main__": - plac.call(main) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index b3b680b..8977703 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,4 +1,3 @@ plac>=0.9.6,<1.2.0 tqdm>=4.36.1,<5.0.0 -gensim>=3.8.1,<4.0.0 wasabi>=0.2.0,<1.1.0 diff --git a/scripts/train.py b/scripts/train.py deleted file mode 100644 index e36abe9..0000000 --- a/scripts/train.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python -from gensim.models import Word2Vec -from gensim.models.word2vec import PathLineSentences -from sense2vec import Sense2Vec -from sense2vec.util import split_key -from pathlib import Path -import plac -import logging -from wasabi import Printer - - -@plac.annotations( - input_data=("Location of input directory or text file", "positional", None, str), - output_dir=("Location of output directory", "positional", None, str), - n_workers=("Number of workers", "option", "n", int), - size=("Dimension of the vectors", "option", "s", int), - window=("Context window size", "option", "w", int), - min_count=("Minimum frequency of terms to be included", "option", "m", int), - negative=("Number of negative examples for Word2Vec", "option", "g", int), - n_iter=("Number of iterations", "option", "i", int), - verbose=("Log debugging info", "flag", "V", bool), -) -def main( - input_data, - output_dir, - negative=5, - n_workers=4, - window=5, - size=128, - min_count=10, - n_iter=2, - verbose=False, -): - """Train a sense2vec model using Gensim. Accepts a text file or a directory - of text files in the format created by the preprocessing script. Saves out - a sense2vec model component that can be loaded via Sense2Vec.from_disk. 
- """ - msg = Printer(hide_animation=verbose) - if not Path(input_data).exists(): - msg.fail("Can't find input data (file or directory)", input_data, exits=1) - if verbose: - logging.basicConfig( - format="%(asctime)s - %(message)s", datefmt="%H:%M:%S", level=logging.INFO - ) - w2v_model = Word2Vec( - size=size, - window=window, - min_count=min_count, - workers=n_workers, - sample=1e-5, - negative=negative, - iter=n_iter, - ) - sentences = PathLineSentences(input_data) - msg.info(f"Using input data from {len(sentences.input_files)} file(s)") - with msg.loading("Building the vocabulary..."): - w2v_model.build_vocab(sentences) - msg.good("Built the vocabulary") - with msg.loading("Training the model..."): - w2v_model.train( - sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.iter - ) - msg.good("Trained the model") - vectors = [] - all_senses = set() - with msg.loading("Creating the sense2vec model..."): - for string in w2v_model.wv.vocab: - vocab = w2v_model.wv.vocab[string] - freq, idx = vocab.count, vocab.index - if freq < min_count: - continue - vector = w2v_model.wv.vectors[idx] - vectors.append((string, freq, vector)) - _, sense = split_key(string) - all_senses.add(sense) - s2v = Sense2Vec(shape=(len(vectors), size), senses=all_senses) - for string, freq, vector in vectors: - s2v.add(string, vector, freq) - msg.good("Created the sense2vec model") - msg.info(f"{len(vectors)} vectors, {len(all_senses)} total senses") - with msg.loading("Saving the model..."): - output_path = Path(output_dir) - if not output_path.exists(): - output_path.mkdir(parents=True) - s2v.to_disk(output_path) - msg.good("Saved model to directory", output_dir) - - -if __name__ == "__main__": - plac.call(main) From 2c583de628fcc32adf13be1a5c0b417b0395e27d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 21:47:09 +0100 Subject: [PATCH 127/297] Update README.md [ci skip] --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 25b2b2a..5d33b6f 100644 --- a/README.md +++ b/README.md @@ -503,11 +503,13 @@ To train your own sense2vec vectors, you'll need the following: The training process is split up into several steps to allow you to resume at any given point. Processing scripts are designed to operate on single files, -making it easy to paralellize the work. +making it easy to paralellize the work. The scripts in this repo require +[Glove](https://github.com/stanfordnlp/GloVe), which you need to clone and +`make`. 1. [`01_parse.py`](scripts/01_parse.py): Use spaCy to parse the raw text and output binary collections of `Doc` objects (see - [DocBin](https://spacy.io/api/docbin)). + [`DocBin`](https://spacy.io/api/docbin)). 2. [`02_preprocess.py`](scripts/02_preprocess.py): Load a collection of parsed `Doc` objects produced in the previous step and output text files in the sense2vec format (one sentence per line and merged phrases with senses). From 257efd48992ad959d6ed9f8fbb593a726cad94e5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 31 Oct 2019 21:53:47 +0100 Subject: [PATCH 128/297] Update README.md [ci skip] --- README.md | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 5d33b6f..e017656 100644 --- a/README.md +++ b/README.md @@ -498,6 +498,8 @@ To train your own sense2vec vectors, you'll need the following: [syntax iterator for noun phrases](https://spacy.io/usage/adding-languages#syntax-iterators), you'll need to write your own. 
(The `doc.noun_chunks` and `doc.ents` are what sense2vec uses to determine what's a phrase.) +- [GloVe](https://github.com/stanfordnlp/GloVe) installed and built. You should + be able to clone the repo and run `make` in the directory. ### Step-by-step process @@ -507,19 +509,13 @@ making it easy to paralellize the work. The scripts in this repo require [Glove](https://github.com/stanfordnlp/GloVe), which you need to clone and `make`. -1. [`01_parse.py`](scripts/01_parse.py): Use spaCy to parse the raw text and - output binary collections of `Doc` objects (see - [`DocBin`](https://spacy.io/api/docbin)). -2. [`02_preprocess.py`](scripts/02_preprocess.py): Load a collection of parsed - `Doc` objects produced in the previous step and output text files in the - sense2vec format (one sentence per line and merged phrases with senses). -3. [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py): Use - [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and - counts. -4. [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py): Use - [GloVe](https://github.com/stanfordnlp/GloVe) to train vectors. -5. [`05_export.py`](scripts/05_export.py): Load the vectors and frequencies and - output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. +| | Script | Description | +| ------ | ---------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **1.** | [`01_parse.py`](scripts/01_parse.py) | Use spaCy to parse the raw text and output binary collections of `Doc` objects (see [`DocBin`](https://spacy.io/api/docbin)). | +| **2.** | [`02_preprocess.py`](scripts/02_preprocess.py) | Load a collection of parsed `Doc` objects produced in the previous step and output text files in the sense2vec format (one sentence per line and merged phrases with senses). | +| **3.** | [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and counts. | +| **4.** | [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to train vectors. | +| **5.** | [`05_export.py`](scripts/05_export.py) | Load the vectors and frequencies and output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. | For more detailed documentation of the scripts, check out the source or run them with `--help`. For example, `python scripts/01_parse.py --help`. From d688d679f0e571b6add5594ea34662e34f8cafb0 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 31 Oct 2019 22:36:40 +0100 Subject: [PATCH 129/297] Draft evaluate recipe --- sense2vec/prodigy_recipes.py | 71 ++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index dc79111..5171f74 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -11,6 +11,77 @@ {{sense}} """ +@prodigy.recipe( + "sense2vec.evaluate", + dataset=("Dataset to save annotations to", "positional", None, str), + model=("Name or path of sense2vec model", "positional", None, str), + tasks=("File with similarity triples to ask about. 
If none, questions will be generated.", "positional", None) +) +def evaluate( + dataset, + model, + tasks=None, +): + """Evaluate a word vectors model by asking providing questions triples: + is word A more similar to word B, or to word C? If the human mostly agrees + with the model, the vectors model is good. + """ + nlp = spacy.load(model) + + def get_stream(s2v): + keys = list(s2v.vectors.keys()) + while True: + a, b, c = random.sample(keys, 3) + sim_ab = self.vectors.similarity(a, b) + sim_ac = self.vectors.similarity(a, c) + sim_bc = self.vectors.similarity(b, c) + wordA = s2v.strings[a] + wordB = s2v.strings[b] + wordC = s2v.strings[c] + confidence = 1. - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac)) + + if sim_ab > sim_ac: + mapping = {"agree": accept, "disagree": reject} + else: + mapping = {"disagree": accept, "agree": reject} + + task = { + "input": { "text": s2v.strings[a] }, + "accept": { "text": s2v.strings[wordB] }, + "reject": { "text": s2v.strings[wordC] }, + "mapping": mapping, + "similarities": [ + {"pair": [wordA, wordB], "score": sim_ab}, + {"pair": [wordA, wordC], "score": sim_ac}, + {"pair": [wordB, wordC], "score": sim_bc} + ], + "confidence": confidence + } + task = set_hashes(task) + yield task + + def update(answers): + """Updates accept_keys so that the stream can find new phrases.""" + log(f"RECIPE: Updating with {len(answers)} answers") + loss = 0. + for eg in answers: + human = eg["answer"] + if eg["answer"] in ("accept", "reject"): + if eg["mapping"][eg["answer"]] == "agree": + right += 1 + else: + wrong += 1 + loss += eg["confidence"] + return { + "view_id": "compare", + "dataset": dataset, + "stream": stream, + "update": update, + "config": {"batch_size": batch_size, "html_template": HTML_TEMPLATE}, + } + + + @prodigy.recipe( "sense2vec.teach", From 5078aa2ca019484b95584d448093d48a48a2f0fe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:43:26 +0100 Subject: [PATCH 130/297] Adjust wording --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index e017656..533baf9 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,11 @@ sense2vec [Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn -more interesting, detailed and context-sensitive word vectors. For an -interactive example of the technology, see our -[sense2vec demo](https://demos.explosion.ai/sense2vec) that lets you explore -semantic similarities across all Reddit comments of 2015. This library is a -simple Python implementation for loading and querying sense2vec models. +more interesting and detailed word vectors. For an interactive example of the +technology, see our [sense2vec demo](https://demos.explosion.ai/sense2vec) that +lets you explore semantic similarities across all Reddit comments of 2015. This +library is a simple Python implementation for loading and querying sense2vec +models. 🦆 **Version 1.0 alpha out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) @@ -20,8 +20,8 @@ simple Python implementation for loading and querying sense2vec models. ## ✨ Features -- Query **context-sensitive vectors** for **multi-word phrases** based on - part-of-speech tags and entity labels. +- Query **vectors for multi-word phrases** based on part-of-speech tags and + entity labels. - spaCy **pipeline component** and **extension attributes**. 
- Fully **serializable** so you can easily ship your sense2vec vectors with
  your spaCy model packages.

From 71eed8baeb3bc60bd9cd3a0cb4554f026ee94cff Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 1 Nov 2019 00:43:42 +0100
Subject: [PATCH 131/297] Add Sense2Vec.frequencies property

---
 README.md              | 13 +++++++++++++
 sense2vec/sense2vec.py |  6 ++++++
 2 files changed, 19 insertions(+)

diff --git a/README.md b/README.md
index 533baf9..1bbbe9b 100644
--- a/README.md
+++ b/README.md
@@ -356,6 +356,19 @@ s2v = Sense2Vec(senses=["VERB", "NOUN"])
 assert "VERB" in s2v.senses
 ```

+### property `Sense2vec.frequencies`
+
+The frequencies of the keys in the table, in descending order.
+
+| Argument    | Type | Description                                         |
+| ----------- | ---- | --------------------------------------------------- |
+| **RETURNS** | list | The `(key, freq)` tuples by frequency, descending.  |
+
+```python
+most_frequent = s2v.frequencies[:10]
+key, score = s2v.frequencies[0]
+```
+
 ### method `Sense2vec.similarity`

 Make a semantic similarity estimate of two keys or two sets of keys. The default
diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py
index 88753d0..e327be5 100644
--- a/sense2vec/sense2vec.py
+++ b/sense2vec/sense2vec.py
@@ -44,6 +44,12 @@ def senses(self) -> List[str]:
         """RETURNS (list): The available senses."""
         return self.cfg.get("senses", [])

+    @property
+    def frequencies(self) -> List[Tuple[str, int]]:
+        """RETURNS (list): The (key, freq) tuples by frequency, descending."""
+        freqs = [(self.strings[k], s) for k, s in self.freqs.items() if s is not None]
+        return sorted(freqs, key=lambda item: item[1], reverse=True)
+
     def __len__(self) -> int:
         """RETURNS (int): The number of rows in the vectors table."""
         return len(self.vectors)

From 59a91157a4960a46305aa6d98c99c59bf269d155 Mon Sep 17 00:00:00 2001
From: Ines Montani
Date: Fri, 1 Nov 2019 00:43:53 +0100
Subject: [PATCH 132/297] Improve error handling in split_key

---
 sense2vec/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sense2vec/util.py b/sense2vec/util.py
index 8413163..309db3f 100644
--- a/sense2vec/util.py
+++ b/sense2vec/util.py
@@ -29,7 +29,7 @@ def split_key(key: str) -> Tuple[str, str]:

     key (unicode): The key to split.
     RETURNS (tuple): The split (word, sense) tuple.
""" - if "|" not in key: + if not isinstance(key, str) or "|" not in key: raise ValueError(f"Invalid key: {key}") word, sense = key.replace("_", " ").rsplit("|", 1) return word, sense From da86c5db3b73bdb81b32dded2fbc274bf8b59731 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:44:06 +0100 Subject: [PATCH 133/297] Update entry points --- setup.cfg | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index daaa5dc..e314b49 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,8 +36,9 @@ install_requires = spacy_factories = sense2vec = sense2vec:Sense2VecComponent.from_nlp prodigy_recipes = - sense2vec_teach = sense2vec:prodigy_recipes.teach - sens2vec_to_patterns = sense2vec:prodigy_recipes.to_patterns + sense2vec.teach = sense2vec:prodigy_recipes.teach + sens2vec.to_patterns = sense2vec:prodigy_recipes.to_patterns + sens2vec.evaluate = sense2vec:prodigy_recipes.evaluate [bdist_wheel] universal = true From 52a559fc90d952cc3320a5483153ee3e38c0212f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:44:13 +0100 Subject: [PATCH 134/297] Fix entry point loading --- sense2vec/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index e62af15..31ac772 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -2,4 +2,10 @@ from .component import Sense2VecComponent # noqa: F401 from .util import importlib_metadata +try: + # This needs to be imported in order for the entry points to be loaded + from . import prodigy_recipes # noqa: F401 +except ImportError: + pass + __version__ = importlib_metadata.version(__name__) From 97c9e71244d9d9e6eb40152e32bbbe08c1fdf489 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:45:01 +0100 Subject: [PATCH 135/297] Fix entry points --- setup.cfg | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index e314b49..e88fcca 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,7 +38,6 @@ spacy_factories = prodigy_recipes = sense2vec.teach = sense2vec:prodigy_recipes.teach sens2vec.to_patterns = sense2vec:prodigy_recipes.to_patterns - sens2vec.evaluate = sense2vec:prodigy_recipes.evaluate [bdist_wheel] universal = true From d8b632b0ea6eb0c0fb1a1e6f63308d62529bd409 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:45:35 +0100 Subject: [PATCH 136/297] Update entry points --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index e88fcca..e314b49 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,6 +38,7 @@ spacy_factories = prodigy_recipes = sense2vec.teach = sense2vec:prodigy_recipes.teach sens2vec.to_patterns = sense2vec:prodigy_recipes.to_patterns + sens2vec.evaluate = sense2vec:prodigy_recipes.evaluate [bdist_wheel] universal = true From 8dcc3c33a46d9b9f309e3d87e874a356ab18ad0a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:45:42 +0100 Subject: [PATCH 137/297] Update recipe --- sense2vec/prodigy_recipes.py | 176 +++++++++++++++++++++-------------- 1 file changed, 105 insertions(+), 71 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 5171f74..4cdcf32 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -4,6 +4,10 @@ from sense2vec import Sense2Vec import srsly import spacy +import random +from wasabi import Printer +from collections import defaultdict +import copy HTML_TEMPLATE = """ @@ -11,77 +15,6 @@ {{sense}} """ -@prodigy.recipe( - "sense2vec.evaluate", - 
dataset=("Dataset to save annotations to", "positional", None, str), - model=("Name or path of sense2vec model", "positional", None, str), - tasks=("File with similarity triples to ask about. If none, questions will be generated.", "positional", None) -) -def evaluate( - dataset, - model, - tasks=None, -): - """Evaluate a word vectors model by asking providing questions triples: - is word A more similar to word B, or to word C? If the human mostly agrees - with the model, the vectors model is good. - """ - nlp = spacy.load(model) - - def get_stream(s2v): - keys = list(s2v.vectors.keys()) - while True: - a, b, c = random.sample(keys, 3) - sim_ab = self.vectors.similarity(a, b) - sim_ac = self.vectors.similarity(a, c) - sim_bc = self.vectors.similarity(b, c) - wordA = s2v.strings[a] - wordB = s2v.strings[b] - wordC = s2v.strings[c] - confidence = 1. - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac)) - - if sim_ab > sim_ac: - mapping = {"agree": accept, "disagree": reject} - else: - mapping = {"disagree": accept, "agree": reject} - - task = { - "input": { "text": s2v.strings[a] }, - "accept": { "text": s2v.strings[wordB] }, - "reject": { "text": s2v.strings[wordC] }, - "mapping": mapping, - "similarities": [ - {"pair": [wordA, wordB], "score": sim_ab}, - {"pair": [wordA, wordC], "score": sim_ac}, - {"pair": [wordB, wordC], "score": sim_bc} - ], - "confidence": confidence - } - task = set_hashes(task) - yield task - - def update(answers): - """Updates accept_keys so that the stream can find new phrases.""" - log(f"RECIPE: Updating with {len(answers)} answers") - loss = 0. - for eg in answers: - human = eg["answer"] - if eg["answer"] in ("accept", "reject"): - if eg["mapping"][eg["answer"]] == "agree": - right += 1 - else: - wrong += 1 - loss += eg["confidence"] - return { - "view_id": "compare", - "dataset": dataset, - "stream": stream, - "update": update, - "config": {"batch_size": batch_size, "html_template": HTML_TEMPLATE}, - } - - - @prodigy.recipe( "sense2vec.teach", @@ -221,3 +154,104 @@ def to_patterns( if not dry: srsly.write_jsonl(output_file, patterns) return patterns + + +@prodigy.recipe( + "sense2vec.evaluate", + dataset=("Dataset to save annotations to", "positional", None, str), + vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), + senses=("The senses to use (all if not set)", "option", "s", split_string), + n_freq=("Number of most frequent entries to limit to", "option", "f", int), + threshold=("Similarity threshold to consider examples", "option", "t", float), + eval_whole=("Evaluate whole dataset instead of session", "flag", "E", bool), +) +def evaluate( + dataset, vectors_path, senses=None, n_freq=100_000, threshold=0.7, eval_whole=False +): + """Evaluate a word vectors model by asking providing questions triples: + is word A more similar to word B, or to word C? If the human mostly agrees + with the model, the vectors model is good. 
+ """ + random.seed(0) + log("RECIPE: Starting recipe sense2vec.evaluate", locals()) + s2v = Sense2Vec().from_disk(vectors_path) + log("RECIPE: Loaded sense2vec vectors", vectors_path) + + def get_stream(): + html = "{} {}" + # Limit to most frequent entries + keys = [key for key, _ in s2v.frequencies[:n_freq]] + keys_by_sense = defaultdict(set) + for key in keys: + sense = s2v.split_key(key)[1] + if senses is None or sense in senses: + keys_by_sense[sense].add(key) + keys_by_sense = {s: keys for s, keys in keys_by_sense.items() if len(keys) >= 3} + all_senses = list(keys_by_sense.keys()) + total_keys = sum(len(keys) for keys in keys_by_sense.values()) + log(f"RECIPE: Using {total_keys} entries for {len(all_senses)} senses") + while True: + current_keys = copy.deepcopy(keys_by_sense) + while any(len(values) >= 3 for values in current_keys.values()): + sense = random.choice(all_senses) + key_a, key_b, key_c = random.sample(current_keys[sense], 3) + if len(set([key_a.lower(), key_b.lower(), key_c.lower()])) != 3: + continue + sim_ab = s2v.similarity(key_a, key_b) + sim_ac = s2v.similarity(key_a, key_c) + if sim_ab < threshold or sim_ac < threshold: + continue + current_keys[sense].remove(key_a) + current_keys[sense].remove(key_b) + current_keys[sense].remove(key_c) + confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac)) + task = { + "label": "Which one is more similar?", + "html": html.format(*s2v.split_key(key_a)), + "key": key_a, + "options": [ + { + "id": key_b, + "html": html.format(*s2v.split_key(key_b)), + "score": sim_ab, + }, + { + "id": key_c, + "html": html.format(*s2v.split_key(key_c)), + "score": sim_ac, + }, + ], + "confidence": confidence, + } + yield task + + def on_exit(ctrl): + """Output summary about user agreement with the model.""" + msg = Printer() + set_id = dataset if eval_whole else ctrl.session_id + data = ctrl.db.get_dataset(set_id) + data = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] + if not data: + msg.warn("No annotations collected", exits=1) + agree_count = 0 + for eg in data: + choice = eg["accept"][0] + score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] + score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] + if score_choice > score_other: + agree_count += 1 + pc = agree_count / len(data) + text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" + msg.info(f"Evaluating data from '{set_id}'") + if pc > 0.5: + msg.good(text) + else: + msg.fail(text) + + return { + "view_id": "choice", + "dataset": dataset, + "stream": get_stream(), + "on_exit": on_exit, + "config": {"choice_style": "single", "choice_auto_accept": True}, + } From 85158f4016593ecbc050733ca955c28121209025 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:52:41 +0100 Subject: [PATCH 138/297] Add eval_only option --- sense2vec/prodigy_recipes.py | 57 +++++++++++++++++++++++------------- 1 file changed, 36 insertions(+), 21 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 4cdcf32..13092dc 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -164,9 +164,16 @@ def to_patterns( n_freq=("Number of most frequent entries to limit to", "option", "f", int), threshold=("Similarity threshold to consider examples", "option", "t", float), eval_whole=("Evaluate whole dataset instead of session", "flag", "E", bool), + eval_only=("Don't annotate, only evaluate current set", "flag", "O", bool), ) def evaluate( - dataset, vectors_path, 
senses=None, n_freq=100_000, threshold=0.7, eval_whole=False + dataset, + vectors_path, + senses=None, + n_freq=100_000, + threshold=0.7, + eval_whole=False, + eval_only=False, ): """Evaluate a word vectors model by asking providing questions triples: is word A more similar to word B, or to word C? If the human mostly agrees @@ -177,6 +184,33 @@ def evaluate( s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) + def eval_dataset(set_id): + """Output summary about user agreement with the model.""" + msg = Printer() + db = connect() + data = db.get_dataset(set_id) + data = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] + if not data: + msg.warn("No annotations collected", exits=1) + agree_count = 0 + for eg in data: + choice = eg["accept"][0] + score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] + score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] + if score_choice > score_other: + agree_count += 1 + pc = agree_count / len(data) + text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" + msg.info(f"Evaluating data from '{set_id}'") + if pc > 0.5: + msg.good(text) + else: + msg.fail(text) + + if eval_only: + eval_dataset(dataset) + return None + def get_stream(): html = "{} {}" # Limit to most frequent entries @@ -226,27 +260,8 @@ def get_stream(): yield task def on_exit(ctrl): - """Output summary about user agreement with the model.""" - msg = Printer() set_id = dataset if eval_whole else ctrl.session_id - data = ctrl.db.get_dataset(set_id) - data = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] - if not data: - msg.warn("No annotations collected", exits=1) - agree_count = 0 - for eg in data: - choice = eg["accept"][0] - score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] - score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] - if score_choice > score_other: - agree_count += 1 - pc = agree_count / len(data) - text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" - msg.info(f"Evaluating data from '{set_id}'") - if pc > 0.5: - msg.good(text) - else: - msg.fail(text) + eval_dataset(set_id) return { "view_id": "choice", From 973ded35f8b1dcb725fe62df89769251e0b4b493 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 00:57:34 +0100 Subject: [PATCH 139/297] Add stats on high confidence scores --- sense2vec/prodigy_recipes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 13092dc..fc73429 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -193,12 +193,15 @@ def eval_dataset(set_id): if not data: msg.warn("No annotations collected", exits=1) agree_count = 0 + disagree_high_conf = 0 for eg in data: choice = eg["accept"][0] score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] if score_choice > score_other: agree_count += 1 + elif eg["confidence"] > 0.8: + disagree_high_conf += 1 pc = agree_count / len(data) text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" msg.info(f"Evaluating data from '{set_id}'") @@ -206,6 +209,7 @@ def eval_dataset(set_id): msg.good(text) else: msg.fail(text) + msg.text(f"You disagreed on {disagree_high_conf} high confidence scores") if eval_only: eval_dataset(dataset) From 47550b0c95ed8a26d4251701c8755aaf48e52c29 Mon Sep 17 00:00:00 2001 From: Ines Montani 
Date: Fri, 1 Nov 2019 01:09:33 +0100 Subject: [PATCH 140/297] Add show_scores option for debugging --- sense2vec/prodigy_recipes.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index fc73429..f2316fa 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -165,6 +165,7 @@ def to_patterns( threshold=("Similarity threshold to consider examples", "option", "t", float), eval_whole=("Evaluate whole dataset instead of session", "flag", "E", bool), eval_only=("Don't annotate, only evaluate current set", "flag", "O", bool), + show_scores=("Show all scores for debugging", "flag", "S", bool), ) def evaluate( dataset, @@ -174,6 +175,7 @@ def evaluate( threshold=0.7, eval_whole=False, eval_only=False, + show_scores=False, ): """Evaluate a word vectors model by asking providing questions triples: is word A more similar to word B, or to word C? If the human mostly agrees @@ -215,8 +217,13 @@ def eval_dataset(set_id): eval_dataset(dataset) return None + def get_html(word, sense, score=None): + html = f"{word} {sense}" + if show_scores and score: + html += f" {score:.4}" + return html + def get_stream(): - html = "{} {}" # Limit to most frequent entries keys = [key for key, _ in s2v.frequencies[:n_freq]] keys_by_sense = defaultdict(set) @@ -245,22 +252,24 @@ def get_stream(): confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac)) task = { "label": "Which one is more similar?", - "html": html.format(*s2v.split_key(key_a)), + "html": get_html(*s2v.split_key(key_a)), "key": key_a, "options": [ { "id": key_b, - "html": html.format(*s2v.split_key(key_b)), + "html": get_html(*s2v.split_key(key_b), sim_ab), "score": sim_ab, }, { "id": key_c, - "html": html.format(*s2v.split_key(key_c)), + "html": get_html(*s2v.split_key(key_c), sim_ac), "score": sim_ac, }, ], "confidence": confidence, } + if show_scores: + task["meta"] = {"confidence": f"{confidence:.4}"} yield task def on_exit(ctrl): From 5199153184f0264fc8563c0438cbfa10e618d435 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 01:43:11 +0100 Subject: [PATCH 141/297] Use more representative hash --- sense2vec/prodigy_recipes.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index f2316fa..b2e8b28 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -1,6 +1,7 @@ import prodigy from prodigy.components.db import connect from prodigy.util import log, split_string, set_hashes, TASK_HASH_ATTR +import murmurhash from sense2vec import Sense2Vec import srsly import spacy @@ -250,6 +251,8 @@ def get_stream(): current_keys[sense].remove(key_b) current_keys[sense].remove(key_c) confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac)) + # Get a more representative hash + task_hash = murmurhash.hash(" ".join([key_a] + sorted([key_b, key_c]))) task = { "label": "Which one is more similar?", "html": get_html(*s2v.split_key(key_a)), @@ -267,6 +270,7 @@ def get_stream(): }, ], "confidence": confidence, + TASK_HASH_ATTR: task_hash, } if show_scores: task["meta"] = {"confidence": f"{confidence:.4}"} From 79004f5ef95bc04ac6373e916c9f4ddd873769ce Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 02:35:49 +0100 Subject: [PATCH 142/297] Add most_similar strategy --- sense2vec/prodigy_recipes.py | 45 +++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 11 deletions(-) diff --git a/sense2vec/prodigy_recipes.py 
b/sense2vec/prodigy_recipes.py index b2e8b28..43a01f8 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -161,6 +161,7 @@ def to_patterns( "sense2vec.evaluate", dataset=("Dataset to save annotations to", "positional", None, str), vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), + strategy=("Example selection strategy", "option", "st", str,), senses=("The senses to use (all if not set)", "option", "s", split_string), n_freq=("Number of most frequent entries to limit to", "option", "f", int), threshold=("Similarity threshold to consider examples", "option", "t", float), @@ -171,6 +172,7 @@ def to_patterns( def evaluate( dataset, vectors_path, + strategy="most_similar", senses=None, n_freq=100_000, threshold=0.7, @@ -182,30 +184,35 @@ def evaluate( is word A more similar to word B, or to word C? If the human mostly agrees with the model, the vectors model is good. """ + msg = Printer() random.seed(0) log("RECIPE: Starting recipe sense2vec.evaluate", locals()) + strategies = ["random", "most_similar"] + if strategy not in strategies: + msg.fail(f"Invalid strategy '{strategy}'. Expected: {strategies}", exits=1) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) def eval_dataset(set_id): """Output summary about user agreement with the model.""" - msg = Printer() db = connect() data = db.get_dataset(set_id) - data = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] - if not data: + accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] + rejected = [eg for eg in data if eg["answer"] == "reject"] + if not accepted and not rejected: msg.warn("No annotations collected", exits=1) + high_conf = 0.8 agree_count = 0 - disagree_high_conf = 0 - for eg in data: + disagree_high_conf = len([e for e in rejected if e["confidence"] > high_conf]) + for eg in accepted: choice = eg["accept"][0] score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] if score_choice > score_other: agree_count += 1 - elif eg["confidence"] > 0.8: + elif eg["confidence"] > high_conf: disagree_high_conf += 1 - pc = agree_count / len(data) + pc = agree_count / (len(accepted) + len(rejected)) text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" msg.info(f"Evaluating data from '{set_id}'") if pc > 0.5: @@ -213,6 +220,7 @@ def eval_dataset(set_id): else: msg.fail(text) msg.text(f"You disagreed on {disagree_high_conf} high confidence scores") + msg.text(f"You rejected {len(rejected)} suggestions as not similar") if eval_only: eval_dataset(dataset) @@ -240,11 +248,23 @@ def get_stream(): current_keys = copy.deepcopy(keys_by_sense) while any(len(values) >= 3 for values in current_keys.values()): sense = random.choice(all_senses) - key_a, key_b, key_c = random.sample(current_keys[sense], 3) + if strategy == "most_similar": + key_a = random.choice(list(current_keys[sense])) + most_similar = s2v.most_similar(key_a, n=100) + options = [] + for key, score in most_similar: + if key in current_keys[sense]: + options.append((key, score)) + if len(options) < 2: + continue + key_b, sim_ab = options[round(len(options) / 2)] + key_c, sim_ac = options[-1] + else: + key_a, key_b, key_c = random.sample(current_keys[sense], 3) + sim_ab = s2v.similarity(key_a, key_b) + sim_ac = s2v.similarity(key_a, key_c) if len(set([key_a.lower(), key_b.lower(), key_c.lower()])) != 3: continue - sim_ab = 
s2v.similarity(key_a, key_b) - sim_ac = s2v.similarity(key_a, key_c) if sim_ab < threshold or sim_ac < threshold: continue current_keys[sense].remove(key_a) @@ -273,7 +293,10 @@ def get_stream(): TASK_HASH_ATTR: task_hash, } if show_scores: - task["meta"] = {"confidence": f"{confidence:.4}"} + task["meta"] = { + "confidence": f"{confidence:.4}", + "strategy": strategy, + } yield task def on_exit(ctrl): From e9b87b7618a12122d7ba4a25d637b7e5003460a5 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 13:55:02 +0100 Subject: [PATCH 143/297] Adjust HTML --- sense2vec/prodigy_recipes.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 43a01f8..5a689fb 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -11,12 +11,6 @@ import copy -HTML_TEMPLATE = """ -{{word}} -{{sense}} -""" - - @prodigy.recipe( "sense2vec.teach", dataset=("Dataset to save annotations to", "positional", None, str), @@ -46,6 +40,7 @@ def teach( log("RECIPE: Starting recipe sense2vec.teach", locals()) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) + html_template = "{{word}}{{sense}}" accept_keys = [] seen = set(accept_keys) seed_tasks = [] @@ -115,7 +110,7 @@ def get_stream(): "dataset": dataset, "stream": stream, "update": update, - "config": {"batch_size": batch_size, "html_template": HTML_TEMPLATE}, + "config": {"batch_size": batch_size, "html_template": html_template}, } @@ -226,9 +221,11 @@ def eval_dataset(set_id): eval_dataset(dataset) return None - def get_html(word, sense, score=None): - html = f"{word} {sense}" - if show_scores and score: + def get_html(word, sense, score=None, large=False): + html_word = f"{word}" + html_sense = f"{sense}" + html = f"{html_word} {html_sense}" + if show_scores and score is not None: html += f" {score:.4}" return html @@ -275,7 +272,7 @@ def get_stream(): task_hash = murmurhash.hash(" ".join([key_a] + sorted([key_b, key_c]))) task = { "label": "Which one is more similar?", - "html": get_html(*s2v.split_key(key_a)), + "html": get_html(*s2v.split_key(key_a), large=True), "key": key_a, "options": [ { From a41c04d152c6febb92182c8cf9a74e75f0a8e2b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 13:55:44 +0100 Subject: [PATCH 144/297] Increment version --- README.md | 10 +++++----- setup.cfg | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 1bbbe9b..dd8cda5 100644 --- a/README.md +++ b/README.md @@ -78,13 +78,13 @@ most_similar = doc[3:6]._.s2v_most_similar(3) sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a2 +pip install sense2vec==1.0.0a3 ``` -The Reddit vectors model is attached to the -[latest release](https://github.com/explosion/sense2vec/releases). To load it -in, download the `.tar.gz` archive, unpack it and point `from_disk` to the -extracted data directory: +The Reddit vectors model is attached to +[this release](https://github.com/explosion/sense2vec/releases/tag/v1.0.0a2). 
To +load it in, download the `.tar.gz` archive, unpack it and point `from_disk` to +the extracted data directory: ```python from sense2vec import Sense2Vec diff --git a/setup.cfg b/setup.cfg index e88fcca..f6630ef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a2 +version = 1.0.0a3 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From 5fb5906d0ea27024350f31ad5945123c683b1c0f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 14:09:16 +0100 Subject: [PATCH 145/297] Tidy up --- sense2vec/prodigy_recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 5a689fb..909a0a2 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -254,7 +254,7 @@ def get_stream(): options.append((key, score)) if len(options) < 2: continue - key_b, sim_ab = options[round(len(options) / 2)] + key_b, sim_ab = options[len(options) // 2] key_c, sim_ac = options[-1] else: key_a, key_b, key_c = random.sample(current_keys[sense], 3) From 1f0b9fc387ad0328966bf44ea9313b18008e9b82 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 14:09:46 +0100 Subject: [PATCH 146/297] Document sense2vec.evaluate recipe --- README.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/README.md b/README.md index 1bbbe9b..c206c7b 100644 --- a/README.md +++ b/README.md @@ -601,6 +601,46 @@ prodigy sense2vec.to-patterns tech_phrases en_core_web_sm TECHNOLOGY --output-file /path/to/patterns.jsonl ``` +### recipe `sense2vec.evaluate` + +Evaluate a word vectors model by asking providing questions triples: is word A +more similar to word B, or to word C? If the human mostly agrees with the model, +the vectors model is good. The recipe will only ask about vectors with the same +sense and supports different example selection strategies. + +```bash +prodigy sense2vec.evaluate [dataset] [vectors_path] [--strategy] [--senses] +[--n-freq] [--threshold] [--eval-whole] [--eval-only] [--show-scores] +``` + +| Argument | Type | Description | +| --------------------- | ---------- | ------------------------------------------------------------------------------------------------------------- | +| `dataset` | positional | Dataset to save annotations to. | +| `vectors_path` | positional | Path to pretrained sense2vec vectors. | +| `--strategy`, `-st` | option | Example selection strategy. `most similar` (default) or `random`. | +| `--senses`, `-s` | option | Comma-separated list of senses to limit the selection to. If not set, all senses in the vectors will be used. | +| `--n-freq`, `-n` | option | Number of most frequent entries to limit to. | +| `--threshold`, `-t` | option | Minimum similarity threshold to consider examples. | +| `--eval-whole`, `-E` | flag | Evaluate the whole dataset instead of the current session. | +| `--eval-only`, `-O` | flag | Don't annotate, only evaluate the current dataset. | +| `--show-scores`, `-S` | flag | Show all scores for debugging. | + +#### Strategies + +| Name | Description | +| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `most_similar` | Pick a random word from a random sense and get its most similar entries of the same sense. Ask about the similarity to the last and middle entry from that selection. 
| +| `random` | Pick a random sample of 3 words from the same random sense. | + +#### Example + +```bash +prodigy sense2vec.evaluate vectors_eval /path/to/sense2vec_vectors +--senses NOUN,ORG,PRODUCT --threshold 0.5 +``` + +![UI preview of sense2vec.evaluate](https://user-images.githubusercontent.com/13643239/67994212-668cf400-fc44-11e9-8fe2-bf264ae32b0a.png) + ## Pretrained vectors The pretrained Reddit vectors support the following "senses", either From 6653e20a504cbce3382396c5ccc6088d06827cf0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 1 Nov 2019 14:10:15 +0100 Subject: [PATCH 147/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd8cda5..31abb45 100644 --- a/README.md +++ b/README.md @@ -566,7 +566,7 @@ prodigy sense2vec.teach [dataset] [vectors_path] [--seeds] [--threshold] #### Example ```bash -prodigy sense2vec.teach tech_phrases /path/to/reddit_vectors-1.1.0 +prodigy sense2vec.teach tech_phrases /path/to/sense2vec_vectors --seeds "natural language processing, machine learning, artificial intelligence" ``` From 65f0344b2513dc1b7d74e325494880911b4ad7d7 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 13:35:32 +0100 Subject: [PATCH 148/297] WIP: integrate function registry with catalogue --- README.md | 16 +++++------- requirements.txt | 5 +++- scripts/02_preprocess.py | 8 ++++-- sense2vec/__init__.py | 2 +- sense2vec/component.py | 35 ++++++++++++++++++++----- sense2vec/sense2vec.py | 36 ++++++++++++++++---------- sense2vec/util.py | 55 +++++++++++++++++++++++++++++++++------- setup.cfg | 3 ++- 8 files changed, 116 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 31abb45..e44a4da 100644 --- a/README.md +++ b/README.md @@ -187,15 +187,13 @@ most_similar = s2v.most_similar("natural_language_processing|NOUN", n=10) Initialize the `Sense2Vec` object. -| Argument | Type | Description | -| -------------- | --------------------------- | ----------------------------------------------------------------------------------------------------------- | -| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | -| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. | -| `make_key` | callable | Optional custom function that takes a word and sense string and creates the key (e.g. `"some_word|sense"`). | -| `split_key` | callable | Optional custom function that takes a key and returns the word and sense (e.g. `("some word", "sense")`). | -| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | -| `vectors_name` | unicode | Optional name to assign to the `Vectors` table, to prevent clashes. Defaults to `"sense2vec"`. | -| **RETURNS** | `Sense2Vec` | The newly constructed object. | +| Argument | Type | Description | +| -------------- | --------------------------- | ---------------------------------------------------------------------------------------------------- | +| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | +| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. | +| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | +| `vectors_name` | unicode | Optional name to assign to the `Vectors` table, to prevent clashes. Defaults to `"sense2vec"`. 
| +| **RETURNS** | `Sense2Vec` | The newly constructed object. | ```python s2v = Sense2Vec(shape=(300, 128), senses=["VERB", "NOUN"]) diff --git a/requirements.txt b/requirements.txt index 529dcf0..aae2f07 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,9 @@ +# Our packages spacy>=2.2.2,<3.0.0 -numpy>=1.15.0 srsly>=0.2.0 +catalogue>=0.0.3 +# Third-party dependencies +numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" # Development requirements pytest>=5.2.0,<6.0.0 diff --git a/scripts/02_preprocess.py b/scripts/02_preprocess.py index b29f126..637c014 100644 --- a/scripts/02_preprocess.py +++ b/scripts/02_preprocess.py @@ -1,5 +1,5 @@ #!/usr/bin/env python -from sense2vec.util import merge_phrases, make_spacy_key +from sense2vec.util import make_key, make_spacy_key, merge_phrases import spacy from spacy.tokens import DocBin import plac @@ -51,7 +51,11 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): with output_file.open("w", encoding="utf8") as f: for doc in tqdm.tqdm(docs, desc="Docs", unit=""): doc = merge_phrases(doc) - words = [make_spacy_key(w, prefer_ents=True) for w in doc if not w.is_space] + words = [] + for token in doc: + if not token.is_space: + word, sense = make_spacy_key(token, prefer_ents=True) + words.append(make_key(word, sense)) f.write(" ".join(words) + "\n") lines_count += 1 words_count += len(words) diff --git a/sense2vec/__init__.py b/sense2vec/__init__.py index 31ac772..e332fe4 100644 --- a/sense2vec/__init__.py +++ b/sense2vec/__init__.py @@ -1,6 +1,6 @@ from .sense2vec import Sense2Vec # noqa: F401 from .component import Sense2VecComponent # noqa: F401 -from .util import importlib_metadata +from .util import importlib_metadata, registry # noqa: F401 try: # This needs to be imported in order for the entry points to be loaded diff --git a/sense2vec/component.py b/sense2vec/component.py index 9c5301d..2a6ad2a 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -1,4 +1,4 @@ -from typing import Tuple, Union, List +from typing import Tuple, Union, List, Dict from spacy import component from spacy.tokens import Doc, Token, Span from spacy.vocab import Vocab @@ -7,7 +7,7 @@ import numpy from .sense2vec import Sense2Vec -from .util import merge_phrases, get_phrases, make_spacy_key +from .util import ATTRS, registry, SimpleFrozenDict @component( @@ -38,6 +38,7 @@ def __init__( vocab: Vocab = None, shape: Tuple[int, int] = (1000, 128), merge_phrases: bool = False, + overrides: Dict[str, str] = SimpleFrozenDict(), **kwargs, ): """Initialize the pipeline component. @@ -47,10 +48,17 @@ def __init__( merge_phrases (bool): Merge sense2vec phrases into one token. RETURNS (Sense2VecComponent): The newly constructed object. 
""" - strings = vocab.strings if vocab is not None else None - self.s2v = Sense2Vec(shape=shape, strings=strings) self.first_run = True self.merge_phrases = merge_phrases + strings = vocab.strings if vocab is not None else None + self.s2v = Sense2Vec(shape=shape, strings=strings) + cfg = { + "make_spacy_key": ATTRS.make_spacy_key, + "get_phrases": ATTRS.get_phrases, + "merge_phrases": ATTRS.merge_phrases, + } + self.s2v.cfg.update(cfg) + self.s2v.cfg.update(overrides) @classmethod def from_nlp(cls, nlp: Language, **kwargs): @@ -74,6 +82,7 @@ def __call__(self, doc: Doc) -> Doc: # Store reference to s2v object on Doc to make sure it's right doc._._s2v = self.s2v if self.merge_phrases: + merge_phrases = registry.merge_phrases.get(doc._._s2v.cfg["merge_phrases"]) doc = merge_phrases(doc) return doc @@ -84,7 +93,7 @@ def init_component(self): not added. """ Doc.set_extension("_s2v", default=None) - Doc.set_extension("s2v_phrases", getter=get_phrases) + Doc.set_extension("s2v_phrases", getter=self.get_phrases) for obj in [Token, Span]: obj.set_extension("s2v_key", getter=self.s2v_key) obj.set_extension("in_s2v", getter=self.in_s2v) @@ -94,6 +103,16 @@ def init_component(self): obj.set_extension("s2v_most_similar", method=self.s2v_most_similar) obj.set_extension("s2v_similarity", method=self.s2v_similarity) + def get_phrases(self, doc: Doc) -> List[Span]: + """Extension attribute getter. Compile a list of sense2vec phrases based + on a processed Doc: named entities and noun chunks without determiners. + + doc (Doc): The Doc to get phrases from. + RETURNS (list): The phrases as a list of Span objects. + """ + func = registry.get_phrases.get(doc._._s2v.cfg["get_phrases"]) + return func(doc) + def in_s2v(self, obj: Union[Token, Span]) -> bool: """Extension attribute getter. Check if a token or span has a vector. @@ -125,9 +144,11 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: obj (Token / Span): The object to create the key for. RETURNS (unicode): The key. """ - return make_spacy_key( - obj, obj.doc._._s2v.make_key, prefer_ents=self.merge_phrases + make_spacy_key = registry.make_spacy_key.get( + obj.doc._._s2v.cfg["make_spacy_key"] ) + word, sense = make_spacy_key(obj, prefer_ents=self.merge_phrases) + return obj.doc._._s2v.make_key(word, sense) def s2v_similarity( self, obj: Union[Token, Span], other: Union[Token, Span] diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index e327be5..e5d1f59 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,11 +1,11 @@ -from typing import Callable, Tuple, List, Union, Sequence, Dict +from typing import Tuple, List, Union, Sequence, Dict, Callable from pathlib import Path from spacy.vectors import Vectors from spacy.strings import StringStore import numpy import srsly -from .util import make_key, split_key +from .util import ATTRS, registry, SimpleFrozenDict class Sense2Vec(object): @@ -13,34 +13,32 @@ def __init__( self, shape: tuple = (1000, 128), strings: StringStore = None, - make_key: Callable[[str, str], str] = make_key, - split_key: Callable[[str], Tuple[str, str]] = split_key, senses: List[str] = [], vectors_name: str = "sense2vec", + overrides: Dict[str, str] = SimpleFrozenDict(), ): """Initialize the Sense2Vec object. shape (tuple): The vector shape. strings (StringStore): Optional string store. Will be created if it doesn't exist. - make_key (callable): Optional custom function that takes a word and - sense string and creates the key (e.g. "some_word|sense"). 
- split_key (callable): Optional custom function that takes a key and - returns the word and sense (e.g. ("some word", "sense")). senses (list): Optional list of all available senses. Used in methods that generate the best sense or other senses. vectors_name (unicode): Optional name to assign to the Vectors object. RETURNS (Sense2Vec): The newly constructed object. """ - self.make_key = make_key - self.split_key = split_key self.vectors = Vectors(shape=shape, name=vectors_name) self.strings = StringStore() if strings is None else strings self.freqs: Dict[int, int] = {} - self.cfg = {"senses": senses} + self.cfg = { + "senses": senses, + "make_key": ATTRS.make_key, + "split_key": ATTRS.split_key, + } + self.cfg.update(overrides) @property - def senses(self) -> List[str]: + def senses(self) -> Sequence[str]: """RETURNS (list): The available senses.""" return self.cfg.get("senses", []) @@ -105,6 +103,16 @@ def values(self): """YIELDS (numpy.ndarray): The vectors in the table.""" yield from self.vectors.values() + @property + def make_key(self) -> Callable: + """Get the function to make keys.""" + return registry.make_key.get(self.cfg["make_key"]) + + @property + def split_key(self) -> Callable: + """Get the function to split keys.""" + return registry.split_key.get(self.cfg["split_key"]) + def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): """Add a new vector to the table. @@ -271,7 +279,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) self.freqs = dict(data.get("freqs", [])) - self.cfg = data.get("cfg", {}) + self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: self.strings = StringStore().from_bytes(data["strings"]) return self @@ -300,7 +308,7 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): strings_path = path / "strings.json" freqs_path = path / "freqs.json" self.vectors = Vectors().from_disk(path) - self.cfg = srsly.read_json(path / "cfg") + self.cfg.update(srsly.read_json(path / "cfg")) if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): diff --git a/sense2vec/util.py b/sense2vec/util.py index 309db3f..f44d928 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -1,7 +1,8 @@ -from typing import Union, Callable, List, Tuple, Set +from typing import Union, List, Tuple, Set import re from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans +import catalogue try: import importlib.metadata as importlib_metadata # Python 3.8 @@ -9,9 +10,23 @@ import importlib_metadata # noqa: F401 -DEFAULT_SENSE = "?" +class ATTRS(object): + make_key: str = "default_make_key" + split_key: str = "default_split_key" + make_spacy_key: str = "default_make_spacy_key" + get_phrases: str = "default_get_phrases" + merge_phrases: str = "default_merge_phrases" +class registry(object): + make_key = catalogue.create("sense2vec", "make_key") + split_key = catalogue.create("sense2vec", "split_key") + make_spacy_key = catalogue.create("sense2vec", "make_spacy_key") + get_phrases = catalogue.create("sense2vec", "get_phrases") + merge_phrases = catalogue.create("sense2vec", "merge_phrases") + + +@registry.make_key.register(ATTRS.make_key) def make_key(word: str, sense: str) -> str: """Create a key from a word and sense, e.g. "usage_example|NOUN". 
@@ -23,6 +38,7 @@ def make_key(word: str, sense: str) -> str: return text + "|" + sense +@registry.split_key.register(ATTRS.split_key) def split_key(key: str) -> Tuple[str, str]: """Split a key into word and sense, e.g. ("usage example", "NOUN"). @@ -35,11 +51,10 @@ def split_key(key: str) -> Tuple[str, str]: return word, sense +@registry.make_spacy_key.register(ATTRS.make_spacy_key) def make_spacy_key( - obj: Union[Token, Span], - make_key: Callable[[str, str], str] = make_key, - prefer_ents: bool = False, -) -> str: + obj: Union[Token, Span], prefer_ents: bool = False +) -> Tuple[str, str]: """Create a key from a spaCy object, i.e. a Token or Span. If the object is a token, the part-of-speech tag (Token.pos_) is used for the sense and a special string is created for URLs. If the object is a Span and @@ -47,14 +62,13 @@ def make_spacy_key( span's root part-of-speech tag becomes the sense. obj (Token / Span): The spaCy object to create the key for. - make_key (callable): function that takes a word and sense string and - creates the key (e.g. "word|sense"). prefer_ents (bool): Prefer entity types for single tokens (i.e. token.ent_type instead of tokens.pos_). Should be enabled if phrases are merged into single tokens, because otherwise the entity sense would never be used. RETURNS (unicode): The key. """ + default_sense = "?" text = obj.text if isinstance(obj, Token): if obj.like_url: @@ -66,7 +80,7 @@ def make_spacy_key( sense = obj.pos_ elif isinstance(obj, Span): sense = obj.label_ or obj.root.pos_ - return make_key(text, sense or DEFAULT_SENSE) + return (text, sense or default_sense) def get_noun_phrases(doc: Doc) -> List[Span]: @@ -88,6 +102,7 @@ def get_noun_phrases(doc: Doc) -> List[Span]: return spans +@registry.get_phrases.register(ATTRS.get_phrases) def get_phrases(doc: Doc) -> List[Span]: """Compile a list of sense2vec phrases based on a processed Doc: named entities and noun chunks without determiners. @@ -118,6 +133,7 @@ def is_particle( return token.pos_ in pos or token.dep_ in deps +@registry.merge_phrases.register(ATTRS.merge_phrases) def merge_phrases(doc: Doc) -> Doc: """Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. @@ -131,3 +147,24 @@ def merge_phrases(doc: Doc) -> Doc: for span in spans: retokenizer.merge(span) return doc + + +class SimpleFrozenDict(dict): + """Simplified implementation of a frozen dict, mainly used as default + function or method argument (for arguments that should default to empty + dictionary). Will raise an error if user or spaCy attempts to add to dict. + """ + + err = ( + "Can't write to frozen dictionary. This is likely an internal error. " + "Are you writing to a default function argument?" 
+ ) + + def __setitem__(self, key, value): + raise NotImplementedError(self.err) + + def pop(self, key, default=None): + raise NotImplementedError(self.err) + + def update(self, other): + raise NotImplementedError(self.err) diff --git a/setup.cfg b/setup.cfg index f6630ef..a36bfac 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,8 +28,9 @@ include_package_data = true python_requires = >=3.6 install_requires = spacy>=2.2.2,<3.0.0 - numpy>=1.15.0 srsly>=0.2.0 + catalogue>=0.0.3 + numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" [options.entry_points] From 7c4a4373edd4967d57d6416101ebfb749bb5b744 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 13:52:29 +0100 Subject: [PATCH 149/297] Add test for registry --- tests/test_sense2vec.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 63d00f9..29bdafe 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -1,6 +1,6 @@ import pytest import numpy -from sense2vec import Sense2Vec +from sense2vec import Sense2Vec, registry def test_sense2vec_object(): @@ -129,3 +129,29 @@ def test_sense2vec_to_from_bytes(): assert s2v.strings["test1"] in new_s2v2 with pytest.raises(KeyError): # can't resolve hash new_s2v2.strings[s2v.strings["test2"]] + + +def test_registry(): + """Test that custom functions are used internally if they're registered.""" + + @registry.make_key.register("custom_make_key") + def custom_make_key(word, sense): + return "{}###{}".format(word, sense) + + @registry.split_key.register("custom_split_key") + def custom_split_key(key): + return tuple(key.split("###")) + + overrides = {"make_key": "custom_make_key", "split_key": "custom_split_key"} + test_vector = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32) + data = [("clear", "NOUN", 100), ("clear", "VERB", 200), ("clear", "ADJ", 300)] + s2v = Sense2Vec(shape=(len(data), 4), overrides=overrides) + for word, sense, freq in data: + s2v.add(custom_make_key(word, sense), test_vector, freq) + s2v.cfg["senses"].append(sense) + assert "clear###NOUN" in s2v + other_senses = s2v.get_other_senses("clear###NOUN") + assert len(other_senses) == 2 + assert "clear###VERB" in other_senses + assert "clear###ADJ" in other_senses + assert s2v.get_best_sense("clear") == "clear###ADJ" From 316cf76b79a6bb4f4e5073f872fd74257d0f6c45 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 14:42:03 +0100 Subject: [PATCH 150/297] Update test_sense2vec.py --- tests/test_sense2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 29bdafe..5ef8c3e 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -136,7 +136,7 @@ def test_registry(): @registry.make_key.register("custom_make_key") def custom_make_key(word, sense): - return "{}###{}".format(word, sense) + return f"{word}###{sense}" @registry.split_key.register("custom_split_key") def custom_split_key(key): From 3959656d84c417f240604dade04f0d1114c5c1fe Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 14:42:09 +0100 Subject: [PATCH 151/297] Update docstrings and docs --- README.md | 201 +++++++++++++++++++++++++++++++++++------ sense2vec/component.py | 9 +- sense2vec/sense2vec.py | 2 + 3 files changed, 180 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index e44a4da..aa78e84 100644 --- a/README.md +++ b/README.md @@ -183,23 +183,29 @@ most_similar = 
s2v.most_similar("natural_language_processing|NOUN", n=10) ## 🎛 API -### method `Sense2Vec.__init__` +### class `Sense2Vec` + +The standalone `Sense2Vec` object that holds the vectors, strings and +frequencies. + +#### method `Sense2Vec.__init__` Initialize the `Sense2Vec` object. -| Argument | Type | Description | -| -------------- | --------------------------- | ---------------------------------------------------------------------------------------------------- | -| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | -| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. | -| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | -| `vectors_name` | unicode | Optional name to assign to the `Vectors` table, to prevent clashes. Defaults to `"sense2vec"`. | -| **RETURNS** | `Sense2Vec` | The newly constructed object. | +| Argument | Type | Description | +| -------------- | --------------------------- | ---------------------------------------------------------------------------------------------------------------------- | +| `shape` | tuple | The vector shape. Defaults to `(1000, 128)`. | +| `strings` | `spacy.strings.StringStore` | Optional string store. Will be created if it doesn't exist. | +| `senses` | list | Optional list of all available senses. Used in methods that generate the best sense or other senses. | +| `vectors_name` | unicode | Optional name to assign to the `Vectors` table, to prevent clashes. Defaults to `"sense2vec"`. | +| `overrides` | dict | Optional custom functions to use, mapped to names registered via the registry, e.g. `{"make_key": "custom_make_key"}`. | +| **RETURNS** | `Sense2Vec` | The newly constructed object. | ```python s2v = Sense2Vec(shape=(300, 128), senses=["VERB", "NOUN"]) ``` -### method `Sense2Vec.__len__` +#### method `Sense2Vec.__len__` The number of rows in the vectors table. @@ -212,7 +218,7 @@ s2v = Sense2Vec(shape=(300, 128)) assert len(s2v) == 300 ``` -### method `Sense2Vec.__contains__` +#### method `Sense2Vec.__contains__` Check if a key is in the vectors table. @@ -228,7 +234,7 @@ assert "avocado|NOUN" in s2v assert "avocado|VERB" not in s2v ``` -### method `Sense2Vec.__getitem__` +#### method `Sense2Vec.__getitem__` Retrieve a vector for a given key. Returns None if the key is not in the table. @@ -241,7 +247,7 @@ Retrieve a vector for a given key. Returns None if the key is not in the table. vec = s2v["avocado|NOUN"] ``` -### method `Sense2Vec.__setitem__` +#### method `Sense2Vec.__setitem__` Set a vector for a given key. Will raise an error if the key doesn't exist. To add a new entry, use `Sense2Vec.add`. @@ -256,7 +262,7 @@ vec = s2v["avocado|NOUN"] s2v["avacado|NOUN"] = vec ``` -### method `Sense2Vec.add` +#### method `Sense2Vec.add` Add a new vector to the table. @@ -271,7 +277,7 @@ vec = s2v["avocado|NOUN"] s2v.add("🥑|NOUN", vec, 1234) ``` -### method `Sense2Vec.get_freq` +#### method `Sense2Vec.get_freq` Get the frequency count for a given key. @@ -287,7 +293,7 @@ s2v.add("🥑|NOUN", vec, 1234) assert s2v.get_freq("🥑|NOUN") == 1234 ``` -### method `Sense2Vec.set_freq` +#### method `Sense2Vec.set_freq` Set a frequency count for a given key. @@ -300,7 +306,7 @@ Set a frequency count for a given key. s2v.set_freq("avocado|NOUN", 104294) ``` -### method `Sense2Vec.__iter__`, `Sense2Vec.items` +#### method `Sense2Vec.__iter__`, `Sense2Vec.items` Iterate over the entries in the vectors table. 
@@ -316,7 +322,7 @@ for key, vec in s2v.items(): print(key, vec) ``` -### method `Sense2Vec.keys` +#### method `Sense2Vec.keys` Iterate over the keys in the table. @@ -328,7 +334,7 @@ Iterate over the keys in the table. all_keys = list(s2v.keys()) ``` -### method `Sense2Vec.values` +#### method `Sense2Vec.values` Iterate over the vectors in the table. @@ -340,7 +346,7 @@ Iterate over the vectors in the table. all_vecs = list(s2v.values()) ``` -### property `Sense2Vec.senses` +#### property `Sense2Vec.senses` The available senses in the table, e.g. `"NOUN"` or `"VERB"` (added at initialization). @@ -354,7 +360,7 @@ s2v = Sense2Vec(senses=["VERB", "NOUN"]) assert "VERB" in s2v.senses ``` -### property `Sense2vec.frequencies` +#### property `Sense2vec.frequencies` The frequencies of they keys in the table, in descending order. @@ -367,7 +373,7 @@ most_frequent = s2v.frequencies[:10] key, score = s2v.frequencies[0] ``` -### method `Sense2vec.similarity` +#### method `Sense2vec.similarity` Make a semantic similarity estimate of two keys or two sets of keys. The default estimate is cosine similarity using an average of vectors. @@ -385,7 +391,7 @@ print(s2v.similarity(keys_a, keys_b)) assert s2v.similarity("machine_learning|NOUN", "machine_learning|NOUN") == 1.0 ``` -### method `Sense2Vec.most_similar` +#### method `Sense2Vec.most_similar` Get the most similar entries in the table. If more than one key is provided, the average of the vectors is used. @@ -404,7 +410,7 @@ most_similar = s2v.most_similar("natural_language_processing|NOUN", n=3) # ('deep_learning|NOUN', 0.8573361)] ``` -### method `Sense2Vec.get_other_senses` +#### method `Sense2Vec.get_other_senses` Find other entries for the same word with a different sense, e.g. `"duck|VERB"` for `"duck|NOUN"`. @@ -420,7 +426,7 @@ other_senses = s2v.get_other_senses("duck|NOUN") # ['duck|VERB', 'Duck|ORG', 'Duck|VERB', 'Duck|PERSON', 'Duck|ADJ'] ``` -### method `Sense2Vec.get_best_sense` +#### method `Sense2Vec.get_best_sense` Find the best-matching sense for a given word based on the available senses and frequency counts. Returns `None` if no match is found. @@ -435,7 +441,7 @@ frequency counts. Returns `None` if no match is found. assert s2v.get_best_sense("duck") == "duck|NOUN" ``` -### method `Sense2Vec.to_bytes` +#### method `Sense2Vec.to_bytes` Serialize a `Sense2Vec` object to a bytestring. @@ -448,7 +454,7 @@ Serialize a `Sense2Vec` object to a bytestring. s2v_bytes = s2v.to_bytes() ``` -### method `Sense2Vec.from_bytes` +#### method `Sense2Vec.from_bytes` Load a `Sense2Vec` object from a bytestring. @@ -463,7 +469,7 @@ s2v_bytes = s2v.to_bytes() new_s2v = Sense2Vec().from_bytes(s2v_bytes) ``` -### method `Sense2Vec.to_disk` +#### method `Sense2Vec.to_disk` Serialize a `Sense2Vec` object to a directory. @@ -476,13 +482,13 @@ Serialize a `Sense2Vec` object to a directory. s2v.to_disk("/path/to/sense2vec") ``` -### method `Sense2Vec.from_disk` +#### method `Sense2Vec.from_disk` Load a `Sense2Vec` object from a directory. | Argument | Type | Description | | ----------- | ---------------- | ----------------------------------------- | -| `path` | unicode / `Path` | The path. to load from | +| `path` | unicode / `Path` | The path to load from | | `exclude` | list | Names of serialization fields to exclude. | | **RETURNS** | `Sense2Vec` | The loaded object. 
| @@ -491,6 +497,143 @@ s2v.to_disk("/path/to/sense2vec") new_s2v = Sense2Vec().from_disk("/path/to/sense2vec") ``` +--- + +### class `Sense2VecComponent` + +The pipeline component to add sense2vec to spaCy pipelines. + +#### method `Sense2VecComponent.__init__` + +Initialize the pipeline component. + +| Argument | Type | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared `Vocab`. Mostly used for the shared `StringStore`. | +| `shape` | tuple | The vector shape. | +| `merge_phrases` | bool | Whether to merge sense2vec phrases into one token. Defaults to `False`. | +| `overrides` | Optional custom functions to use, mapped to names registred via the registry, e.g. `{"make_key": "custom_make_key"}`. | +| **RETURNS** | `Sense2VecComponent` | The newly constructed object. | + +```python +s2v = Sense2VecComponent(nlp.vocab) +``` + +#### classmethod `Sense2VecComponent.from_nlp` + +Initialize the component from an nlp object. Mostly used as the component +factory for the entry point (see setup.cfg) and to auto-register via the +`@spacy.component` decorator. + +| Argument | Type | Description | +| ----------- | -------------------- | ----------------------------- | +| `nlp` | `Language` | The `nlp` object. | +| `**cfg` | - | Optional config parameters. | +| **RETURNS** | `Sense2VecComponent` | The newly constructed object. | + +```python +s2v = Sense2VecComponent.from_nlp(nlp) +``` + +#### method `Sense2VecComponent.__call__` + +Process a `Doc` object with the component. Typically only called as part of the +spaCy pipeline and not directly. + +| Argument | Type | Description | +| ----------- | ----- | ------------------------ | +| `doc` | `Doc` | The document to process. | +| **RETURNS** | `Doc` | the processed document. | + +#### method `Sense2Vec.init_component` + +Register the component-specific extension attributes here and only if the +component is added to the pipeline and used – otherwise, tokens will still get +the attributes even if the component is only created and not added. + +#### method `Sense2VecComponent.to_bytes` + +Serialize the component to a bytestring. Also called when the component is added +to the pipeline and you run `nlp.to_bytes`. + +| Argument | Type | Description | +| ----------- | ----- | ------------------------- | +| **RETURNS** | bytes | The serialized component. | + +#### method `Sense2VecComponent.from_bytes` + +Load a component from a bytestring. Also called when you run `nlp.from_bytes`. + +| Argument | Type | Description | +| ------------ | -------------------- | ------------------ | +| `bytes_data` | bytes | The data to load. | +| **RETURNS** | `Sense2VecComponent` | The loaded object. | + +#### method `Sense2VecComponent.to_disk` + +Serialize the component to a directory. Also called when the component is added +to the pipeline and you run `nlp.to_disk`. + +| Argument | Type | Description | +| -------- | ---------------- | ----------- | +| `path` | unicode / `Path` | The path. | + +#### method `Sense2VecComponent.from_disk` + +Load a `Sense2Vec` object from a directory. Also called when you run +`nlp.from_disk`. + +| Argument | Type | Description | +| ----------- | -------------------- | --------------------- | +| `path` | unicode / `Path` | The path to load from | +| **RETURNS** | `Sense2VecComponent` | The loaded object. 
| + +--- + +### class `registry` + +Function registry (powered by +[`catalogue`](https://github.com/explosion/catalogue)) to easily customize the +functions used to generate keys and phrases. Allows you to decorate and name +custom functions, swap them out and serialize the custom names when you save out +the model. The following registry options are available: + +| Name | Description | +| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `registry.make_key` | Given a `word` and `sense`, return a string of the key, e.g. `"word|sense".` | +| `registry.split_key` | Given a string key, return a `(word, sense)` tuple. | +| `registry.make_spacy_key` | Given a spaCy object (`Token` or `Span`) and a boolean `prefer_ents` keyword argument (whether to prefer the entity label for single tokens), return a `(word, sense)` tuple. Used in extension attributes to generate a key for tokens and spans. | | +| `registry.get_phrases` | Given a spaCy `Doc`, return a list of `Span` objects used for sense2vec phrases (typically noun phrases and named entities). | +| `registry.merge_phrases` | Given a spaCy `Doc`, get all sense2vec phrases and merge them into single tokens.  | + +Each registry has a `register` method that can be used as a function decorator +and takes one argument, the name of the custom function. + +```python +from sense2vec import registry + +@registry.make_key.register("custom") +def custom_make_key(word, sense): + return f"{word}###{sense}" + +@registry.split_key.register("custom") +def custom_split_key(key): + word, sense = key.split("###") + return word, sense +``` + +When initializing the `Sense2Vec` object, you can now pass in a dictionary of +overrides with the names of your custom registered functions. + +```python +overrides = {"make_key": "custom", "split_key": "custom"} +s2v = Sense2Vec(overrides=overrides) +``` + +This makes it easy to experiment with different strategies and serializing the +strategies as plain strings (instead of having to pass around and/or pickle the +functions themselves). + ## 🚂 Training your own sense2vec vectors The [`/scripts`](/scripts) directory contains command line utilities for diff --git a/sense2vec/component.py b/sense2vec/component.py index 2a6ad2a..7a95846 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -46,6 +46,8 @@ def __init__( vocab (Vocab): The shared vocab. Mostly used for the shared StringStore. shape (tuple): The vector shape. merge_phrases (bool): Merge sense2vec phrases into one token. + overrides (dict): Optional custom functions to use, mapped to names + registered via the registry, e.g. {"make_key": "custom_make_key"}. RETURNS (Sense2VecComponent): The newly constructed object. """ self.first_run = True @@ -61,14 +63,15 @@ def __init__( self.s2v.cfg.update(overrides) @classmethod - def from_nlp(cls, nlp: Language, **kwargs): + def from_nlp(cls, nlp: Language, **cfg): """Initialize the component from an nlp object. Mostly used as the - component factory for the entry point (see setup.py). + component factory for the entry point (see setup.cfg). nlp (Language): The nlp object. + **cfg: Optional config parameters. RETURNS (Sense2VecComponent): The newly constructed object. 
""" - return cls(vocab=nlp.vocab, **kwargs) + return cls(vocab=nlp.vocab, **cfg) def __call__(self, doc: Doc) -> Doc: """Process a Doc object with the component. diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index e5d1f59..8b96766 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -25,6 +25,8 @@ def __init__( senses (list): Optional list of all available senses. Used in methods that generate the best sense or other senses. vectors_name (unicode): Optional name to assign to the Vectors object. + overrides (dict): Optional custom functions to use, mapped to names + registered via the registry, e.g. {"make_key": "custom_make_key"}. RETURNS (Sense2Vec): The newly constructed object. """ self.vectors = Vectors(shape=shape, name=vectors_name) From 23b370de6f5791e6c7e3806dcdd3426553bb903d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 15:54:27 +0100 Subject: [PATCH 152/297] Use new catalogue --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index aae2f07..a7827ff 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our packages spacy>=2.2.2,<3.0.0 srsly>=0.2.0 -catalogue>=0.0.3 +catalogue>=0.0.4 # Third-party dependencies numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" diff --git a/setup.cfg b/setup.cfg index a36bfac..460aade 100644 --- a/setup.cfg +++ b/setup.cfg @@ -29,7 +29,7 @@ python_requires = >=3.6 install_requires = spacy>=2.2.2,<3.0.0 srsly>=0.2.0 - catalogue>=0.0.3 + catalogue>=0.0.4 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" From ad4cee1a5ecc98ff858cec7e130b638066803d2c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 16:18:28 +0100 Subject: [PATCH 153/297] Replace ATTRS with "default" With nested registry the specific names are not needed anymore --- sense2vec/component.py | 8 ++++---- sense2vec/sense2vec.py | 8 ++------ sense2vec/util.py | 18 +++++------------- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 7a95846..80bd52c 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -7,7 +7,7 @@ import numpy from .sense2vec import Sense2Vec -from .util import ATTRS, registry, SimpleFrozenDict +from .util import registry, SimpleFrozenDict @component( @@ -55,9 +55,9 @@ def __init__( strings = vocab.strings if vocab is not None else None self.s2v = Sense2Vec(shape=shape, strings=strings) cfg = { - "make_spacy_key": ATTRS.make_spacy_key, - "get_phrases": ATTRS.get_phrases, - "merge_phrases": ATTRS.merge_phrases, + "make_spacy_key": "default", + "get_phrases": "default", + "merge_phrases": "default", } self.s2v.cfg.update(cfg) self.s2v.cfg.update(overrides) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 8b96766..9645f37 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -5,7 +5,7 @@ import numpy import srsly -from .util import ATTRS, registry, SimpleFrozenDict +from .util import registry, SimpleFrozenDict class Sense2Vec(object): @@ -32,11 +32,7 @@ def __init__( self.vectors = Vectors(shape=shape, name=vectors_name) self.strings = StringStore() if strings is None else strings self.freqs: Dict[int, int] = {} - self.cfg = { - "senses": senses, - "make_key": ATTRS.make_key, - "split_key": ATTRS.split_key, - } + self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"} self.cfg.update(overrides) @property diff --git a/sense2vec/util.py 
b/sense2vec/util.py index f44d928..7971948 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -10,14 +10,6 @@ import importlib_metadata # noqa: F401 -class ATTRS(object): - make_key: str = "default_make_key" - split_key: str = "default_split_key" - make_spacy_key: str = "default_make_spacy_key" - get_phrases: str = "default_get_phrases" - merge_phrases: str = "default_merge_phrases" - - class registry(object): make_key = catalogue.create("sense2vec", "make_key") split_key = catalogue.create("sense2vec", "split_key") @@ -26,7 +18,7 @@ class registry(object): merge_phrases = catalogue.create("sense2vec", "merge_phrases") -@registry.make_key.register(ATTRS.make_key) +@registry.make_key.register("default") def make_key(word: str, sense: str) -> str: """Create a key from a word and sense, e.g. "usage_example|NOUN". @@ -38,7 +30,7 @@ def make_key(word: str, sense: str) -> str: return text + "|" + sense -@registry.split_key.register(ATTRS.split_key) +@registry.split_key.register("default") def split_key(key: str) -> Tuple[str, str]: """Split a key into word and sense, e.g. ("usage example", "NOUN"). @@ -51,7 +43,7 @@ def split_key(key: str) -> Tuple[str, str]: return word, sense -@registry.make_spacy_key.register(ATTRS.make_spacy_key) +@registry.make_spacy_key.register("default") def make_spacy_key( obj: Union[Token, Span], prefer_ents: bool = False ) -> Tuple[str, str]: @@ -102,7 +94,7 @@ def get_noun_phrases(doc: Doc) -> List[Span]: return spans -@registry.get_phrases.register(ATTRS.get_phrases) +@registry.get_phrases.register("default") def get_phrases(doc: Doc) -> List[Span]: """Compile a list of sense2vec phrases based on a processed Doc: named entities and noun chunks without determiners. @@ -133,7 +125,7 @@ def is_particle( return token.pos_ in pos or token.dep_ in deps -@registry.merge_phrases.register(ATTRS.merge_phrases) +@registry.merge_phrases.register("default") def merge_phrases(doc: Doc) -> Doc: """Transform a spaCy Doc to match the sense2vec format: merge entities into one token and merge noun chunks without determiners. 
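The switch to plain `"default"` names works because the catalogue registries above resolve a string to a function at call time, so any name that has been registered, built-in or custom, can be stored in the config and serialized as a plain string. Below is a minimal sketch of that lookup, assuming only the `register`/`get` registry API shown in `util.py` above; the `"custom"` name and the `##` key format are invented for illustration.

```python
from sense2vec import registry

@registry.split_key.register("custom")
def custom_split_key(key):
    # hypothetical alternative key format, e.g. "duck##NOUN"
    word, sense = key.rsplit("##", 1)
    return word, sense

# the same kind of string-based lookup the library can perform with
# a config value like cfg["split_key"]
split_key = registry.split_key.get("custom")
assert split_key("duck##NOUN") == ("duck", "NOUN")
```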
From c39ac102402756883a5dbf8e56bde1a8d54ae487 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 16:28:31 +0100 Subject: [PATCH 154/297] Increment version [ci skip] --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 460aade..a21b297 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a3 +version = 1.0.0a4 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From 88fdb8b2e51d151190097f4a0a91afdbb7c92745 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 16:29:33 +0100 Subject: [PATCH 155/297] Update version [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index aa78e84..743cec7 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ most_similar = doc[3:6]._.s2v_most_similar(3) sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a3 +pip install sense2vec==1.0.0a4 ``` The Reddit vectors model is attached to From 1d5a39d61221645a6fae275d5ee8be7a6726937a Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Sat, 2 Nov 2019 16:34:26 +0100 Subject: [PATCH 156/297] Read vector size from vectors --- scripts/05_export.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/scripts/05_export.py b/scripts/05_export.py index 93d41fc..0dbb109 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -7,13 +7,27 @@ import numpy +def _get_shape(file_): + """Return a tuple with (number of entries, vector dimensions). Handle + both word2vec/FastText format, which has a header with this, or GloVe's + format, which doesn't.""" + first_line = next(file_).split() + if len(first_line) == 2: + return tuple(int(size) for size in first_line), file_ + count = 1 + for line in file_: + count += 1 + file_.seek(0) + shape = (count, len(first_line)-1) + return shape, file_ + + @plac.annotations( in_file=("Vectors file", "positional", None, str), vocab_file=("Vocabulary file", "positional", None, str), out_dir=("Path to output directory", "positional", None, str), - vector_size=("Dimension of word vector representations", "option", "s", int), ) -def main(in_file, vocab_file, out_dir, vector_size=128): +def main(in_file, vocab_file, out_dir): """ Step 5: Export a sense2vec component @@ -32,6 +46,7 @@ def main(in_file, vocab_file, out_dir, vector_size=128): output_path.mkdir(parents=True) msg.good(f"Created output directory {out_dir}") with input_path.open("r", encoding="utf8") as f: + (n_vectors, vector_size), f = _get_shape(f) vectors_data = f.readlines() with vocab_path.open("r", encoding="utf8") as f: vocab_data = f.readlines() @@ -42,6 +57,8 @@ def main(in_file, vocab_file, out_dir, vector_size=128): key = item[0] if key == "": continue + if "|" not in key: + continue vec = item[1:] if len(vec) != vector_size: msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1) From c03bcaa30e807a4311037b0683b6c8ce5c9ac9a4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 16:38:47 +0100 Subject: [PATCH 157/297] Auto-format --- scripts/05_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/05_export.py b/scripts/05_export.py index 0dbb109..98f15ce 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -18,7 +18,7 @@ def _get_shape(file_): for line in file_: count += 1 file_.seek(0) - shape = (count, len(first_line)-1) + shape = (count, 
len(first_line) - 1) return shape, file_ From c16737eb2ed62635d28d12cbda3567a452996d70 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 16:38:59 +0100 Subject: [PATCH 158/297] Don't hardcode assumptions about key [ci skip] --- scripts/05_export.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/05_export.py b/scripts/05_export.py index 98f15ce..13e132f 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -55,14 +55,14 @@ def main(in_file, vocab_file, out_dir): for item in vectors_data: item = item.rstrip().rsplit(" ", vector_size) key = item[0] - if key == "": - continue - if "|" not in key: + try: + _, sense = split_key(key) + except ValueError: continue vec = item[1:] if len(vec) != vector_size: msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1) - all_senses.add(split_key(key)[1]) + all_senses.add(sense) data.append((key, numpy.asarray(vec, dtype=numpy.float32))) s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses) for key, vector in data: From 46c2c6cbfcf35b4771aa3b3ca4a56d772302ed6c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 17:09:28 +0100 Subject: [PATCH 159/297] Adjust GloVe scripts [ci skip] --- scripts/03_glove_build_counts.py | 12 ++++++------ scripts/04_glove_train_vectors.py | 8 ++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/scripts/03_glove_build_counts.py b/scripts/03_glove_build_counts.py index f53bf3c..57940f0 100644 --- a/scripts/03_glove_build_counts.py +++ b/scripts/03_glove_build_counts.py @@ -24,9 +24,9 @@ def main( collect unigram counts and construct and shuffle cooccurrence data. See here for installation instructions: https://github.com/stanfordnlp/GloVe - Note that this script will call into GloVe in a subprocess and expects you - to pass in the GloVe build directory (/build if you run the Makefile). The - commands will also be printed if you want to run them separately. + Note that this script will call into GloVe and expects you to pass in the + GloVe build directory (/build if you run the Makefile). The commands will + also be printed if you want to run them separately. """ msg = Printer() input_path = Path(in_dir) @@ -54,7 +54,7 @@ def main( ) print(cmd) vocab_cmd = os.system(cmd) - if vocab_cmd == 1 or not Path(vocab_file).exists(): + if vocab_cmd != 0 or not Path(vocab_file).exists(): msg.fail("Failed creating vocab counts", exits=1) msg.good("Created vocab counts", vocab_file) @@ -66,7 +66,7 @@ def main( ) print(cmd) cooccur_cmd = os.system(cmd) - if cooccur_cmd == 1 or not Path(cooc_file).exists(): + if cooccur_cmd != 0 or not Path(cooc_file).exists(): msg.fail("Failed creating cooccurrence statistics", exits=1) msg.good("Created cooccurrence statistics", cooc_file) @@ -77,7 +77,7 @@ def main( ) print(cmd) shuffle_cmd = os.system(cmd) - if shuffle_cmd == 1 or not Path(cooc_shuffle_file).exists(): + if shuffle_cmd != 0 or not Path(cooc_shuffle_file).exists(): msg.fail("Failed to shuffle cooccurrence file", exits=1) msg.good("Shuffled cooccurrence file", cooc_shuffle_file) diff --git a/scripts/04_glove_train_vectors.py b/scripts/04_glove_train_vectors.py index d20d517..228afc9 100644 --- a/scripts/04_glove_train_vectors.py +++ b/scripts/04_glove_train_vectors.py @@ -33,9 +33,9 @@ def main( Expects a file containing the shuffled cooccurrences and a vocab file and will output a plain-text vectors file. 
- Note that this script will call into GloVe in a subprocess and expects you - to pass in the GloVe build directory (/build if you run the Makefile). The - commands will also be printed if you want to run them separately. + Note that this script will call into GloVe and expects you to pass in the + GloVe build directory (/build if you run the Makefile). The commands will + also be printed if you want to run them separately. """ msg = Printer() output_path = Path(out_dir) @@ -58,7 +58,7 @@ def main( ) print(cmd) train_cmd = os.system(cmd) - if train_cmd == 1: + if train_cmd != 0: msg.fail("Failed training vectors", exits=1) msg.good("Successfully trained vectors") From dbc0ef1c00e924e7441af83bd58daa4dfd39f03e Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 17:35:23 +0100 Subject: [PATCH 160/297] Raise error for invalid frequencies --- sense2vec/sense2vec.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 9645f37..cd13841 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -140,6 +140,8 @@ def set_freq(self, key: Union[str, int], freq: int): key (unicode / int): The key to set the count for. freq (int): The frequency count. """ + if not isinstance(freq, int): + raise ValueError(f"Invalid frequency count: {repr(freq)} for '{key}'") key = self.ensure_int_key(key) self.freqs[key] = freq From 36320c9c6d412ee1f7c842a32203bf3482a53cc6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 17:35:46 +0100 Subject: [PATCH 161/297] Make ensure_int_key add to string store --- sense2vec/sense2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index cd13841..9a1bfaf 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -151,7 +151,7 @@ def ensure_int_key(self, key: Union[str, int]) -> int: key (unicode / int): The key. RETURNS (int): The integer key. """ - return key if isinstance(key, int) else self.strings[key] + return key if isinstance(key, int) else self.strings.add(key) def similarity( self, From f3ff85c2acb625b7f67c3e163c80f3ba1560c60b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 17:36:17 +0100 Subject: [PATCH 162/297] Add alternative scripts for fastText [ci skip] --- README.md | 35 ++++++----- scripts/04_fasttext_train_vectors.py | 86 ++++++++++++++++++++++++++++ scripts/05_export.py | 14 ++++- 3 files changed, 117 insertions(+), 18 deletions(-) create mode 100644 scripts/04_fasttext_train_vectors.py diff --git a/README.md b/README.md index 743cec7..0958627 100644 --- a/README.md +++ b/README.md @@ -25,8 +25,10 @@ models. - spaCy **pipeline component** and **extension attributes**. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. -- **Train your own vectors** using a pretrained spaCy model and raw text of your - choice ([details](#-training-your-own-sense2vec-vectors)). +- **Train your own vectors** using a pretrained spaCy model, raw text of your + choice and [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via + [fastText](https://github.com/facebookresearch/fastText) + ([details](#-training-your-own-sense2vec-vectors)). - [Prodigy](https://prodi.gy) annotation recipes for creating lists of similar multi-word phrases and converting them to match patterns, e.g. for rule-based NER or to boostrap NER annotation ([details & examples](#-prodigy-recipes)). 
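To make the `set_freq` and `ensure_int_key` changes from the two patches above concrete, here is a rough usage sketch. The key and count are invented for illustration, and only the constructor and methods shown in `sense2vec.py` are assumed.

```python
from sense2vec import Sense2Vec

s2v = Sense2Vec(shape=(10, 128))
# ensure_int_key now interns unseen string keys in the StringStore, so setting
# a frequency for a key that hasn't been added as a vector yet still works
s2v.set_freq("beekeeping|NOUN", 500)
# a non-integer count now raises a ValueError instead of being stored silently
# s2v.set_freq("beekeeping|NOUN", "500")  # ValueError
```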
@@ -652,24 +654,27 @@ To train your own sense2vec vectors, you'll need the following: [syntax iterator for noun phrases](https://spacy.io/usage/adding-languages#syntax-iterators), you'll need to write your own. (The `doc.noun_chunks` and `doc.ents` are what sense2vec uses to determine what's a phrase.) -- [GloVe](https://github.com/stanfordnlp/GloVe) installed and built. You should - be able to clone the repo and run `make` in the directory. +- [GloVe](https://github.com/stanfordnlp/GloVe) or + [fastText](https://github.com/facebookresearch/fastText) installed and built. + You should be able to clone the repo and run `make` in the respective + directory. ### Step-by-step process The training process is split up into several steps to allow you to resume at any given point. Processing scripts are designed to operate on single files, -making it easy to paralellize the work. The scripts in this repo require -[Glove](https://github.com/stanfordnlp/GloVe), which you need to clone and -`make`. - -| | Script | Description | -| ------ | ---------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| **1.** | [`01_parse.py`](scripts/01_parse.py) | Use spaCy to parse the raw text and output binary collections of `Doc` objects (see [`DocBin`](https://spacy.io/api/docbin)). | -| **2.** | [`02_preprocess.py`](scripts/02_preprocess.py) | Load a collection of parsed `Doc` objects produced in the previous step and output text files in the sense2vec format (one sentence per line and merged phrases with senses). | -| **3.** | [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and counts. | -| **4.** | [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to train vectors. | -| **5.** | [`05_export.py`](scripts/05_export.py) | Load the vectors and frequencies and output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. | +making it easy to paralellize the work. The scripts in this repo require either +[Glove](https://github.com/stanfordnlp/GloVe) or +[fastText](https://github.com/facebookresearch/fastText), which you need to +clone and `make`. + +| | Script | Description | +| ------ | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| **1.** | [`01_parse.py`](scripts/01_parse.py) | Use spaCy to parse the raw text and output binary collections of `Doc` objects (see [`DocBin`](https://spacy.io/api/docbin)). | +| **2.** | [`02_preprocess.py`](scripts/02_preprocess.py) | Load a collection of parsed `Doc` objects produced in the previous step and output text files in the sense2vec format (one sentence per line and merged phrases with senses). | +| **3.** | [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and counts. Skip this step if you're using Word2Vec via [FastText](https://github.com/facebookresearch/fastText). | +| **4.** | [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py)
[`04_fasttext_train_vectors.py`](scripts/04_fasttext_train_vectors.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) or [FastText](https://github.com/facebookresearch/fastText) to train vectors. | +| **5.** | [`05_export.py`](scripts/05_export.py) | Load the vectors and frequencies and output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. | For more detailed documentation of the scripts, check out the source or run them with `--help`. For example, `python scripts/01_parse.py --help`. diff --git a/scripts/04_fasttext_train_vectors.py b/scripts/04_fasttext_train_vectors.py new file mode 100644 index 0000000..36311c5 --- /dev/null +++ b/scripts/04_fasttext_train_vectors.py @@ -0,0 +1,86 @@ +#!/usr/bin/env python +import plac +import os +from pathlib import Path +from wasabi import Printer + + +@plac.annotations( + fasttext_bin=("Path to the fasttext binary", "positional", None, str), + in_dir=("Directory with preprocessed .s2v files", "positional", None, str), + out_dir=("Path to output directory", "positional", None, str), + n_threads=("Number of threads", "option", "t", int), + min_count=("Minimum count for inclusion in vocab", "option", "c", int), + vector_size=("Dimension of word vector representations", "option", "s", int), + verbose=("Set verbosity: 0, 1, or 2", "option", "v", int), +) +def main( + fasttext_bin, + in_dir, + out_dir, + n_threads=10, + min_count=50, + vector_size=300, + verbose=2, +): + """ + Step 4: Train the vectors + + Expects a directory of preprocessed .s2v input files, will concatenate them + (using a temporary file on disk) and will use fastText to train a word2vec + model. See here for installation instructions: + https://github.com/facebookresearch/fastText + + Note that this script will call into fastText and expects you to pass in the + built fasttext binary. The command will also be printed if you want to run + it separately. 
+ """ + msg = Printer() + input_path = Path(in_dir) + output_path = Path(out_dir) + if not Path(fasttext_bin).exists(): + msg.fail("Can't find fastText binary", fasttext_bin, exits=1) + if not input_path.exists() or not input_path.is_dir(): + msg.fail("Not a valid input directory", in_dir, exits=1) + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory {out_dir}") + output_file = output_path / "vectors" + # fastText expects only one input file and only reads from disk and not + # stdin, so we need to create a temporary file that concatenates the inputs + tmp_path = input_path / "s2v_input.tmp" + input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"] + if not input_files: + msg.fail("Input directory contains no .s2v files", in_dir, exits=1) + with tmp_path.open("a", encoding="utf8") as tmp_file: + for input_file in input_files: + with input_file.open("r", encoding="utf8") as f: + tmp_file.write(f.read()) + msg.info("Created temporary merged input file", tmp_path) + + msg.info("Training vectors") + cmd = ( + f"{fasttext_bin} skipgram -thread {n_threads} -input {tmp_path} " + f"-output {output_file} -dim {vector_size} -minn 0 -maxn 0 " + f"-minCount {min_count} -verbose {verbose}" + ) + print(cmd) + train_cmd = os.system(cmd) + tmp_path.unlink() + msg.good("Deleted temporary input file", tmp_path) + if train_cmd != 0: + msg.fail("Failed training vectors", exits=1) + msg.good("Successfully trained vectors", out_dir) + + msg.info("Creating vocabulary") + vocab_file = output_path / "vocab.txt" + cmd = f"{fasttext_bin} dump {output_file.with_suffix('.bin')} dict > {vocab_file}" + print(cmd) + vocab_cmd = os.system(cmd) + if vocab_cmd != 0: + msg.fail("Failed creating vocabulary", exits=1) + msg.good("Successfully created vocabulary file", vocab_file) + + +if __name__ == "__main__": + plac.call(main) diff --git a/scripts/05_export.py b/scripts/05_export.py index 13e132f..c83f520 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -23,7 +23,7 @@ def _get_shape(file_): @plac.annotations( - in_file=("Vectors file", "positional", None, str), + in_file=("Vectors file (text-based)", "positional", None, str), vocab_file=("Vocabulary file", "positional", None, str), out_dir=("Path to output directory", "positional", None, str), ) @@ -40,6 +40,8 @@ def main(in_file, vocab_file, out_dir): output_path = Path(out_dir) if not input_path.exists(): msg.fail("Can't find input file", in_file, exits=1) + if input_path.suffix == ".bin": + msg.fail("Need text-based vectors file, not binary", in_file, exits=1) if not vocab_path.exists(): msg.fail("Can't find vocab file", vocab_file, exits=1) if not output_path.exists(): @@ -68,8 +70,14 @@ def main(in_file, vocab_file, out_dir): for key, vector in data: s2v.add(key, vector) for item in vocab_data: - key, freq = item.rstrip().rsplit(" ", 1) - s2v.set_freq(key, freq) + item = item.rstrip() + if item.endswith(" word"): # for fastText vocabs + item = item[:-5] + try: + key, freq = item.rsplit(" ", 1) + except ValueError: + continue + s2v.set_freq(key, int(freq)) msg.good("Created the sense2vec model") msg.info(f"{len(data)} vectors, {len(all_senses)} total senses") s2v.to_disk(output_path) From 40fdbaf5418a7a5274b6e1b5e800f6e2590bf86d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 17:38:16 +0100 Subject: [PATCH 163/297] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 0958627..ab72a43 100644 
--- a/README.md +++ b/README.md @@ -25,8 +25,8 @@ models. - spaCy **pipeline component** and **extension attributes**. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. -- **Train your own vectors** using a pretrained spaCy model, raw text of your - choice and [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via +- **Train your own vectors** using a pretrained spaCy model, raw text and + [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via [fastText](https://github.com/facebookresearch/fastText) ([details](#-training-your-own-sense2vec-vectors)). - [Prodigy](https://prodi.gy) annotation recipes for creating lists of similar From 7bec6612c294ea21c56ea11b81e138a06cb21235 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 17:40:37 +0100 Subject: [PATCH 164/297] Increment version [ci skip] --- README.md | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ab72a43..8ecac16 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ most_similar = doc[3:6]._.s2v_most_similar(3) sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a4 +pip install sense2vec==1.0.0a5 ``` The Reddit vectors model is attached to diff --git a/setup.cfg b/setup.cfg index a21b297..92cbe91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a4 +version = 1.0.0a5 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From 4ff172ebd577d90710a00874322a295efeb2617f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 18:31:33 +0100 Subject: [PATCH 165/297] Add type and dimensions to vector filename [ci skip] --- scripts/04_fasttext_train_vectors.py | 2 +- scripts/04_glove_train_vectors.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/04_fasttext_train_vectors.py b/scripts/04_fasttext_train_vectors.py index 36311c5..d287d76 100644 --- a/scripts/04_fasttext_train_vectors.py +++ b/scripts/04_fasttext_train_vectors.py @@ -45,7 +45,7 @@ def main( if not output_path.exists(): output_path.mkdir(parents=True) msg.good(f"Created output directory {out_dir}") - output_file = output_path / "vectors" + output_file = output_path / f"vectors_w2v_{vector_size}dim" # fastText expects only one input file and only reads from disk and not # stdin, so we need to create a temporary file that concatenates the inputs tmp_path = input_path / "s2v_input.tmp" diff --git a/scripts/04_glove_train_vectors.py b/scripts/04_glove_train_vectors.py index 228afc9..1f9b19d 100644 --- a/scripts/04_glove_train_vectors.py +++ b/scripts/04_glove_train_vectors.py @@ -48,7 +48,7 @@ def main( if not output_path.exists(): output_path.mkdir(parents=True) msg.good(f"Created output directory {out_dir}") - output_file = output_path / "vectors" + output_file = output_path / f"vectors_glove_{vector_size}dim" msg.info("Training vectors") cmd = ( f"{glove_dir}/glove -save-file {output_file} -threads {n_threads} " From 832f9440d2a6f7117b5e8e3a2a653400c375caba Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 18:57:30 +0100 Subject: [PATCH 166/297] Add exclude_senses and batch_size setting --- sense2vec/prodigy_recipes.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 909a0a2..3b39eed 100644 --- a/sense2vec/prodigy_recipes.py +++ 
b/sense2vec/prodigy_recipes.py @@ -158,8 +158,10 @@ def to_patterns( vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), strategy=("Example selection strategy", "option", "st", str,), senses=("The senses to use (all if not set)", "option", "s", split_string), + exclude_senses=("The senses to exclude", "option", "es", split_string), n_freq=("Number of most frequent entries to limit to", "option", "f", int), threshold=("Similarity threshold to consider examples", "option", "t", float), + batch_size=("The batch size to use", "option", "b", int), eval_whole=("Evaluate whole dataset instead of session", "flag", "E", bool), eval_only=("Don't annotate, only evaluate current set", "flag", "O", bool), show_scores=("Show all scores for debugging", "flag", "S", bool), @@ -169,8 +171,22 @@ def evaluate( vectors_path, strategy="most_similar", senses=None, + exclude_senses=( + "SYM", + "MONEY", + "ORDINAL", + "CARDINAL", + "DATE", + "TIME", + "PERCENT", + "QUANTITY", + "NUM", + "X", + "PUNCT", + ), n_freq=100_000, threshold=0.7, + batch_size=5, eval_whole=False, eval_only=False, show_scores=False, @@ -235,7 +251,7 @@ def get_stream(): keys_by_sense = defaultdict(set) for key in keys: sense = s2v.split_key(key)[1] - if senses is None or sense in senses: + if (senses is None or sense in senses) and sense not in exclude_senses: keys_by_sense[sense].add(key) keys_by_sense = {s: keys for s, keys in keys_by_sense.items() if len(keys) >= 3} all_senses = list(keys_by_sense.keys()) @@ -305,5 +321,9 @@ def on_exit(ctrl): "dataset": dataset, "stream": get_stream(), "on_exit": on_exit, - "config": {"choice_style": "single", "choice_auto_accept": True}, + "config": { + "batch_size": batch_size, + "choice_style": "single", + "choice_auto_accept": True, + }, } From d0f9f1a2a7e7c1b79ed72a097996fc06b9cdd3ee Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 2 Nov 2019 18:57:42 +0100 Subject: [PATCH 167/297] Adjust most_similar settings --- sense2vec/prodigy_recipes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 3b39eed..5102963 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -263,14 +263,14 @@ def get_stream(): sense = random.choice(all_senses) if strategy == "most_similar": key_a = random.choice(list(current_keys[sense])) - most_similar = s2v.most_similar(key_a, n=100) + most_similar = s2v.most_similar(key_a, n=200) options = [] for key, score in most_similar: if key in current_keys[sense]: options.append((key, score)) if len(options) < 2: continue - key_b, sim_ab = options[len(options) // 2] + key_b, sim_ab = options[len(options) // 4] key_c, sim_ac = options[-1] else: key_a, key_b, key_c = random.sample(current_keys[sense], 3) From 040d900ee6b2fc1ccd266fe060495d017e1ef3c2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 17:41:26 +0100 Subject: [PATCH 168/297] Update Prodigy recipes [ci skip] --- README.md | 131 +++++++--- sense2vec/prodigy_recipes.py | 465 ++++++++++++++++++++++++++++------- setup.cfg | 6 +- 3 files changed, 487 insertions(+), 115 deletions(-) diff --git a/README.md b/README.md index b495a56..2b4865a 100644 --- a/README.md +++ b/README.md @@ -685,7 +685,15 @@ This package also seamlessly integrates with the [Prodigy](https://prodi.gy) annotation tool and exposes recipes for using sense2vec vectors to quickly generate lists of multi-word phrases and bootstrap NER annotations. 
To use a recipe, `sense2vec` needs to be installed in the same environment as Prodigy. -The following recipes are available: +The following recipes are available – see below for more detailed docs. + +| Recipe | Description | +| ------------------------------------------------------------------- | -------------------------------------------------------------------- | +| [`sense2vec.teach`](#recipe-sense2vecteach) | Bootstrap a terminology list using sense2vec. | +| [`sense2vec.to-patterns`](#recipe-sense2vecto-patterns) | Convert phrases dataset to token-based match patterns. | +| [`sense2vec.eval`](#recipe-sense2veceval) | Evaluate a sense2vec model by asking about phrase triples. | +| [`sense2vec.eval-most-similar`](#recipe-sense2veceval-most-similar) | Evaluate a sense2vec model by correcting the most similar entries. | +| [`sense2vec.eval-ab`](#recipe-sense2veceval-ab) | Perform an A/B evaluation of two pretrained sense2vec vector models. | ### recipe `sense2vec.teach` @@ -718,8 +726,8 @@ prodigy sense2vec.teach tech_phrases /path/to/sense2vec_vectors ### recipe `sense2vec.to-patterns` -Convert a list of seed phrases to a list of token-based match patterns that can -be used with +Convert a dataset of phrases collected with `sense2vec.teach` to token-based +match patterns that can be used with [spaCy's `EntityRuler`](https://spacy.io/usage/rule-based-matching#entityruler) or recipes like `ner.match`. If no output file is specified, the patterns are written to stdout. The examples are tokenized so that multi-token terms are @@ -747,45 +755,112 @@ prodigy sense2vec.to-patterns tech_phrases en_core_web_sm TECHNOLOGY --output-file /path/to/patterns.jsonl ``` -### recipe `sense2vec.evaluate` +### recipe `sense2vec.eval` -Evaluate a word vectors model by asking providing questions triples: is word A -more similar to word B, or to word C? If the human mostly agrees with the model, -the vectors model is good. The recipe will only ask about vectors with the same +Evaluate a sense2vec model by asking about phrase triples: is word A more +similar to word B, or to word C? If the human mostly agrees with the model, the +vectors model is good. The recipe will only ask about vectors with the same sense and supports different example selection strategies. ```bash -prodigy sense2vec.evaluate [dataset] [vectors_path] [--strategy] [--senses] -[--n-freq] [--threshold] [--eval-whole] [--eval-only] [--show-scores] -``` - -| Argument | Type | Description | -| --------------------- | ---------- | ------------------------------------------------------------------------------------------------------------- | -| `dataset` | positional | Dataset to save annotations to. | -| `vectors_path` | positional | Path to pretrained sense2vec vectors. | -| `--strategy`, `-st` | option | Example selection strategy. `most similar` (default) or `random`. | -| `--senses`, `-s` | option | Comma-separated list of senses to limit the selection to. If not set, all senses in the vectors will be used. | -| `--n-freq`, `-n` | option | Number of most frequent entries to limit to. | -| `--threshold`, `-t` | option | Minimum similarity threshold to consider examples. | -| `--eval-whole`, `-E` | flag | Evaluate the whole dataset instead of the current session. | -| `--eval-only`, `-O` | flag | Don't annotate, only evaluate the current dataset. | -| `--show-scores`, `-S` | flag | Show all scores for debugging. 
| +prodigy sense2vec.eval [dataset] [vectors_path] [--strategy] [--senses] +[--exclude-senses] [--n-freq] [--threshold] [--batch-size] [--eval-whole] +[--eval-only] [--show-scores] +``` + +| Argument | Type | Description | +| ------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------- | +| `dataset` | positional | Dataset to save annotations to. | +| `vectors_path` | positional | Path to pretrained sense2vec vectors. | +| `--strategy`, `-st` | option | Example selection strategy. `most similar` (default) or `random`. | +| `--senses`, `-s` | option | Comma-separated list of senses to limit the selection to. If not set, all senses in the vectors will be used. | +| `--exclude-senses`, `-es` | option | Comma-separated list of senses to exclude. See `prodigy_recipes.EVAL_EXCLUDE_SENSES` fro the defaults. | +| `--n-freq`, `-f` | option | Number of most frequent entries to limit to. | +| `--threshold`, `-t` | option | Minimum similarity threshold to consider examples. | +| `--batch-size`, `-b` | option | Batch size to use. | +| `--eval-whole`, `-E` | flag | Evaluate the whole dataset instead of the current session. | +| `--eval-only`, `-O` | flag | Don't annotate, only evaluate the current dataset. | +| `--show-scores`, `-S` | flag | Show all scores for debugging. | #### Strategies -| Name | Description | -| -------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `most_similar` | Pick a random word from a random sense and get its most similar entries of the same sense. Ask about the similarity to the last and middle entry from that selection. | -| `random` | Pick a random sample of 3 words from the same random sense. | +| Name | Description | +| -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `most_similar` | Pick a random word from a random sense and get its most similar entries of the same sense. Ask about the similarity to the last and middle entry from that selection. | +| `most_least_similar` | Pick a random word from a random sense and get its least similar entry and then the least similar entry of that. | +| `random` | Pick a random sample of 3 words from the same random sense. | #### Example ```bash -prodigy sense2vec.evaluate vectors_eval /path/to/sense2vec_vectors +prodigy sense2vec.eval vectors_eval /path/to/sense2vec_vectors --senses NOUN,ORG,PRODUCT --threshold 0.5 ``` -![UI preview of sense2vec.evaluate](https://user-images.githubusercontent.com/13643239/67994212-668cf400-fc44-11e9-8fe2-bf264ae32b0a.png) +![UI preview of sense2vec.eval](https://user-images.githubusercontent.com/13643239/67994212-668cf400-fc44-11e9-8fe2-bf264ae32b0a.png) + +### recipe `sense2vec.eval-most-similar` + +Evaluate a vectors model by looking at the most similar entries it returns for a +random phrase and unselecting the mistakes. 
+
+```bash
+prodigy sense2vec.eval-most-similar [dataset] [vectors_path] [--senses]
+[--exclude-senses] [--n-freq] [--n-similar] [--batch-size] [--eval-whole]
+[--eval-only] [--show-scores]
+```
+
+| Argument | Type | Description |
+| ------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
+| `dataset` | positional | Dataset to save annotations to. |
+| `vectors_path` | positional | Path to pretrained sense2vec vectors. |
+| `--senses`, `-s` | option | Comma-separated list of senses to limit the selection to. If not set, all senses in the vectors will be used. |
+| `--exclude-senses`, `-es` | option | Comma-separated list of senses to exclude. See `prodigy_recipes.EVAL_EXCLUDE_SENSES` for the defaults. |
+| `--n-freq`, `-f` | option | Number of most frequent entries to limit to. |
+| `--n-similar`, `-n` | option | Number of similar items to check. Defaults to `10`. |
+| `--batch-size`, `-b` | option | Batch size to use. |
+| `--eval-whole`, `-E` | flag | Evaluate the whole dataset instead of the current session. |
+| `--eval-only`, `-O` | flag | Don't annotate, only evaluate the current dataset. |
+| `--show-scores`, `-S` | flag | Show all scores for debugging. |
+
+```bash
+prodigy sense2vec.eval-most-similar vectors_eval_sim /path/to/sense2vec_vectors
+--senses NOUN,ORG,PRODUCT
+```
+
+### recipe `sense2vec.eval-ab`
+
+Perform an A/B evaluation of two pretrained sense2vec vector models by comparing
+the most similar entries they return for a random phrase. The UI shows two
+randomized options with the most similar entries of each model and highlights
+the phrases that differ. At the end of the annotation session the overall stats
+and preferred model are shown.
+
+```bash
+prodigy sense2vec.eval-ab [dataset] [vectors_path_a] [vectors_path_b] [--senses]
+[--exclude-senses] [--n-freq] [--n-similar] [--batch-size] [--eval-whole]
+[--eval-only] [--show-mapping]
+```
+
+| Argument | Type | Description |
+| ------------------------- | ---------- | ------------------------------------------------------------------------------------------------------------- |
+| `dataset` | positional | Dataset to save annotations to. |
+| `vectors_path_a` | positional | Path to pretrained sense2vec vectors. |
+| `vectors_path_b` | positional | Path to pretrained sense2vec vectors. |
+| `--senses`, `-s` | option | Comma-separated list of senses to limit the selection to. If not set, all senses in the vectors will be used. |
+| `--exclude-senses`, `-es` | option | Comma-separated list of senses to exclude. See `prodigy_recipes.EVAL_EXCLUDE_SENSES` for the defaults. |
+| `--n-freq`, `-f` | option | Number of most frequent entries to limit to. |
+| `--n-similar`, `-n` | option | Number of similar items to check. Defaults to `10`. |
+| `--batch-size`, `-b` | option | Batch size to use. |
+| `--eval-whole`, `-E` | flag | Evaluate the whole dataset instead of the current session. |
+| `--eval-only`, `-O` | flag | Don't annotate, only evaluate the current dataset. |
+| `--show-mapping`, `-S` | flag | Show which models are option 1 and option 2 in the UI (for debugging).
| + +```bash +prodigy sense2vec.eval-ab vectors_eval_sim /path/to/sense2vec_vectors_a /path/to/sense2vec_vectors_b --senses NOUN,ORG,PRODUCT +``` + +![UI preview of sense2vec.eval-ab](https://user-images.githubusercontent.com/13643239/68088514-46d21780-fe60-11e9-9b29-fe313bb2154d.png) ## Pretrained vectors diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 5102963..db23cf8 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -1,14 +1,22 @@ import prodigy from prodigy.components.db import connect -from prodigy.util import log, split_string, set_hashes, TASK_HASH_ATTR +from prodigy.util import log, split_string, set_hashes, TASK_HASH_ATTR, INPUT_HASH_ATTR import murmurhash from sense2vec import Sense2Vec import srsly import spacy import random from wasabi import Printer -from collections import defaultdict +from collections import defaultdict, Counter import copy +import catalogue + + +# fmt: off +eval_strategies = catalogue.create("prodigy", "sense2vec.eval") +EVAL_EXCLUDE_SENSES = ("SYM", "MONEY", "ORDINAL", "CARDINAL", "DATE", "TIME", + "PERCENT", "QUANTITY", "NUM", "X", "PUNCT") +# fmt: on @prodigy.recipe( @@ -127,10 +135,11 @@ def to_patterns( dataset, spacy_model, label, output_file="-", case_sensitive=False, dry=False ): """ - Convert a list of seed phrases to a list of token-based match patterns that - can be used with spaCy's EntityRuler or recipes like ner.match. If no output - file is specified, the patterns are written to stdout. The examples are - tokenized so that multi-token terms are represented correctly, e.g.: + Convert a dataset of phrases collected with sense2vec.teach to token-based + match patterns that can be used with spaCy's EntityRuler or recipes like + ner.match. If no output file is specified, the patterns are written to + stdout. The examples are tokenized so that multi-token terms are represented + correctly, e.g.: {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]} """ log("RECIPE: Starting recipe sense2vec.to-patterns", locals()) @@ -153,7 +162,7 @@ def to_patterns( @prodigy.recipe( - "sense2vec.evaluate", + "sense2vec.eval", dataset=("Dataset to save annotations to", "positional", None, str), vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), strategy=("Example selection strategy", "option", "st", str,), @@ -171,73 +180,31 @@ def evaluate( vectors_path, strategy="most_similar", senses=None, - exclude_senses=( - "SYM", - "MONEY", - "ORDINAL", - "CARDINAL", - "DATE", - "TIME", - "PERCENT", - "QUANTITY", - "NUM", - "X", - "PUNCT", - ), + exclude_senses=EVAL_EXCLUDE_SENSES, n_freq=100_000, threshold=0.7, - batch_size=5, + batch_size=10, eval_whole=False, eval_only=False, show_scores=False, ): - """Evaluate a word vectors model by asking providing questions triples: - is word A more similar to word B, or to word C? If the human mostly agrees - with the model, the vectors model is good. + """ + Evaluate a sense2vec model by asking about phrase triples: is word A more + similar to word B, or to word C? If the human mostly agrees with the model, + the vectors model is good. """ msg = Printer() random.seed(0) - log("RECIPE: Starting recipe sense2vec.evaluate", locals()) - strategies = ["random", "most_similar"] - if strategy not in strategies: - msg.fail(f"Invalid strategy '{strategy}'. 
Expected: {strategies}", exits=1) + log("RECIPE: Starting recipe sense2vec.eval", locals()) + strategies = eval_strategies.get_all() + if strategy not in strategies.keys(): + err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}" + msg.fail(err, exits=1) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) - def eval_dataset(set_id): - """Output summary about user agreement with the model.""" - db = connect() - data = db.get_dataset(set_id) - accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] - rejected = [eg for eg in data if eg["answer"] == "reject"] - if not accepted and not rejected: - msg.warn("No annotations collected", exits=1) - high_conf = 0.8 - agree_count = 0 - disagree_high_conf = len([e for e in rejected if e["confidence"] > high_conf]) - for eg in accepted: - choice = eg["accept"][0] - score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] - score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] - if score_choice > score_other: - agree_count += 1 - elif eg["confidence"] > high_conf: - disagree_high_conf += 1 - pc = agree_count / (len(accepted) + len(rejected)) - text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" - msg.info(f"Evaluating data from '{set_id}'") - if pc > 0.5: - msg.good(text) - else: - msg.fail(text) - msg.text(f"You disagreed on {disagree_high_conf} high confidence scores") - msg.text(f"You rejected {len(rejected)} suggestions as not similar") - - if eval_only: - eval_dataset(dataset) - return None - - def get_html(word, sense, score=None, large=False): + def get_html(key, score=None, large=False): + word, sense = s2v.split_key(key) html_word = f"{word}" html_sense = f"{sense}" html = f"{html_word} {html_sense}" @@ -246,64 +213,59 @@ def get_html(word, sense, score=None, large=False): return html def get_stream(): + strategy_func = eval_strategies.get(strategy) + log(f"RECIPE: Using strategy {strategy}") # Limit to most frequent entries keys = [key for key, _ in s2v.frequencies[:n_freq]] keys_by_sense = defaultdict(set) for key in keys: - sense = s2v.split_key(key)[1] + try: + sense = s2v.split_key(key)[1] + except ValueError: + continue if (senses is None or sense in senses) and sense not in exclude_senses: keys_by_sense[sense].add(key) keys_by_sense = {s: keys for s, keys in keys_by_sense.items() if len(keys) >= 3} all_senses = list(keys_by_sense.keys()) total_keys = sum(len(keys) for keys in keys_by_sense.values()) log(f"RECIPE: Using {total_keys} entries for {len(all_senses)} senses") + n_passes = 1 while True: + log(f"RECIPE: Iterating over the data ({n_passes})") current_keys = copy.deepcopy(keys_by_sense) while any(len(values) >= 3 for values in current_keys.values()): sense = random.choice(all_senses) - if strategy == "most_similar": - key_a = random.choice(list(current_keys[sense])) - most_similar = s2v.most_similar(key_a, n=200) - options = [] - for key, score in most_similar: - if key in current_keys[sense]: - options.append((key, score)) - if len(options) < 2: - continue - key_b, sim_ab = options[len(options) // 4] - key_c, sim_ac = options[-1] - else: - key_a, key_b, key_c = random.sample(current_keys[sense], 3) - sim_ab = s2v.similarity(key_a, key_b) - sim_ac = s2v.similarity(key_a, key_c) + all_keys = list(current_keys[sense]) + key_a, key_b, key_c, sim_ab, sim_ac = strategy_func(s2v, all_keys) if len(set([key_a.lower(), key_b.lower(), key_c.lower()])) != 3: continue if sim_ab < threshold or sim_ac < 
threshold: continue - current_keys[sense].remove(key_a) - current_keys[sense].remove(key_b) - current_keys[sense].remove(key_c) + for key in (key_a, key_b, key_c): + current_keys[sense].remove(key) confidence = 1.0 - (min(sim_ab, sim_ac) / max(sim_ab, sim_ac)) - # Get a more representative hash + input_hash = murmurhash.hash(key_a) task_hash = murmurhash.hash(" ".join([key_a] + sorted([key_b, key_c]))) task = { "label": "Which one is more similar?", - "html": get_html(*s2v.split_key(key_a), large=True), + "html": get_html(key_a, large=True), + "text": f"{key_a}: {key_b}, {key_c}", "key": key_a, "options": [ { "id": key_b, - "html": get_html(*s2v.split_key(key_b), sim_ab), + "html": get_html(key_b, sim_ab), "score": sim_ab, }, { "id": key_c, - "html": get_html(*s2v.split_key(key_c), sim_ac), + "html": get_html(key_c, sim_ac), "score": sim_ac, }, ], "confidence": confidence, TASK_HASH_ATTR: task_hash, + INPUT_HASH_ATTR: input_hash, } if show_scores: task["meta"] = { @@ -311,11 +273,344 @@ def get_stream(): "strategy": strategy, } yield task + n_passes += 1 + + def eval_dataset(set_id): + """Output summary about user agreement with the model.""" + DB = connect() + data = DB.get_dataset(set_id) + accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] + rejected = [eg for eg in data if eg["answer"] == "reject"] + if not accepted and not rejected: + msg.warn("No annotations collected", exits=1) + high_conf = 0.8 + agree_count = 0 + disagree_high_conf = len([e for e in rejected if e["confidence"] > high_conf]) + for eg in accepted: + choice = eg["accept"][0] + score_choice = [o["score"] for o in eg["options"] if o["id"] == choice][0] + score_other = [o["score"] for o in eg["options"] if o["id"] != choice][0] + if score_choice > score_other: + agree_count += 1 + elif eg["confidence"] > high_conf: + disagree_high_conf += 1 + pc = agree_count / (len(accepted) + len(rejected)) + text = f"You agreed {agree_count} / {len(data)} times ({pc:.0%})" + msg.info(f"Evaluating data from '{set_id}'") + if pc > 0.5: + msg.good(text) + else: + msg.fail(text) + msg.text(f"You disagreed on {disagree_high_conf} high confidence scores") + msg.text(f"You rejected {len(rejected)} suggestions as not similar") def on_exit(ctrl): set_id = dataset if eval_whole else ctrl.session_id eval_dataset(set_id) + if eval_only: + eval_dataset(dataset) + return None + + return { + "view_id": "choice", + "dataset": dataset, + "stream": get_stream(), + "on_exit": on_exit, + "config": { + "batch_size": batch_size, + "choice_style": "single", + "choice_auto_accept": True, + }, + } + + +@eval_strategies.register("random") +def eval_strategy_random(s2v, keys): + key_a, key_b, key_c = random.sample(keys, 3) + sim_ab = s2v.similarity(key_a, key_b) + sim_ac = s2v.similarity(key_a, key_c) + return key_a, key_b, key_c, sim_ab, sim_ac + + +@eval_strategies.register("most_similar") +def eval_strategy_most_similar(s2v, keys): + key_a = random.choice(keys) + most_similar = s2v.most_similar(key_a, n=min(2000, len(s2v))) + options = [(key, score) for key, score in most_similar if key in keys] + if len(options) < 2: + return eval_strategy_most_similar(s2v, keys) + key_b, sim_ab = options[len(options) // 2] + key_c, sim_ac = options[-1] + return key_a, key_b, key_c, sim_ab, sim_ac + + +@eval_strategies.register("most_least_similar") +def eval_strategy_most_least_similar(s2v, keys): + n_similar = 100 + key_a = random.choice(keys) + most_similar_a = s2v.most_similar(key_a, n=n_similar) + options_a = [(key, score) for key, score 
in most_similar_a if key in keys] + if len(options_a) < 1: + return eval_strategy_most_least_similar(s2v, keys) + key_b, sim_ab = options_a[-1] + most_similar_b = s2v.most_similar(key_b, n=n_similar) + options_b = [(key, score) for key, score in most_similar_b if key in keys] + if len(options_b) < 1: + return eval_strategy_most_least_similar(s2v, keys) + key_c, sim_ac = options_b[-1] + return key_a, key_b, key_c, sim_ab, sim_ac + + +@prodigy.recipe( + "sense2vec.eval-most-similar", + dataset=("Dataset to save annotations to", "positional", None, str), + vectors_path=("Path to pretrained sense2vec vectors", "positional", None, str), + senses=("The senses to use (all if not set)", "option", "s", split_string), + exclude_senses=("The senses to exclude", "option", "es", split_string), + n_freq=("Number of most frequent entries to limit to", "option", "f", int), + n_similar=("Number of similar items to check", "option", "n", int), + batch_size=("The batch size to use", "option", "b", int), + eval_whole=("Evaluate whole dataset instead of session", "flag", "E", bool), + eval_only=("Don't annotate, only evaluate current set", "flag", "O", bool), + show_scores=("Show all scores for debugging", "flag", "S", bool), +) +def eval_most_similar( + dataset, + vectors_path, + senses=None, + exclude_senses=EVAL_EXCLUDE_SENSES, + n_freq=100_000, + n_similar=10, + batch_size=5, + eval_whole=False, + eval_only=False, + show_scores=False, +): + """ + Evaluate a vectors model by looking at the most similar entries it returns + for a random phrase and unselecting the mistakes. + """ + log("RECIPE: Starting recipe sense2vec.eval-most-similar", locals()) + msg = Printer() + random.seed(0) + s2v = Sense2Vec().from_disk(vectors_path) + log("RECIPE: Loaded sense2vec vectors", vectors_path) + seen = set() + DB = connect() + if dataset in DB: + examples = DB.get_dataset(dataset) + seen.update([eg["text"] for eg in examples if eg["answer"] == "accept"]) + log(f"RECIPE: Skipping {len(seen)} terms already in dataset") + + def get_html(key, score=None, large=False): + word, sense = s2v.split_key(key) + html_word = f"{word}" + html_sense = f"{sense}" + html = f"{html_word} {html_sense}" + if show_scores and score is not None: + html += f" {score:.4}" + return html + + def get_stream(): + keys = [key for key, _ in s2v.frequencies[:n_freq] if key not in seen] + while len(keys): + key = random.choice(keys) + keys.remove(key) + word, sense = s2v.split_key(key) + if sense in exclude_senses or (senses is not None and sense not in senses): + continue + most_similar = s2v.most_similar(key, n=n_similar) + options = [{"id": k, "html": get_html(k, s)} for k, s in most_similar] + task_hash = murmurhash.hash(key) + task = { + "html": get_html(key, large=True), + "text": key, + "options": options, + "accept": [key for key, _ in most_similar], # pre-select all + TASK_HASH_ATTR: task_hash, + INPUT_HASH_ATTR: task_hash, + } + yield task + + def eval_dataset(set_id): + DB = connect() + data = DB.get_dataset(set_id) + accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] + rejected = [eg for eg in data if eg["answer"] == "reject"] + ignored = [eg for eg in data if eg["answer"] == "ignore"] + if not accepted and not rejected: + msg.warn("No annotations collected", exits=1) + total_count = 0 + agree_count = 0 + for eg in accepted: + total_count += len(eg.get("options", [])) + agree_count += len(eg.get("accept", [])) + msg.info(f"Evaluating data from '{set_id}'") + msg.text(f"You rejected {len(rejected)} and ignored 
{len(ignored)} pair(s)") + pc = agree_count / total_count + text = f"You agreed {agree_count} / {total_count} times ({pc:.0%})" + if pc > 0.5: + msg.good(text) + else: + msg.fail(text) + + def on_exit(ctrl): + set_id = dataset if eval_whole else ctrl.session_id + eval_dataset(set_id) + + if eval_only: + eval_dataset(dataset) + return None + + return { + "view_id": "choice", + "dataset": dataset, + "stream": get_stream(), + "on_exit": on_exit, + "config": {"choice_style": "multiple", "batch_size": batch_size}, + } + + +@prodigy.recipe( + "sense2vec.eval-ab", + dataset=("Dataset to save annotations to", "positional", None, str), + vectors_path_a=("Path to pretrained sense2vec vectors", "positional", None, str), + vectors_path_b=("Path to pretrained sense2vec vectors", "positional", None, str), + senses=("The senses to use (all if not set)", "option", "s", split_string), + exclude_senses=("The senses to exclude", "option", "es", split_string), + n_freq=("Number of most frequent entries to limit to", "option", "f", int), + batch_size=("The batch size to use", "option", "b", int), + eval_whole=("Evaluate whole dataset instead of session", "flag", "E", bool), + eval_only=("Don't annotate, only evaluate current set", "flag", "O", bool), + show_mapping=("Show A/B mapping for debugging", "flag", "S", bool), +) +def eval_ab( + dataset, + vectors_path_a, + vectors_path_b, + senses=None, + exclude_senses=EVAL_EXCLUDE_SENSES, + n_freq=100_000, + n_similar=10, + batch_size=5, + eval_whole=False, + eval_only=False, + show_mapping=False, +): + """ + Perform an A/B evaluation of two pretrained sense2vec vector models by + comparing the most similar entries they return for a random phrase. The + UI shows two randomized options with the most similar entries of each model + and highlights the phrases that differ. At the end of the annotation + session the overall stats and preferred model are shown. + """ + log("RECIPE: Starting recipe sense2vec.eval-ab", locals()) + msg = Printer() + random.seed(0) + s2v_a = Sense2Vec().from_disk(vectors_path_a) + s2v_b = Sense2Vec().from_disk(vectors_path_b) + mapping = {"A": vectors_path_a, "B": vectors_path_b} + log("RECIPE: Loaded sense2vec vectors", (vectors_path_a, vectors_path_b)) + seen = set() + DB = connect() + if dataset in DB: + examples = DB.get_dataset(dataset) + seen.update([eg["text"] for eg in examples if eg["answer"] == "accept"]) + log(f"RECIPE: Skipping {len(seen)} terms already in dataset") + + def get_term_html(key): + word, sense = s2v_a.split_key(key) + return ( + f"{word} " + f"{sense}" + ) + + def get_option_html(most_similar, overlap): + html = [] + for key in most_similar: + font_weight = "normal" if key in overlap else "bold" + border_color = "#f6f6f6" if key in overlap else "#ccc" + word, sense = s2v_a.split_key(key) + html.append( + f"{word} {sense}" + ) + html = " ".join(html) if html else "No results" + return ( + f"
{html}
" + ) + + def get_stream(): + keys_a = [key for key, _ in s2v_a.frequencies[:n_freq] if key not in seen] + keys_b = [key for key, _ in s2v_b.frequencies[:n_freq] if key not in seen] + while len(keys_a): + key = random.choice(keys_a) + keys_a.remove(key) + word, sense = s2v_a.split_key(key) + if sense in exclude_senses or (senses is not None and sense not in senses): + continue + if key not in keys_b: + continue + similar_a = set(s2v_a.most_similar(key, n=n_similar)) + similar_b = set(s2v_b.most_similar(key, n=n_similar)) + overlap = similar_a.intersection(similar_b) + options = [ + {"id": "A", "html": get_option_html(similar_a, overlap)}, + {"id": "B", "html": get_option_html(similar_b, overlap)}, + ] + random.shuffle(options) + task_hash = murmurhash.hash(key) + task = { + "html": get_term_html(key), + "text": key, + "options": options, + TASK_HASH_ATTR: task_hash, + INPUT_HASH_ATTR: task_hash, + } + if show_mapping: + task["meta"] = { + i + 1: f"{opt['id']} ({mapping[opt['id']]})" + for i, opt in enumerate(options) + } + yield task + + def eval_dataset(set_id): + DB = connect() + data = DB.get_dataset(set_id) + accepted = [eg for eg in data if eg["answer"] == "accept" and eg.get("accept")] + rejected = [eg for eg in data if eg["answer"] == "reject"] + ignored = [eg for eg in data if eg["answer"] == "ignore"] + if not accepted and not rejected: + msg.warn("No annotations collected", exits=1) + counts = Counter() + for eg in accepted: + for model_id in eg["accept"]: + counts[model_id] += 1 + preference = max(counts) + ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}" + msg.info(f"Evaluating data from {set_id}") + msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)") + if counts["A"] == counts["B"]: + msg.warn(f"No preference ({ratio})") + else: + msg.good(f"You preferred vectors {preference} ({ratio})") + msg.text(mapping[preference]) + + def on_exit(ctrl): + set_id = dataset if eval_whole else ctrl.session_id + eval_dataset(set_id) + + if eval_only: + eval_dataset(dataset) + return None + return { "view_id": "choice", "dataset": dataset, diff --git a/setup.cfg b/setup.cfg index b1957f7..5f27f27 100644 --- a/setup.cfg +++ b/setup.cfg @@ -38,8 +38,10 @@ spacy_factories = sense2vec = sense2vec:Sense2VecComponent.from_nlp prodigy_recipes = sense2vec.teach = sense2vec:prodigy_recipes.teach - sens2vec.to_patterns = sense2vec:prodigy_recipes.to_patterns - sens2vec.evaluate = sense2vec:prodigy_recipes.evaluate + sens2vec.to-patterns = sense2vec:prodigy_recipes.to_patterns + sense2vec.eval = sense2vec:prodigy_recipes.evaluate + sense2vec.eval-most-similar = sense2vec:prodigy_recipes.eval_most_similar + sense2vec.eval-ab = sense2vec:prodigy_recipes.eval_ab [bdist_wheel] universal = true From 3e8420103ec63d06849f1483194dd30bbaa0bd8d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 17:56:04 +0100 Subject: [PATCH 169/297] Tidy up --- sense2vec/prodigy_recipes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index db23cf8..4cd3d3c 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -575,10 +575,8 @@ def get_stream(): INPUT_HASH_ATTR: task_hash, } if show_mapping: - task["meta"] = { - i + 1: f"{opt['id']} ({mapping[opt['id']]})" - for i, opt in enumerate(options) - } + opt_map = [f"{opt['id']} ({mapping[opt['id']]})" for opt in options] + task["meta"] = {i + 1: opt for i, opt in enumerate(opt_map)} yield task def 
eval_dataset(set_id): From 108471adc38e61726ade2af33fa5c2b41527d9f4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 17:56:14 +0100 Subject: [PATCH 170/297] Fix key list bug [ci skip] --- sense2vec/prodigy_recipes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 4cd3d3c..ce627e0 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -558,8 +558,8 @@ def get_stream(): continue if key not in keys_b: continue - similar_a = set(s2v_a.most_similar(key, n=n_similar)) - similar_b = set(s2v_b.most_similar(key, n=n_similar)) + similar_a = set([k for k, _ in s2v_a.most_similar(key, n=n_similar)]) + similar_b = set([k for k, _ in s2v_b.most_similar(key, n=n_similar)]) overlap = similar_a.intersection(similar_b) options = [ {"id": "A", "html": get_option_html(similar_a, overlap)}, From 42b619b94263651bcdc8de91910ca7e601bc08c2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 18:14:45 +0100 Subject: [PATCH 171/297] Tidy up --- sense2vec/prodigy_recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index ce627e0..83e300c 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -593,7 +593,7 @@ def eval_dataset(set_id): counts[model_id] += 1 preference = max(counts) ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}" - msg.info(f"Evaluating data from {set_id}") + msg.info(f"Evaluating data from '{set_id}'") msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)") if counts["A"] == counts["B"]: msg.warn(f"No preference ({ratio})") From dd51f94330a1d4ea0a5ccc9c0485a62f1e6703ea Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 18:14:51 +0100 Subject: [PATCH 172/297] Add percentage to eval-ab [ci skip] --- sense2vec/prodigy_recipes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 83e300c..9936bc4 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -598,7 +598,8 @@ def eval_dataset(set_id): if counts["A"] == counts["B"]: msg.warn(f"No preference ({ratio})") else: - msg.good(f"You preferred vectors {preference} ({ratio})") + pc = counts[preference] / sum(counts.values()) + msg.good(f"You preferred vectors {preference} {ratio} ({pc:.0%})") msg.text(mapping[preference]) def on_exit(ctrl): From fa8ca223ff2e39ef1d8911486f6d1979ed1e76f4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 18:17:23 +0100 Subject: [PATCH 173/297] Increment version [ci skip] --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 5f27f27..22d5e25 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a5 +version = 1.0.0a6 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From 6d2ed859a9d2fd88edc244655bf28d183dd9a885 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 18:17:46 +0100 Subject: [PATCH 174/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b4865a..3b9f5b2 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ most_similar = doc[3:6]._.s2v_most_similar(3) sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a5 +pip 
install sense2vec==1.0.0a6 ``` The Reddit vectors model is attached to From 286ce76c0af60a73c6a9624bbacf755f59e58385 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 18:37:29 +0100 Subject: [PATCH 175/297] Update README.md [ci skip] --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 3b9f5b2..33bb154 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,8 @@ models. multi-word phrases and converting them to match patterns, e.g. for rule-based NER or to boostrap NER annotation ([details & examples](#-prodigy-recipes)). +![](https://user-images.githubusercontent.com/13643239/68089415-db407800-fe68-11e9-9c45-47338dea49a9.jpg) + ## 🚀 Usage Examples ### Standalone usage From 8c034af0eb7afa0134444a98c364e37e72e80925 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 18:38:25 +0100 Subject: [PATCH 176/297] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 33bb154..be2d3ed 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ models. ## ✨ Features +![](https://user-images.githubusercontent.com/13643239/68089415-db407800-fe68-11e9-9c45-47338dea49a9.jpg) + - Query **vectors for multi-word phrases** based on part-of-speech tags and entity labels. - spaCy **pipeline component** and **extension attributes**. @@ -33,8 +35,6 @@ models. multi-word phrases and converting them to match patterns, e.g. for rule-based NER or to boostrap NER annotation ([details & examples](#-prodigy-recipes)). -![](https://user-images.githubusercontent.com/13643239/68089415-db407800-fe68-11e9-9c45-47338dea49a9.jpg) - ## 🚀 Usage Examples ### Standalone usage From 1dba58483f3b3a1013c9857776582d1df534466f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 21:17:27 +0100 Subject: [PATCH 177/297] Update README.md --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index be2d3ed..300ba02 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,10 @@ models. [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via [fastText](https://github.com/facebookresearch/fastText) ([details](#-training-your-own-sense2vec-vectors)). -- [Prodigy](https://prodi.gy) annotation recipes for creating lists of similar - multi-word phrases and converting them to match patterns, e.g. for rule-based - NER or to boostrap NER annotation ([details & examples](#-prodigy-recipes)). +- [Prodigy](https://prodi.gy) **annotation recipes** for evaluating models, + creating lists of similar multi-word phrases and converting them to match + patterns, e.g. for rule-based NER or to boostrap NER annotation + ([details & examples](#-prodigy-recipes)). 
## 🚀 Usage Examples From 21e1aa184ca885e24374acc8734d8efee76de0f6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 3 Nov 2019 21:19:23 +0100 Subject: [PATCH 178/297] Use latest wasabi --- scripts/01_parse.py | 3 +-- scripts/02_preprocess.py | 3 +-- scripts/03_glove_build_counts.py | 3 +-- scripts/04_fasttext_train_vectors.py | 3 +-- scripts/04_glove_train_vectors.py | 3 +-- scripts/05_export.py | 3 +-- scripts/requirements.txt | 1 - sense2vec/prodigy_recipes.py | 5 +---- setup.cfg | 1 + 9 files changed, 8 insertions(+), 17 deletions(-) diff --git a/scripts/01_parse.py b/scripts/01_parse.py index 4bbaed4..e9b79fe 100644 --- a/scripts/01_parse.py +++ b/scripts/01_parse.py @@ -2,7 +2,7 @@ import spacy from spacy.tokens import DocBin import plac -from wasabi import Printer +from wasabi import msg from pathlib import Path import tqdm @@ -20,7 +20,6 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): Expects an input file with one sentence per line and will output a .spacy file of the parsed collection of Doc objects (DocBin). """ - msg = Printer() input_path = Path(in_file) output_path = Path(out_dir) if not input_path.exists(): diff --git a/scripts/02_preprocess.py b/scripts/02_preprocess.py index 637c014..c0cacae 100644 --- a/scripts/02_preprocess.py +++ b/scripts/02_preprocess.py @@ -3,7 +3,7 @@ import spacy from spacy.tokens import DocBin import plac -from wasabi import Printer +from wasabi import msg from pathlib import Path import tqdm @@ -30,7 +30,6 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN """ - msg = Printer() input_path = Path(in_file) output_path = Path(out_dir) if not input_path.exists(): diff --git a/scripts/03_glove_build_counts.py b/scripts/03_glove_build_counts.py index 57940f0..eb1b923 100644 --- a/scripts/03_glove_build_counts.py +++ b/scripts/03_glove_build_counts.py @@ -2,7 +2,7 @@ import plac import os from pathlib import Path -from wasabi import Printer +from wasabi import msg @plac.annotations( @@ -28,7 +28,6 @@ def main( GloVe build directory (/build if you run the Makefile). The commands will also be printed if you want to run them separately. """ - msg = Printer() input_path = Path(in_dir) output_path = Path(out_dir) if not Path(glove_dir).exists(): diff --git a/scripts/04_fasttext_train_vectors.py b/scripts/04_fasttext_train_vectors.py index d287d76..44376d7 100644 --- a/scripts/04_fasttext_train_vectors.py +++ b/scripts/04_fasttext_train_vectors.py @@ -2,7 +2,7 @@ import plac import os from pathlib import Path -from wasabi import Printer +from wasabi import msg @plac.annotations( @@ -35,7 +35,6 @@ def main( built fasttext binary. The command will also be printed if you want to run it separately. """ - msg = Printer() input_path = Path(in_dir) output_path = Path(out_dir) if not Path(fasttext_bin).exists(): diff --git a/scripts/04_glove_train_vectors.py b/scripts/04_glove_train_vectors.py index 1f9b19d..d39ea63 100644 --- a/scripts/04_glove_train_vectors.py +++ b/scripts/04_glove_train_vectors.py @@ -2,7 +2,7 @@ import plac import os from pathlib import Path -from wasabi import Printer +from wasabi import msg @plac.annotations( @@ -37,7 +37,6 @@ def main( GloVe build directory (/build if you run the Makefile). The commands will also be printed if you want to run them separately. 
""" - msg = Printer() output_path = Path(out_dir) if not Path(glove_dir).exists(): msg.fail("Can't find GloVe build directory", glove_dir, exits=1) diff --git a/scripts/05_export.py b/scripts/05_export.py index c83f520..f7eee1c 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -3,7 +3,7 @@ from sense2vec.util import split_key from pathlib import Path import plac -from wasabi import Printer +from wasabi import msg import numpy @@ -34,7 +34,6 @@ def main(in_file, vocab_file, out_dir): Expects a vectors.txt and a vocab file trained with GloVe and exports a component that can be loaded with Sense2vec.from_disk. """ - msg = Printer() input_path = Path(in_file) vocab_path = Path(vocab_file) output_path = Path(out_dir) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 8977703..7c35914 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,3 +1,2 @@ plac>=0.9.6,<1.2.0 tqdm>=4.36.1,<5.0.0 -wasabi>=0.2.0,<1.1.0 diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 9936bc4..8a639cd 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -6,7 +6,7 @@ import srsly import spacy import random -from wasabi import Printer +from wasabi import msg from collections import defaultdict, Counter import copy import catalogue @@ -193,7 +193,6 @@ def evaluate( similar to word B, or to word C? If the human mostly agrees with the model, the vectors model is good. """ - msg = Printer() random.seed(0) log("RECIPE: Starting recipe sense2vec.eval", locals()) strategies = eval_strategies.get_all() @@ -392,7 +391,6 @@ def eval_most_similar( for a random phrase and unselecting the mistakes. """ log("RECIPE: Starting recipe sense2vec.eval-most-similar", locals()) - msg = Printer() random.seed(0) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) @@ -506,7 +504,6 @@ def eval_ab( session the overall stats and preferred model are shown. """ log("RECIPE: Starting recipe sense2vec.eval-ab", locals()) - msg = Printer() random.seed(0) s2v_a = Sense2Vec().from_disk(vectors_path_a) s2v_b = Sense2Vec().from_disk(vectors_path_b) diff --git a/setup.cfg b/setup.cfg index 22d5e25..8e8842e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -30,6 +30,7 @@ install_requires = spacy>=2.2.2,<3.0.0 srsly>=0.2.0 catalogue>=0.0.4 + wasabi>=0.4.0,<1.1.0 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" From c20e917feb846f536baa897aab43e4067dcb2b46 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 3 Nov 2019 22:13:26 +0100 Subject: [PATCH 179/297] Add true-casing --- sense2vec/util.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 7971948..5c54e35 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -61,7 +61,7 @@ def make_spacy_key( RETURNS (unicode): The key. """ default_sense = "?" - text = obj.text + text = get_true_cased_text(obj) if isinstance(obj, Token): if obj.like_url: text = "%%URL" @@ -75,6 +75,27 @@ def make_spacy_key( return (text, sense or default_sense) +def get_true_cased_text(obj): + if isinstance(obj, Token) and not obj.is_sent_start: + return obj.text + elif not obj[0].is_sent_start: + return obj.text + elif obj.ent_type_: + return obj.text + # Okay we have a non-entity, starting a sentence. Is its first letter upper-case? + elif not obj.text[0].isupper(): + return obj.text + # ..Only its first letter? + elif any(c.isupper() for c in obj.text[1:]): + return obj.text + # Is it "I"? 
+ elif obj.text.split("|")[0] == "I": + return obj.text + else: + # Okay fix the casing. + return obj.text.lower() + + def get_noun_phrases(doc: Doc) -> List[Span]: """Compile a list of noun phrases in sense2vec's format (without determiners). Separated out to make it easier to customize, e.g. for From aff188f9d878e44dd2a690e9d13c4e6ea7a9f035 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 00:11:48 +0100 Subject: [PATCH 180/297] Add Streamlit demo [ci skip] --- README.md | 19 ++++++++- scripts/streamlit_sense2vec.py | 74 ++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 1 deletion(-) create mode 100644 scripts/streamlit_sense2vec.py diff --git a/README.md b/README.md index 300ba02..f6c11b2 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ models. patterns, e.g. for rule-based NER or to boostrap NER annotation ([details & examples](#-prodigy-recipes)). -## 🚀 Usage Examples +## 🚀 Quickstart ### Standalone usage @@ -74,6 +74,23 @@ most_similar = doc[3:6]._.s2v_most_similar(3) # (('deep learning', 'NOUN'), 0.8573361)] ``` +### Interactive demos + + + +To try out our pretrained vectors trained on Reddit comments, check out the +[interactive sense2vec demo](https://explosion.ai/demos/sense2vec). + +This repo also includes a [Streamlit](https://streamlit.io) demo script for +exploring vectors and the most similar phrases. After installing `streamlit`, +you can run the script with `streamlit run` and one or more paths to pretrained +vectors as positional arguments on the command line. For example: + +```bash +pip install streamlit +streamlit run scripts/streamlit_sense2vec.py /path/to/vectors +``` + ## ⏳ Installation & Setup > ️🚨 **This is an alpha release so you need to specify the explicit version diff --git a/scripts/streamlit_sense2vec.py b/scripts/streamlit_sense2vec.py new file mode 100644 index 0000000..13456d5 --- /dev/null +++ b/scripts/streamlit_sense2vec.py @@ -0,0 +1,74 @@ +""" +Streamlit script for visualizing most similar sense2vec entries + +Lets you look up words and an optional sense (sense with the highest frequency +is used if "auto" is selected) and shows the N most similar phrases, their +scores and their frequencies. + +To add vector models, you can pass one or more directory paths (containing the +serialized sense2vec components) when you run it with "streamlit run": +streamlit run streamlit_sense2vec.py /path/to/sense2vec /path/to/other_sense2vec +""" +import streamlit as st +from sense2vec import Sense2Vec +import sys + +SENSE2VEC_PATHS = list(sys.argv[1:]) +DEFAULT_WORD = "natural language processing" + + +@st.cache(allow_output_mutation=True) +def load_vectors(path): + return Sense2Vec().from_disk(path) + + +st.sidebar.title("sense2vec") +st.sidebar.markdown( + "Explore semantic similarities of multi-word phrases using " + "[`sense2vec`](https://github.com/explosion/sense2vec/)." +) + +word = st.sidebar.text_input("Word", DEFAULT_WORD) +sense_dropdown = st.sidebar.empty() +n_similar = st.sidebar.slider("Number of similar entries", 1, 100, value=20, step=1) +case_insensitive = st.sidebar.checkbox("Case-insensitive (filter only)") +vectors_path = st.sidebar.selectbox("Vectors", SENSE2VEC_PATHS) + +if not vectors_path: + st.error( + f""" +#### No vectors available +You can pass one or more paths to this +script on the command line. 
For example: +```bash +streamlit run {sys.argv[0]} /path/to/sense2vec /path/to/other_sense2vec +``` +""" + ) +else: + s2v = load_vectors(vectors_path) + sense = sense_dropdown.selectbox("Sense", ["auto"] + s2v.senses) + + key = s2v.get_best_sense(word) if sense == "auto" else s2v.make_key(word, sense) + st.header(f"{word} ({sense})") + if key is None or key not in s2v: + st.error(f"**Not found:** No vector available for '{word}' ({sense}).") + else: + most_similar = s2v.most_similar(key, n=n_similar) + + rows = [] + if case_insensitive: + filtered = {k.lower(): (k, s) for k, s in most_similar if k.lower() != key} + most_similar = filtered.values() + for sim_key, sim_score in most_similar: + sim_word, sim_sense = s2v.split_key(sim_key) + sim_freq = s2v.get_freq(sim_key) + row = f"| {sim_word} | `{sim_sense}` | `{sim_score:.3f}` | {sim_freq:,} |" + rows.append(row) + table_rows = "\n".join(rows) + table = f""" + | Word | Sense | Similarity | Frequency | + | --- | --- | ---: | ---: | + {table_rows} + """ + st.markdown(table) From 3ca4e6765099a4a204473a8dc56c6066fb2c208b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 00:15:08 +0100 Subject: [PATCH 181/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f6c11b2..c5c26c8 100644 --- a/README.md +++ b/README.md @@ -88,7 +88,7 @@ vectors as positional arguments on the command line. For example: ```bash pip install streamlit -streamlit run scripts/streamlit_sense2vec.py /path/to/vectors +streamlit run https://raw.githubusercontent.com/explosion/sense2vec/master/scripts/streamlit_sense2vec.py /path/to/vectors ``` ## ⏳ Installation & Setup From d49a3fefed1d8f1f1142f615f21965fe278fad20 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 01:19:07 +0100 Subject: [PATCH 182/297] Fix true casing --- sense2vec/util.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 5c54e35..1f2a00c 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -76,11 +76,9 @@ def make_spacy_key( def get_true_cased_text(obj): - if isinstance(obj, Token) and not obj.is_sent_start: + if isinstance(obj, Token) and (not obj.is_sent_start or obj.ent_type_): return obj.text - elif not obj[0].is_sent_start: - return obj.text - elif obj.ent_type_: + elif isinstance(obj, Span) and (not obj[0].is_sent_start or obj[0].ent_type): return obj.text # Okay we have a non-entity, starting a sentence. Is its first letter upper-case? elif not obj.text[0].isupper(): From 11e076cad3b6b2bfa99aa76e87e180027672dc1b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 01:46:43 +0100 Subject: [PATCH 183/297] Fix function and add tests --- sense2vec/util.py | 25 ++++++++++++++----------- tests/test_util.py | 44 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 tests/test_util.py diff --git a/sense2vec/util.py b/sense2vec/util.py index 1f2a00c..8df495f 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -75,22 +75,25 @@ def make_spacy_key( return (text, sense or default_sense) -def get_true_cased_text(obj): +def get_true_cased_text(obj: Union[Token, Span]): + """Correct casing so that sentence-initial words are not title-cased. Named + entities and other special cases (such as the word "I") should still be + title-cased. + + obj (Token / Span): The spaCy object to conver to text. + RETURNS (unicode): The converted text. 
+ """ if isinstance(obj, Token) and (not obj.is_sent_start or obj.ent_type_): return obj.text elif isinstance(obj, Span) and (not obj[0].is_sent_start or obj[0].ent_type): return obj.text - # Okay we have a non-entity, starting a sentence. Is its first letter upper-case? - elif not obj.text[0].isupper(): - return obj.text - # ..Only its first letter? - elif any(c.isupper() for c in obj.text[1:]): - return obj.text - # Is it "I"? - elif obj.text.split("|")[0] == "I": + elif ( # Okay, we have a non-entity, starting a sentence + not obj.text[0].isupper() # Is its first letter upper-case? + or any(c.isupper() for c in obj.text[1:]) # # ..Only its first letter? + or obj.text[0] == "I" # Is it "I"? + ): return obj.text - else: - # Okay fix the casing. + else: # Fix the casing return obj.text.lower() diff --git a/tests/test_util.py b/tests/test_util.py new file mode 100644 index 0000000..26ff84e --- /dev/null +++ b/tests/test_util.py @@ -0,0 +1,44 @@ +from spacy.tokens import Doc, Span +from spacy.vocab import Vocab +from sense2vec.util import get_true_cased_text + + +def get_doc(vocab, words, spaces, pos): + doc = Doc(vocab, words=words, spaces=spaces) + for i, pos_tag in enumerate(pos): + doc[i].pos_ = pos_tag + return doc + + +def test_get_true_cased_text(): + vocab = Vocab() + words1 = ["Cool", ",", "thanks", "!"] + spaces1 = [False, True, False, False] + pos1 = ["ADJ", "PUNCT", "NOUN", "PUNCT"] + doc1 = get_doc(vocab, words1, spaces1, pos1) + assert get_true_cased_text(doc1[0:4]) == "cool, thanks!" + assert get_true_cased_text(doc1[0]) == "cool" + assert get_true_cased_text(doc1[2:4]) == "thanks!" + words2 = ["I", "can", "understand", "."] + spaces2 = [True, True, False, False] + pos2 = ["PRON", "VERB", "VERB", "PUNCT"] + doc2 = get_doc(vocab, words2, spaces2, pos2) + assert get_true_cased_text(doc2[0:4]) == "I can understand." + assert get_true_cased_text(doc2[0]) == "I" + assert get_true_cased_text(doc2[2:4]) == "understand." + words3 = ["Obama", "was", "pretty", "good", "..."] + spaces3 = [True, True, True, False, False] + pos3 = ["PROPN", "VERB", "ADV", "ADJ", "PUNCT"] + doc3 = get_doc(vocab, words3, spaces3, pos3) + doc3.ents = [Span(doc3, 0, 1, label="PERSON")] + assert get_true_cased_text(doc3[0:5]) == "Obama was pretty good..." + assert get_true_cased_text(doc3[0]) == "Obama" + assert get_true_cased_text(doc3[2:4]) == "pretty good" + words4 = ["Barack", "Obama", "was", "pretty", "good", "..."] + spaces4 = [True, True, True, True, False, False] + pos4 = ["PROPN", "PROPN", "VERB", "ADV", "ADJ", "PUNCT"] + doc4 = get_doc(vocab, words4, spaces4, pos4) + doc4.ents = [Span(doc4, 0, 2, label="PERSON")] + assert get_true_cased_text(doc4[0:6]) == "Barack Obama was pretty good..." + assert get_true_cased_text(doc4[0:2]) == "Barack Obama" + assert get_true_cased_text(doc4[1]) == "Obama" From 9c4462429a6b187c89f844c1f6a45c715d6d8469 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 01:48:19 +0100 Subject: [PATCH 184/297] Fix formatting [ci skip] --- sense2vec/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/util.py b/sense2vec/util.py index 8df495f..25d5c60 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -89,7 +89,7 @@ def get_true_cased_text(obj: Union[Token, Span]): return obj.text elif ( # Okay, we have a non-entity, starting a sentence not obj.text[0].isupper() # Is its first letter upper-case? - or any(c.isupper() for c in obj.text[1:]) # # ..Only its first letter? 
+ or any(c.isupper() for c in obj.text[1:]) # ..Only its first letter? or obj.text[0] == "I" # Is it "I"? ): return obj.text From 5292c662db9ac2e162fce4af8ac3fdcb56f480ae Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 01:52:56 +0100 Subject: [PATCH 185/297] Use better examples with more variety --- tests/test_util.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/test_util.py b/tests/test_util.py index 26ff84e..bb34be8 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -26,19 +26,20 @@ def test_get_true_cased_text(): assert get_true_cased_text(doc2[0:4]) == "I can understand." assert get_true_cased_text(doc2[0]) == "I" assert get_true_cased_text(doc2[2:4]) == "understand." - words3 = ["Obama", "was", "pretty", "good", "..."] - spaces3 = [True, True, True, False, False] - pos3 = ["PROPN", "VERB", "ADV", "ADJ", "PUNCT"] + words3 = ["You", "think", "Obama", "was", "pretty", "good", "..."] + spaces3 = [True, True, True, True, True, False, False] + pos3 = ["PRON", "VERB", "PROPN", "VERB", "ADV", "ADJ", "PUNCT"] doc3 = get_doc(vocab, words3, spaces3, pos3) - doc3.ents = [Span(doc3, 0, 1, label="PERSON")] - assert get_true_cased_text(doc3[0:5]) == "Obama was pretty good..." - assert get_true_cased_text(doc3[0]) == "Obama" - assert get_true_cased_text(doc3[2:4]) == "pretty good" - words4 = ["Barack", "Obama", "was", "pretty", "good", "..."] - spaces4 = [True, True, True, True, False, False] - pos4 = ["PROPN", "PROPN", "VERB", "ADV", "ADJ", "PUNCT"] + doc3.ents = [Span(doc3, 2, 3, label="PERSON")] + assert get_true_cased_text(doc3[0:7]) == "You think Obama was pretty good..." + assert get_true_cased_text(doc3[0]) == "you" + assert get_true_cased_text(doc3[2]) == "Obama" + assert get_true_cased_text(doc3[4:6]) == "pretty good" + words4 = ["Ok", ",", "Barack", "Obama", "was", "pretty", "good", "..."] + spaces4 = [False, True, True, True, True, True, False, False] + pos4 = ["INTJ", "PUNCT", "PROPN", "PROPN", "VERB", "ADV", "ADJ", "PUNCT"] doc4 = get_doc(vocab, words4, spaces4, pos4) - doc4.ents = [Span(doc4, 0, 2, label="PERSON")] - assert get_true_cased_text(doc4[0:6]) == "Barack Obama was pretty good..." - assert get_true_cased_text(doc4[0:2]) == "Barack Obama" - assert get_true_cased_text(doc4[1]) == "Obama" + doc4.ents = [Span(doc4, 2, 4, label="PERSON")] + assert get_true_cased_text(doc4[0:8]) == "Ok, Barack Obama was pretty good..." + assert get_true_cased_text(doc4[2:4]) == "Barack Obama" + assert get_true_cased_text(doc4[3]) == "Obama" From ad49b66b992409e3061f29317d31f0b54a587688 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 4 Nov 2019 02:00:54 +0100 Subject: [PATCH 186/297] Add more util tests --- tests/test_util.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_util.py b/tests/test_util.py index bb34be8..80ad994 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -1,6 +1,7 @@ +import pytest from spacy.tokens import Doc, Span from spacy.vocab import Vocab -from sense2vec.util import get_true_cased_text +from sense2vec.util import get_true_cased_text, make_key, split_key def get_doc(vocab, words, spaces, pos): @@ -43,3 +44,16 @@ def test_get_true_cased_text(): assert get_true_cased_text(doc4[0:8]) == "Ok, Barack Obama was pretty good..." 
assert get_true_cased_text(doc4[2:4]) == "Barack Obama" assert get_true_cased_text(doc4[3]) == "Obama" + + +@pytest.mark.parametrize( + "word,sense,expected", + [ + ("foo", "bar", "foo|bar"), + ("hello world", "TEST", "hello_world|TEST"), + ("hello world |test!", "TEST", "hello_world_|test!|TEST"), + ], +) +def test_make_split_key(word, sense, expected): + assert make_key(word, sense) == expected + assert split_key(expected) == (word, sense) From d408e2aa3cfc5dc75b86ba2ab8e9e3f85f0061b6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 15 Nov 2019 17:17:12 +0100 Subject: [PATCH 187/297] Add CI for Python 3.8 --- azure-pipelines.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index dbd2d2d..281f514 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -17,15 +17,15 @@ jobs: Python36Mac: imageName: 'macos-10.13' python.version: '3.6' - Python37Linux: + Python38Linux: imageName: 'ubuntu-16.04' - python.version: '3.7' - Python37Windows: + python.version: '3.8' + Python38Windows: imageName: 'vs2017-win2016' - python.version: '3.7' - Python37Mac: + python.version: '3.8' + Python38Mac: imageName: 'macos-10.13' - python.version: '3.7' + python.version: '3.8' maxParallel: 4 pool: vmImage: $(imageName) From 647eae90e8f8ebe43a0e81df319f5a7d961647cd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 16:02:46 +0100 Subject: [PATCH 188/297] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c5c26c8..5d3f6fc 100644 --- a/README.md +++ b/README.md @@ -83,8 +83,9 @@ To try out our pretrained vectors trained on Reddit comments, check out the This repo also includes a [Streamlit](https://streamlit.io) demo script for exploring vectors and the most similar phrases. After installing `streamlit`, -you can run the script with `streamlit run` and one or more paths to pretrained -vectors as positional arguments on the command line. For example: +you can run the script with `streamlit run` and **one or more paths to +pretrained vectors** as **positional arguments** on the command line. 
For +example: ```bash pip install streamlit From 3f9bf6963962ea7d52eb2f75f550a53d2f7422ed Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 16:03:23 +0100 Subject: [PATCH 189/297] Fix pattern generation --- sense2vec/prodigy_recipes.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 8a639cd..da417bc 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -149,11 +149,12 @@ def to_patterns( if dataset not in DB: raise ValueError(f"Can't find dataset '{dataset}'") examples = DB.get_dataset(dataset) - terms = [eg["text"] for eg in examples if eg["answer"] == "accept"] + terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"]) if case_sensitive: - patterns = [{"text": t.text for t in nlp.make_doc(term)} for term in terms] + patterns = [[{"text": t.lower_} for t in nlp.make_doc(term)] for term in terms] else: - patterns = [{"lower": t.lower_ for t in nlp.make_doc(term)} for term in terms] + terms = set([word.lower() for word in terms]) + patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)] for term in terms] patterns = [{"label": label, "pattern": pattern} for pattern in patterns] log(f"RECIPE: Generated {len(patterns)} patterns") if not dry: From 66f4bb84e6fa478a55297e0649f3a1d917aec4ac Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 16:03:38 +0100 Subject: [PATCH 190/297] Use wasabi printer for ValueErrors --- sense2vec/prodigy_recipes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index da417bc..fcf60c5 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -55,7 +55,7 @@ def teach( for seed in seeds: key = s2v.get_best_sense(seed) if key is None: - raise ValueError(f"Can't find seed term '{seed}' in vectors") + msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1) accept_keys.append(key) best_word, best_sense = s2v.split_key(key) task = { @@ -147,7 +147,7 @@ def to_patterns( log(f"RECIPE: Loaded spaCy model '{spacy_model}'") DB = connect() if dataset not in DB: - raise ValueError(f"Can't find dataset '{dataset}'") + msg.fail(f"Can't find dataset '{dataset}'", exits=1) examples = DB.get_dataset(dataset) terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"]) if case_sensitive: From 601843bf87e868576fe77b684bb61ef40e76eb50 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 16:03:45 +0100 Subject: [PATCH 191/297] Fix template --- sense2vec/prodigy_recipes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index fcf60c5..ff2688d 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -48,7 +48,10 @@ def teach( log("RECIPE: Starting recipe sense2vec.teach", locals()) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) - html_template = "{{word}}{{sense}}" + html_template = ( + "{{word}}" + "{{sense}}" + ) accept_keys = [] seen = set(accept_keys) seed_tasks = [] From a47749d6b6cfce72612bc69b2362cb139ce636a4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 16:04:07 +0100 Subject: [PATCH 192/297] Support "blank:lang" syntax in sense2vec.to-patterns --- sense2vec/prodigy_recipes.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py 
b/sense2vec/prodigy_recipes.py index ff2688d..04b72d1 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -128,7 +128,7 @@ def get_stream(): @prodigy.recipe( "sense2vec.to-patterns", dataset=("Phrase dataset to convert", "positional", None, str), - spacy_model=("spaCy model for tokenization", "positional", None, str), + spacy_model=("spaCy model or blank:en (for tokenization)", "positional", None, str), label=("Label to apply to all patterns", "positional", None, str), output_file=("Optional output file. Defaults to stdout", "option", "o", str), case_sensitive=("Make patterns case-sensitive", "flag", "CS", bool), @@ -144,9 +144,18 @@ def to_patterns( stdout. The examples are tokenized so that multi-token terms are represented correctly, e.g.: {"label": "SHOE_BRAND", "pattern": [{"LOWER": "new"}, {"LOWER": "balance"}]} + + For tokenization, you can either pass in the name of a spaCy model (e.g. if + you're using a model with custom tokenization), or "blank:" plus the + language code you want to use, e.g. blank:en or blank:de. Make sure to use + the same language / tokenizer you're planning to use at runtime – otherwise + your patterns may not match. """ log("RECIPE: Starting recipe sense2vec.to-patterns", locals()) - nlp = spacy.load(spacy_model) + if spacy_model.startswith("blank:"): + nlp = spacy.blank(spacy_model.replace("blank:", "")) + else: + nlp = spacy.load(spacy_model) log(f"RECIPE: Loaded spaCy model '{spacy_model}'") DB = connect() if dataset not in DB: From 37095f5c60004ae99d7c339308b2fc22592b12d2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 16:04:34 +0100 Subject: [PATCH 193/297] Lower threshold if whole batch is skipped and prevent infinite loop --- sense2vec/prodigy_recipes.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 04b72d1..457cda2 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -44,6 +44,9 @@ def teach( suggestions will be adjusted as you annotate and accept similar phrases. For each seed term, the best matching sense according to the sense2vec vectors will be used. + + If no similar terms are found above the given threshold, the threshold is + lowered by 0.1 and similar terms are requested again. 
""" log("RECIPE: Starting recipe sense2vec.teach", locals()) s2v = Sense2Vec().from_disk(vectors_path) @@ -98,6 +101,7 @@ def get_stream(): """Continue querying sense2vec whenever we get a new phrase and presenting examples to the user with a similarity above the threshold parameter.""" + nonlocal threshold while True: log( f"RECIPE: Looking for {n_similar} phrases most similar to " @@ -105,6 +109,7 @@ def get_stream(): ) most_similar = s2v.most_similar(accept_keys, n=n_similar) log(f"RECIPE: Found {len(most_similar)} most similar phrases") + n_skipped = 0 for key, score in most_similar: if key not in seen and score > threshold: seen.add(key) @@ -113,6 +118,20 @@ def get_stream(): # may fail when trying to serialize it to/from JSON meta = {"score": float(score)} yield {"text": key, "word": word, "sense": sense, "meta": meta} + else: + n_skipped += 1 + if n_skipped: + log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}") + if n_skipped == len(most_similar): + # No most similar phrases were found that are above the + # threshold, so lower the threshold if it's not already 0 or + # return empty list so Prodigy shows "no tasks available" + new_threshold = threshold - 0.1 + if new_threshold <= 0.0: + log(f"RECIPE: No suggestions for threshold {threshold:.2}") + return [] + log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}") + threshold = new_threshold stream = get_stream() From b8839ed4c787a4bc87eabf171c3d6c97e692dae0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 17:08:19 +0100 Subject: [PATCH 194/297] Show sense less prominently in sense2vec.teach --- sense2vec/prodigy_recipes.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 457cda2..2b3da51 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -51,10 +51,7 @@ def teach( log("RECIPE: Starting recipe sense2vec.teach", locals()) s2v = Sense2Vec().from_disk(vectors_path) log("RECIPE: Loaded sense2vec vectors", vectors_path) - html_template = ( - "{{word}}" - "{{sense}}" - ) + html_template = "{{word}}" accept_keys = [] seen = set(accept_keys) seed_tasks = [] @@ -68,7 +65,7 @@ def teach( "text": key, "word": best_word, "sense": best_sense, - "meta": {"score": 1.0}, + "meta": {"score": 1.0, "sense": best_sense}, "answer": "accept", } seed_tasks.append(set_hashes(task)) @@ -116,7 +113,7 @@ def get_stream(): word, sense = s2v.split_key(key) # Make sure the score is a regular float, otherwise server # may fail when trying to serialize it to/from JSON - meta = {"score": float(score)} + meta = {"score": float(score), "sense": sense} yield {"text": key, "word": word, "sense": sense, "meta": meta} else: n_skipped += 1 From b9f2db78ac476251a76c01adbaecd8cbfdded76a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 17:08:53 +0100 Subject: [PATCH 195/297] Add case_sensitive option and fix handling of seen terms --- sense2vec/prodigy_recipes.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 2b3da51..bbb9ec6 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -27,6 +27,7 @@ threshold=("Similarity threshold for sense2vec", "option", "t", float), n_similar=("Number of similar items to get at once", "option", "n", int), batch_size=("Batch size for submitting annotations", "option", "bs", int), + case_sensitive=("Show the 
same terms with different casing", "flag", "CS", bool), resume=("Resume from existing phrases dataset", "flag", "R", bool), ) def teach( @@ -36,6 +37,7 @@ def teach( threshold=0.85, n_similar=20, batch_size=5, + case_sensitive=False, resume=False, ): """ @@ -53,7 +55,7 @@ def teach( log("RECIPE: Loaded sense2vec vectors", vectors_path) html_template = "{{word}}" accept_keys = [] - seen = set(accept_keys) + seen = set() seed_tasks = [] for seed in seeds: key = s2v.get_best_sense(seed) @@ -61,6 +63,7 @@ def teach( msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1) accept_keys.append(key) best_word, best_sense = s2v.split_key(key) + seen.add(best_word if case_sensitive else best_word.lower()) task = { "text": key, "word": best_word, @@ -81,9 +84,13 @@ def teach( if resume: prev = DB.get_dataset(dataset) - prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"] - accept_keys += prev_accept - seen.update(set(accept_keys)) + prev_accept_keys = [eg["text"] for eg in prev if eg["answer"] == "accept"] + prev_words = [ + eg["word"] if case_sensitive else eg["word"].lower() + for eg in prev + ] + accept_keys += prev_accept_keys + seen.update(set(prev_words)) log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}") def update(answers): @@ -107,10 +114,16 @@ def get_stream(): most_similar = s2v.most_similar(accept_keys, n=n_similar) log(f"RECIPE: Found {len(most_similar)} most similar phrases") n_skipped = 0 + n_duplicate = 0 for key, score in most_similar: - if key not in seen and score > threshold: - seen.add(key) + if score > threshold: word, sense = s2v.split_key(key) + if (case_sensitive and word in seen) or ( + not case_sensitive and word.lower() in seen + ): + n_duplicate += 1 + continue + seen.add(word if case_sensitive else word.lower()) # Make sure the score is a regular float, otherwise server # may fail when trying to serialize it to/from JSON meta = {"score": float(score), "sense": sense} @@ -119,7 +132,7 @@ def get_stream(): n_skipped += 1 if n_skipped: log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}") - if n_skipped == len(most_similar): + if n_skipped == len(most_similar) - n_duplicate: # No most similar phrases were found that are above the # threshold, so lower the threshold if it's not already 0 or # return empty list so Prodigy shows "no tasks available" @@ -127,7 +140,9 @@ def get_stream(): if new_threshold <= 0.0: log(f"RECIPE: No suggestions for threshold {threshold:.2}") return [] - log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}") + log( + f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}" + ) threshold = new_threshold stream = get_stream() From 5d5d888a8849167dfecbcdcf7222865ffd57ffbf Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 17:09:13 +0100 Subject: [PATCH 196/297] Use higher n_similar default in sense2vec.teach --- sense2vec/prodigy_recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index bbb9ec6..0f2a9ab 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -35,7 +35,7 @@ def teach( vectors_path, seeds, threshold=0.85, - n_similar=20, + n_similar=100, batch_size=5, case_sensitive=False, resume=False, From 0ec16b9dc95e1bccb6f05e0584b6d38ea9b230f9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 16 Nov 2019 17:17:47 +0100 Subject: [PATCH 197/297] Update Streamlit demo [ci skip] --- scripts/streamlit_sense2vec.py | 19 
++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/scripts/streamlit_sense2vec.py b/scripts/streamlit_sense2vec.py index 13456d5..0a67dcf 100644 --- a/scripts/streamlit_sense2vec.py +++ b/scripts/streamlit_sense2vec.py @@ -31,7 +31,7 @@ def load_vectors(path): word = st.sidebar.text_input("Word", DEFAULT_WORD) sense_dropdown = st.sidebar.empty() n_similar = st.sidebar.slider("Number of similar entries", 1, 100, value=20, step=1) -case_insensitive = st.sidebar.checkbox("Case-insensitive (filter only)") +show_senses = st.sidebar.checkbox("Distinguish results by sense") vectors_path = st.sidebar.selectbox("Vectors", SENSE2VEC_PATHS) if not vectors_path: @@ -48,27 +48,28 @@ def load_vectors(path): else: s2v = load_vectors(vectors_path) sense = sense_dropdown.selectbox("Sense", ["auto"] + s2v.senses) - key = s2v.get_best_sense(word) if sense == "auto" else s2v.make_key(word, sense) st.header(f"{word} ({sense})") if key is None or key not in s2v: st.error(f"**Not found:** No vector available for '{word}' ({sense}).") else: most_similar = s2v.most_similar(key, n=n_similar) - rows = [] - if case_insensitive: - filtered = {k.lower(): (k, s) for k, s in most_similar if k.lower() != key} - most_similar = filtered.values() + seen = set() for sim_key, sim_score in most_similar: sim_word, sim_sense = s2v.split_key(sim_key) + if not show_senses and sim_word in seen: + continue + seen.add(sim_word) sim_freq = s2v.get_freq(sim_key) - row = f"| {sim_word} | `{sim_sense}` | `{sim_score:.3f}` | {sim_freq:,} |" + if show_senses: + sim_word = f"{sim_word} `{sim_sense}`" + row = f"| {sim_word} | `{sim_score:.3f}` | {sim_freq:,} |" rows.append(row) table_rows = "\n".join(rows) table = f""" - | Word | Sense | Similarity | Frequency | - | --- | --- | ---: | ---: | + | Word | Similarity | Frequency | + | --- | ---: | ---: | {table_rows} """ st.markdown(table) From b9bade5ca67adac52a30cd9be661bc7eec56a906 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 17 Nov 2019 15:42:16 +0100 Subject: [PATCH 198/297] Auto-format [ci skip] --- sense2vec/prodigy_recipes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 0f2a9ab..3b964c1 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -86,8 +86,7 @@ def teach( prev = DB.get_dataset(dataset) prev_accept_keys = [eg["text"] for eg in prev if eg["answer"] == "accept"] prev_words = [ - eg["word"] if case_sensitive else eg["word"].lower() - for eg in prev + eg["word"] if case_sensitive else eg["word"].lower() for eg in prev ] accept_keys += prev_accept_keys seen.update(set(prev_words)) From ff64237d5caafd71eebc3e4c9426681fa0372c38 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 17 Nov 2019 15:42:24 +0100 Subject: [PATCH 199/297] Update README.md [ci skip] --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 5d3f6fc..a907242 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ # sense2vec: Use NLP to go beyond vanilla word2vec -sense2vec [Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice +sense2vec ([Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn more interesting and detailed word vectors. 
For an interactive example of the technology, see our [sense2vec demo](https://demos.explosion.ai/sense2vec) that @@ -808,7 +808,7 @@ prodigy sense2vec.eval [dataset] [vectors_path] [--strategy] [--senses] | Name | Description | | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `most_similar` | Pick a random word from a random sense and get its most similar entries of the same sense. Ask about the similarity to the last and middle entry from that selection. | -| `most_least_similar` | Pick a random word from a random sense and get its least similar entry and then the least similar entry of that. | +| `most_least_similar` | Pick a random word from a random sense and get the least similar entry from its most similar entries, and then the last most similar entry of that. | | `random` | Pick a random sample of 3 words from the same random sense. | #### Example From 73b5328a5f7ab240779b0eaa891e5160b62a7f84 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 18 Nov 2019 14:09:13 +0100 Subject: [PATCH 200/297] Fix A/B evaluation output [ci skip] --- sense2vec/prodigy_recipes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 3b964c1..270f354 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -631,7 +631,7 @@ def eval_dataset(set_id): for eg in accepted: for model_id in eg["accept"]: counts[model_id] += 1 - preference = max(counts) + preference, _ = counts.most_common(1)[0] ratio = f"{counts[preference]} / {sum(counts.values()) - counts[preference]}" msg.info(f"Evaluating data from '{set_id}'") msg.text(f"You rejected {len(rejected)} and ignored {len(ignored)} pair(s)") @@ -639,7 +639,7 @@ def eval_dataset(set_id): msg.warn(f"No preference ({ratio})") else: pc = counts[preference] / sum(counts.values()) - msg.good(f"You preferred vectors {preference} {ratio} ({pc:.0%})") + msg.good(f"You preferred vectors {preference} with {ratio} ({pc:.0%})") msg.text(mapping[preference]) def on_exit(ctrl): From d25c4df7bcd7be926cc64e60bc211ae399911885 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 11:40:36 +0100 Subject: [PATCH 201/297] Update streamlit_sense2vec.py --- scripts/streamlit_sense2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/streamlit_sense2vec.py b/scripts/streamlit_sense2vec.py index 0a67dcf..da58e1a 100644 --- a/scripts/streamlit_sense2vec.py +++ b/scripts/streamlit_sense2vec.py @@ -30,7 +30,7 @@ def load_vectors(path): word = st.sidebar.text_input("Word", DEFAULT_WORD) sense_dropdown = st.sidebar.empty() -n_similar = st.sidebar.slider("Number of similar entries", 1, 100, value=20, step=1) +n_similar = st.sidebar.slider("Max number of similar entries", 1, 100, value=20, step=1) show_senses = st.sidebar.checkbox("Distinguish results by sense") vectors_path = st.sidebar.selectbox("Vectors", SENSE2VEC_PATHS) From 3c97d97d62ebec346a67d3f17d1e10c2d64026e4 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 11:41:19 +0100 Subject: [PATCH 202/297] Add lemmatize option to component --- README.md | 13 +++++++------ sense2vec/component.py | 9 +++++++++ sense2vec/util.py | 10 +++++++--- tests/test_component.py | 23 +++++++++++++++++++++++ 4 files changed, 46 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index a907242..a7ec4bb 100644 --- a/README.md +++ 
b/README.md @@ -530,13 +530,14 @@ The pipeline component to add sense2vec to spaCy pipelines. Initialize the pipeline component. -| Argument | Type | Description | -| --------------- | --------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | -| `vocab` | `Vocab` | The shared `Vocab`. Mostly used for the shared `StringStore`. | -| `shape` | tuple | The vector shape. | -| `merge_phrases` | bool | Whether to merge sense2vec phrases into one token. Defaults to `False`. | +| Argument | Type | Description | +| --------------- | --------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | The shared `Vocab`. Mostly used for the shared `StringStore`. | +| `shape` | tuple | The vector shape. | +| `merge_phrases` | bool | Whether to merge sense2vec phrases into one token. Defaults to `False`. | +| `lemmatize` | bool | Always look up lemmas if available in the vectors, otherwise default to original word. Defaults to `False`. | | `overrides` | Optional custom functions to use, mapped to names registred via the registry, e.g. `{"make_key": "custom_make_key"}`. | -| **RETURNS** | `Sense2VecComponent` | The newly constructed object. | +| **RETURNS** | `Sense2VecComponent` | The newly constructed object. | ```python s2v = Sense2VecComponent(nlp.vocab) diff --git a/sense2vec/component.py b/sense2vec/component.py index 80bd52c..6744081 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -38,6 +38,7 @@ def __init__( vocab: Vocab = None, shape: Tuple[int, int] = (1000, 128), merge_phrases: bool = False, + lemmatize: bool = False, overrides: Dict[str, str] = SimpleFrozenDict(), **kwargs, ): @@ -46,6 +47,8 @@ def __init__( vocab (Vocab): The shared vocab. Mostly used for the shared StringStore. shape (tuple): The vector shape. merge_phrases (bool): Merge sense2vec phrases into one token. + lemmatize (bool): Always look up lemmas if available in the vectors, + otherwise default to original word. overrides (dict): Optional custom functions to use, mapped to names registered via the registry, e.g. {"make_key": "custom_make_key"}. RETURNS (Sense2VecComponent): The newly constructed object. 
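As a rough usage sketch of the new `lemmatize` flag documented in the table above (this is not part of the diff; the model name and vectors path are placeholders, and the spaCy v2-style `add_pipe` call is assumed from the surrounding README examples):

```python
import spacy
from sense2vec import Sense2VecComponent

nlp = spacy.load("en_core_web_sm")  # placeholder model name
# lemmatize=True makes the component try the lemma-based key first and fall
# back to the original word if the lemma is not in the vectors.
s2v = Sense2VecComponent(nlp.vocab, lemmatize=True).from_disk("/path/to/vectors")  # placeholder path
nlp.add_pipe(s2v)

doc = nlp("A sentence about natural language processing.")
print(doc[3:6]._.s2v_key)  # lemma-based key is preferred when available
```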
@@ -58,6 +61,7 @@ def __init__( "make_spacy_key": "default", "get_phrases": "default", "merge_phrases": "default", + "lemmatize": lemmatize, } self.s2v.cfg.update(cfg) self.s2v.cfg.update(overrides) @@ -150,6 +154,11 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: make_spacy_key = registry.make_spacy_key.get( obj.doc._._s2v.cfg["make_spacy_key"] ) + if obj.doc._._s2v.cfg.get("lemmatize", False): + lemma = make_spacy_key(obj, prefer_ents=self.merge_phrases, lemmatize=True) + lemma_key = obj.doc._._s2v.make_key(*lemma) + if lemma_key in obj.doc._._s2v: + return lemma_key word, sense = make_spacy_key(obj, prefer_ents=self.merge_phrases) return obj.doc._._s2v.make_key(word, sense) diff --git a/sense2vec/util.py b/sense2vec/util.py index 25d5c60..496a7b4 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -45,7 +45,7 @@ def split_key(key: str) -> Tuple[str, str]: @registry.make_spacy_key.register("default") def make_spacy_key( - obj: Union[Token, Span], prefer_ents: bool = False + obj: Union[Token, Span], prefer_ents: bool = False, lemmatize: bool = False ) -> Tuple[str, str]: """Create a key from a spaCy object, i.e. a Token or Span. If the object is a token, the part-of-speech tag (Token.pos_) is used for the sense @@ -58,10 +58,11 @@ def make_spacy_key( token.ent_type instead of tokens.pos_). Should be enabled if phrases are merged into single tokens, because otherwise the entity sense would never be used. + lemmatize (bool): Use the object's lemma instead of its text. RETURNS (unicode): The key. """ default_sense = "?" - text = get_true_cased_text(obj) + text = get_true_cased_text(obj, lemmatize=lemmatize) if isinstance(obj, Token): if obj.like_url: text = "%%URL" @@ -75,14 +76,17 @@ def make_spacy_key( return (text, sense or default_sense) -def get_true_cased_text(obj: Union[Token, Span]): +def get_true_cased_text(obj: Union[Token, Span], lemmatize: bool = False): """Correct casing so that sentence-initial words are not title-cased. Named entities and other special cases (such as the word "I") should still be title-cased. obj (Token / Span): The spaCy object to conver to text. + lemmatize (bool): Use the object's lemma instead of its text. RETURNS (unicode): The converted text. 
""" + if lemmatize: + return obj.lemma_ if isinstance(obj, Token) and (not obj.is_sent_start or obj.ent_type_): return obj.text elif isinstance(obj, Span) and (not obj[0].is_sent_start or obj[0].ent_type): diff --git a/tests/test_component.py b/tests/test_component.py index 3b2e745..7612bee 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -56,6 +56,29 @@ def test_component_similarity(doc): assert doc[1:3]._.s2v_similarity(doc[1:3]) == 1.0 +def test_component_lemmatize(doc): + lookups = doc.vocab.lookups.add_table("lemma_lookup") + lookups["world"] = "wrld" + s2v = Sense2VecComponent(doc.vocab, shape=(4, 4), lemmatize=True) + s2v.first_run = False + vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) + s2v.s2v.add("hello|INTJ", vector) + s2v.s2v.add("world|NOUN", vector) + s2v.s2v.add("wrld|NOUN", vector) + doc = s2v(doc) + assert doc[0]._.s2v_key == "hello|INTJ" + assert doc[1].lemma_ == "wrld" + assert doc[1]._.s2v_key == "wrld|NOUN" + lookups["hello"] = "hll" + assert doc[0].lemma_ == "hll" + assert doc[0]._.s2v_key == "hello|INTJ" + s2v.s2v.add("hll|INTJ", vector) + assert doc[0]._.s2v_key == "hll|INTJ" + new_s2v = Sense2VecComponent().from_bytes(s2v.to_bytes()) + assert new_s2v.s2v.cfg["lemmatize"] is True + doc.vocab.lookups.remove_table("lemma_lookup") + + def test_component_to_from_bytes(doc): s2v = Sense2VecComponent(doc.vocab, shape=(1, 4)) s2v.first_run = False From 124289cb546239279f9bca4d8dd8117572cc337d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 11:41:46 +0100 Subject: [PATCH 203/297] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index a7ec4bb..c4030a0 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ models. 
```python from sense2vec import Sense2Vec -s2v = Sense2Vec().from_disk("/path/to/sense2vec_vectors") +s2v = Sense2Vec().from_disk("/path/to/s2v_reddit_2015_md") query = "natural_language_processing|NOUN" assert query in s2v vector = s2v[query] @@ -61,7 +61,7 @@ import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/sense2vec_vectors") +s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/s2v_reddit_2015_md") nlp.add_pipe(s2v) doc = nlp("A sentence about natural language processing.") @@ -111,7 +111,7 @@ the extracted data directory: ```python from sense2vec import Sense2Vec -s2v = Sense2Vec().from_disk("/path/to/sense2vec_vectors") +s2v = Sense2Vec().from_disk("/path/to/s2v_reddit_2015_md") ``` ## 👩‍💻 Usage @@ -131,7 +131,7 @@ import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/sense2vec_vectors") +s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/s2v_reddit_2015_md") nlp.add_pipe(s2v) ``` @@ -742,7 +742,7 @@ prodigy sense2vec.teach [dataset] [vectors_path] [--seeds] [--threshold] #### Example ```bash -prodigy sense2vec.teach tech_phrases /path/to/sense2vec_vectors +prodigy sense2vec.teach tech_phrases /path/to/s2v_reddit_2015_md --seeds "natural language processing, machine learning, artificial intelligence" ``` @@ -815,7 +815,7 @@ prodigy sense2vec.eval [dataset] [vectors_path] [--strategy] [--senses] #### Example ```bash -prodigy sense2vec.eval vectors_eval /path/to/sense2vec_vectors +prodigy sense2vec.eval vectors_eval /path/to/s2v_reddit_2015_md --senses NOUN,ORG,PRODUCT --threshold 0.5 ``` @@ -846,7 +846,7 @@ prodigy sense2vec.eval [dataset] [vectors_path] [--senses] [--exclude-senses] | `--show-scores`, `-S` | flag | Show all scores for debugging. | ```bash -prodigy sense2vec.eval-most-similar vectors_eval_sim /path/to/sense2vec_vectors +prodigy sense2vec.eval-most-similar vectors_eval_sim /path/to/s2v_reddit_2015_md --senses NOUN,ORG,PRODUCT ``` @@ -879,7 +879,7 @@ prodigy sense2vec.eval [dataset] [vectors_path_a] [vectors_path_b] [--senses] | `--show-mapping`, `-S` | flag | Show which models are option 1 and option 2 in the UI (for debugging). 
| ```bash -prodigy sense2vec.eval-ab vectors_eval_sim /path/to/sense2vec_vectors_a /path/to/sense2vec_vectors_b --senses NOUN,ORG,PRODUCT +prodigy sense2vec.eval-ab vectors_eval_sim /path/to/s2v_reddit_2015_md /path/to/s2v_reddit_2019_md --senses NOUN,ORG,PRODUCT ``` ![UI preview of sense2vec.eval-ab](https://user-images.githubusercontent.com/13643239/68088514-46d21780-fe60-11e9-9b29-fe313bb2154d.png) From 1739f8b17508f1b62986d40da78f10c0880f8feb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 11:42:15 +0100 Subject: [PATCH 204/297] Don't raise if not enough vectors exist for most_similar --- sense2vec/sense2vec.py | 5 +---- tests/test_sense2vec.py | 3 +-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 9a1bfaf..576aab3 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -201,10 +201,7 @@ def most_similar( if key not in self: raise ValueError(f"Can't find key {key} in table") if len(self.vectors) < n_similar: - raise ValueError( - f"Can't get {n} most similar out of {len(self.vectors)} total " - f"entries in the table while excluding the {len(keys)} keys" - ) + n_similar = len(self.vectors) vecs = numpy.vstack([self[key] for key in keys]) average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 5ef8c3e..6c60445 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -99,8 +99,7 @@ def test_sense2vec_most_similar(): result3 = s2v.most_similar(["a", "b"], n=3) assert len(result3) == 3 assert "y" not in [key for key, _ in result3] - with pytest.raises(ValueError): - s2v.most_similar(["a", "b"], n=10) # not enough keys left in the table + assert len(s2v.most_similar(["a", "b"], n=10)) == 4 with pytest.raises(ValueError): s2v.most_similar(["z"], n=1) # key not in table From d60b2fcf3aeab1bf96199c17ff621779cea51212 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 15:42:39 +0100 Subject: [PATCH 205/297] Update README.md --- README.md | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index c4030a0..af2bbd8 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ lets you explore semantic similarities across all Reddit comments of 2015. This library is a simple Python implementation for loading and querying sense2vec models. -🦆 **Version 1.0 alpha out now!** +🦆 **Version 1.0 out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) @@ -92,22 +92,29 @@ pip install streamlit streamlit run https://raw.githubusercontent.com/explosion/sense2vec/master/scripts/streamlit_sense2vec.py /path/to/vectors ``` -## ⏳ Installation & Setup +### Pretrained vectors + +To use the vectors, download the `.tar.gz` archive and pass the extracted +directory to `Sense2Vec.from_disk` or `Sense2VecComponent.from_disk`. The vector +files are **attached to the GitHub release**. Large files have been split into +multi-part downloads. -> ️🚨 **This is an alpha release so you need to specify the explicit version -> during installation. 
The pre-packaged vectors are just a converted version of -> the old model and will be updated for the stable release.** +| Vectors | Size | Description | 📥 Download (zipped) | +| -------------------- | -----: | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `s2v_reddit_2019_lg` | 4 GB | Reddit comments 2019 (01-07) | [part 1](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.zip), [part 2](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.z01), [part 3](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.z02) | +| `s2v_reddit_2015_md` | 573 MB | Reddit comments 2015 | [part 1](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz) | + +## ⏳ Installation & Setup sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a6 +pip install sense2vec ``` -The Reddit vectors model is attached to -[this release](https://github.com/explosion/sense2vec/releases/tag/v1.0.0a2). To -load it in, download the `.tar.gz` archive, unpack it and point `from_disk` to -the extracted data directory: +To use pretrained vectors, download +[one of the vector packages](#pretrained-vectors), unpack the `.tar.gz` archive +and point `from_disk` to the extracted data directory: ```python from sense2vec import Sense2Vec From 58364eaf4b703e91c6c3b65b83d6b16c0d6c8ce2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 16:44:45 +0100 Subject: [PATCH 206/297] Increment version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 8e8842e..69cd5f0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a6 +version = 1.0.0a7 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From d83cfb997fd1375442959114452ada7a8eda855a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 16:47:36 +0100 Subject: [PATCH 207/297] Allow limiting get_best_sense to certain senses --- README.md | 12 +++++++----- sense2vec/sense2vec.py | 11 ++++++++--- tests/test_sense2vec.py | 2 ++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c4030a0..434aa2d 100644 --- a/README.md +++ b/README.md @@ -454,14 +454,16 @@ other_senses = s2v.get_other_senses("duck|NOUN") Find the best-matching sense for a given word based on the available senses and frequency counts. Returns `None` if no match is found. -| Argument | Type | Description | -| ------------- | ------- | ----------------------------------------------------------------- | -| `word` | unicode | The word to check. | -| `ignore_case` | bool | Check for uppercase, lowercase and titlecase. Defaults to `True`. | -| **RETURNS** | unicode | The best-matching key or None. | +| Argument | Type | Description | +| ------------- | ------- | ------------------------------------------------------------------------------------------------------- | +| `word` | unicode | The word to check. | +| `senses` | list | Optional list of senses to limit the search to. If not set / empty, all senses in the vectors are used. 
| +| `ignore_case` | bool | Check for uppercase, lowercase and titlecase. Defaults to `True`. | +| **RETURNS** | unicode | The best-matching key or None. | ```python assert s2v.get_best_sense("duck") == "duck|NOUN" +assert s2v.get_best_sense("duck", ["VERB", "ADJ"]) == "duck|VERB" ``` #### method `Sense2Vec.to_bytes` diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 576aab3..0c7ebf7 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -233,20 +233,25 @@ def get_other_senses( result.append(new_key) return result - def get_best_sense(self, word: str, ignore_case: bool = True) -> Union[str, None]: + def get_best_sense( + self, word: str, senses: Sequence[str] = tuple(), ignore_case: bool = True + ) -> Union[str, None]: """Find the best-matching sense for a given word based on the available senses and frequency counts. Returns None if no match is found. word (unicode): The word to check. + senses (list): Limit checks to senses. If not set / empty, all senses + in the vectors are used. ignore_case (bool): Check for uppercase, lowercase and titlecase. RETURNS (unicode): The best-matching key or None. """ - if not self.senses: + sense_options = senses or self.senses + if not sense_options: return None versions = [word, word.upper(), word.title()] if ignore_case else [word] freqs = [] for text in versions: - for sense in self.senses: + for sense in sense_options: key = self.make_key(text, sense) if key in self: freq = self.get_freq(key, -1) diff --git a/tests/test_sense2vec.py b/tests/test_sense2vec.py index 6c60445..fbf4657 100644 --- a/tests/test_sense2vec.py +++ b/tests/test_sense2vec.py @@ -62,6 +62,8 @@ def test_sense2vec_best_sense(): assert s2v.get_best_sense("c") is None s2v.cfg["senses"] = [] assert s2v.get_best_sense("a") is None + assert s2v.get_best_sense("b", ["A"]) == "b|A" + assert s2v.get_best_sense("b", ["A", "C"]) == "B|C" def test_sense2vec_similarity(): From aa6f0c23aef8691784305d43e5f1275e85fcf028 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 16:48:07 +0100 Subject: [PATCH 208/297] Increment version --- README.md | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 434aa2d..a9c76bf 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ streamlit run https://raw.githubusercontent.com/explosion/sense2vec/master/scrip sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a6 +pip install sense2vec==1.0.0a8 ``` The Reddit vectors model is attached to diff --git a/setup.cfg b/setup.cfg index 69cd5f0..2a11bf0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a7 +version = 1.0.0a8 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From bf5c3a9ec26d5dd75d95d0c6c4c70f02220c7fda Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Tue, 19 Nov 2019 16:55:29 +0100 Subject: [PATCH 209/297] Fix docstring [ci skip] --- sense2vec/sense2vec.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 0c7ebf7..93127ec 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -240,8 +240,8 @@ def get_best_sense( senses and frequency counts. Returns None if no match is found. word (unicode): The word to check. - senses (list): Limit checks to senses. If not set / empty, all senses - in the vectors are used. + senses (list): Optional list of senses to limit the search to. 
If not + set / empty, all senses in the vectors are used. ignore_case (bool): Check for uppercase, lowercase and titlecase. RETURNS (unicode): The best-matching key or None. """ From 7e8d72aff4a048abff0ffaf81fd6c8b0ded628b6 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Wed, 20 Nov 2019 13:39:11 +0100 Subject: [PATCH 210/297] Add script to cache neighbors --- scripts/06_precompute_neighbors.py | 80 ++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 scripts/06_precompute_neighbors.py diff --git a/scripts/06_precompute_neighbors.py b/scripts/06_precompute_neighbors.py new file mode 100644 index 0000000..5a9f0ba --- /dev/null +++ b/scripts/06_precompute_neighbors.py @@ -0,0 +1,80 @@ +"""Precompute nearest-neighbour queries for every entry in the vocab.""" +import plac +import tqdm +import numpy +import srsly +from wasabi import msg + + +@plac.annotations( + vectors_npz=("Path to vectors file from a sense2vec model.", "positional"), + output_path=( + "Path to the output file, which will be msgpack formatted.", + "positional", + ), + gpu_id=("GPU device (-1 for CPU)", "option", "g", int), + n_neighbors=("Number of neighbors to cache", "option", "n", int), + batch_size=("Batch size for to reduce memory usage.", "option", "b", int), + cutoff=( + ( + "Limit neighbors to this many earliest rows. " + "For instance, if cutoff is 100000, no word will have a nearest neighbor " + "outside of the top 100k vectors." + ), + "option", + "C", + int, + ), +) +def main( + vectors_npz, output_path, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0 +): + if gpu_id == -1: + xp = numpy + else: + import cupy as xp + import cupy.cuda.device + + device = cupy.cuda.device.Device(gpu_id) + device.use() + + msg.info(f"Loading vectors from {vectors_npz}") + vectors = xp.load(str(vectors_npz)) + msg.good(f"Loaded {vectors.shape[0]} with dimension {vectors.shape[1]}") + norms = xp.linalg.norm(vectors, axis=1, keepdims=True) + norms[norms == 0] = 1 + # Normalize to unit norm + vectors /= norms + if cutoff < 1: + cutoff = vectors.shape[0] + msg.good(f"Normalized. (mean {norms.mean():.2f}, var. {norms.var():.2f})") + msg.info(f"Finding {n_neighbors} neighbors, among {cutoff} most frequent.") + best_rows = xp.zeros((vectors.shape[0], n_neighbors), dtype="i") + scores = xp.zeros((vectors.shape[0], n_neighbors), dtype="f") + # Pre-allocate this array, so we can use it each time. + subset = xp.ascontiguousarray(vectors[:cutoff]) + sims = xp.zeros((batch_size, cutoff), dtype="f") + for i in tqdm.tqdm(list(range(0, vectors.shape[0], batch_size))): + batch = vectors[i : i + batch_size] + # batch e.g. (1024, 300) + # vectors e.g. (10000, 300) + # sims e.g. (1024, 10000) + if batch.shape[0] == sims.shape[0]: + xp.dot(batch, subset.T, out=sims) + else: + # In the last batch we'll have a different size. + sims = xp.dot(batch, subset.T) + size = sims.shape[0] + batch_indices = xp.argpartition(sims, -n_neighbors, axis=1)[:, -n_neighbors:] + # God, I hate numpy. There must be a way to write this without the loop. 
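As a side note on the loop the comment above complains about: NumPy ≥ 1.15 can do this gather with `take_along_axis`, which later commits in this series adopt (together with a small CuPy backport). A self-contained sketch of the equivalence, independent of the script:

```python
import numpy as np

sims = np.random.rand(8, 50).astype("f")
n_neighbors = 5
batch_indices = np.argpartition(sims, -n_neighbors, axis=1)[:, -n_neighbors:]

# Loop-free gather: for every row i, pick sims[i, batch_indices[i]].
batch_scores = np.take_along_axis(sims, batch_indices, axis=1)

# Same result as the explicit per-row loop used in the script.
looped = np.zeros((sims.shape[0], n_neighbors), dtype="f")
for i in range(batch_indices.shape[0]):
    looped[i] = sims[i, batch_indices[i]]
assert np.allclose(batch_scores, looped)
```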
+ batch_scores = xp.zeros((size, n_neighbors), dtype="f") + for i in range(batch_indices.shape[0]): + batch_scores[i] = sims[i, batch_indices[i]] + best_rows[i : i + size] = batch_indices + scores[i : i + size] = batch_scores + msg.info("Saving output") + srsly.write_msgpack(output_path, {"indices": best_rows, "scores": scores,}) + + +if __name__ == "__main__": + plac.call(main) From 27fe3fa9ee508f2b770328ea1e5e853b04f382a2 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Nov 2019 16:52:06 +0100 Subject: [PATCH 211/297] Update script and add docs --- README.md | 1 + ...te_neighbors.py => 06_precompute_cache.py} | 58 ++++++++++--------- 2 files changed, 32 insertions(+), 27 deletions(-) rename scripts/{06_precompute_neighbors.py => 06_precompute_cache.py} (53%) diff --git a/README.md b/README.md index a9c76bf..fa3cc75 100644 --- a/README.md +++ b/README.md @@ -699,6 +699,7 @@ clone and `make`. | **3.** | [`03_glove_build_counts.py`](scripts/03_glove_build_counts.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) to build the vocabulary and counts. Skip this step if you're using Word2Vec via [FastText](https://github.com/facebookresearch/fastText). | | **4.** | [`04_glove_train_vectors.py`](scripts/04_glove_train_vectors.py)
[`04_fasttext_train_vectors.py`](scripts/04_fasttext_train_vectors.py) | Use [GloVe](https://github.com/stanfordnlp/GloVe) or [FastText](https://github.com/facebookresearch/fastText) to train vectors. | | **5.** | [`05_export.py`](scripts/05_export.py) | Load the vectors and frequencies and output a sense2vec component that can be loaded via `Sense2Vec.from_disk`. | +| **6.** | [`06_precompute_cache.py`](scripts/06_precompute_cache.py) | **Optional:** Precompute nearest-neighbor queries for every entry in the vocab to make `Sense2Vec.most_similar` faster. | For more detailed documentation of the scripts, check out the source or run them with `--help`. For example, `python scripts/01_parse.py --help`. diff --git a/scripts/06_precompute_neighbors.py b/scripts/06_precompute_cache.py similarity index 53% rename from scripts/06_precompute_neighbors.py rename to scripts/06_precompute_cache.py index 5a9f0ba..2881b29 100644 --- a/scripts/06_precompute_neighbors.py +++ b/scripts/06_precompute_cache.py @@ -1,34 +1,28 @@ -"""Precompute nearest-neighbour queries for every entry in the vocab.""" import plac import tqdm import numpy import srsly from wasabi import msg +from pathlib import Path @plac.annotations( - vectors_npz=("Path to vectors file from a sense2vec model.", "positional"), - output_path=( - "Path to the output file, which will be msgpack formatted.", - "positional", - ), + vectors=("Path to sense2vec component directory", "positional", None, str), gpu_id=("GPU device (-1 for CPU)", "option", "g", int), n_neighbors=("Number of neighbors to cache", "option", "n", int), batch_size=("Batch size for to reduce memory usage.", "option", "b", int), - cutoff=( - ( - "Limit neighbors to this many earliest rows. " - "For instance, if cutoff is 100000, no word will have a nearest neighbor " - "outside of the top 100k vectors." - ), - "option", - "C", - int, - ), + cutoff=("Limit neighbors to this many earliest rows", "option", "C", int,), ) -def main( - vectors_npz, output_path, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0 -): +def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): + """ + Step 6: Precompute nearest-neighbor queries (optional) + + Precompute nearest-neighbor queries for every entry in the vocab to make + Sense2Vec.most_similar faster. The --cutoff option lets you define the + number of earliest rows to limit the neighbors to. For instance, if cutoff + is 100000, no word will have a nearest neighbor outside of the top 100k + vectors. + """ if gpu_id == -1: xp = numpy else: @@ -38,17 +32,22 @@ def main( device = cupy.cuda.device.Device(gpu_id) device.use() - msg.info(f"Loading vectors from {vectors_npz}") - vectors = xp.load(str(vectors_npz)) - msg.good(f"Loaded {vectors.shape[0]} with dimension {vectors.shape[1]}") + vectors_dir = Path(vectors) + vectors_file = vectors_dir / "vectors" + if not vectors_dir.is_dir() or not vectors_file.exists(): + err = "Are you passing in the exported sense2vec directory containing a vectors file?" + msg.fail(f"Can't load vectors from {vectors}", err, exits=1) + with msg.loading(f"Loading vectors from {vectors}"): + vectors = xp.load(str(vectors_file)) + msg.good(f"Loaded {vectors.shape[0]:;} vectors with dimension {vectors.shape[1]}") norms = xp.linalg.norm(vectors, axis=1, keepdims=True) norms[norms == 0] = 1 # Normalize to unit norm vectors /= norms if cutoff < 1: cutoff = vectors.shape[0] - msg.good(f"Normalized. (mean {norms.mean():.2f}, var. 
{norms.var():.2f})") - msg.info(f"Finding {n_neighbors} neighbors, among {cutoff} most frequent.") + msg.good(f"Normalized (mean {norms.mean():,.2f}, variance {norms.var():,.2f})") + msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent") best_rows = xp.zeros((vectors.shape[0], n_neighbors), dtype="i") scores = xp.zeros((vectors.shape[0], n_neighbors), dtype="f") # Pre-allocate this array, so we can use it each time. @@ -72,9 +71,14 @@ def main( batch_scores[i] = sims[i, batch_indices[i]] best_rows[i : i + size] = batch_indices scores[i : i + size] = batch_scores - msg.info("Saving output") - srsly.write_msgpack(output_path, {"indices": best_rows, "scores": scores,}) + output_file = vectors_dir / "cache" + with msg.loading("Saving output..."): + srsly.write_msgpack(output_file, {"indices": best_rows, "scores": scores}) + msg.good(f"Saved cache to {output_file}") if __name__ == "__main__": - plac.call(main) + try: + plac.call(main) + except KeyboardInterrupt: + msg.warn("Cancelled.") From 2c9699daad41c959e3f5cc287e7c6be56c373e22 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 20 Nov 2019 16:53:10 +0100 Subject: [PATCH 212/297] Fix typo --- scripts/06_precompute_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 2881b29..0bac236 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -39,7 +39,7 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): msg.fail(f"Can't load vectors from {vectors}", err, exits=1) with msg.loading(f"Loading vectors from {vectors}"): vectors = xp.load(str(vectors_file)) - msg.good(f"Loaded {vectors.shape[0]:;} vectors with dimension {vectors.shape[1]}") + msg.good(f"Loaded {vectors.shape[0]:,} vectors with dimension {vectors.shape[1]}") norms = xp.linalg.norm(vectors, axis=1, keepdims=True) norms[norms == 0] = 1 # Normalize to unit norm From d18a80dd4caa9ba014419df455ed418b14ad3bc0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 00:33:50 +0100 Subject: [PATCH 213/297] Support cache --- sense2vec/sense2vec.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 93127ec..c69c55a 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -32,6 +32,7 @@ def __init__( self.vectors = Vectors(shape=shape, name=vectors_name) self.strings = StringStore() if strings is None else strings self.freqs: Dict[int, int] = {} + self.cache = None self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"} self.cfg.update(overrides) @@ -202,6 +203,16 @@ def most_similar( raise ValueError(f"Can't find key {key} in table") if len(self.vectors) < n_similar: n_similar = len(self.vectors) + if self.cache: + indices = self.cache.get("indices", []) + scores = self.cache.get("scores", []) + if len(indices) >= n_similar: + key_row = self.vectors.find(key=key) + result_keys = self.vectors.find(rows=indices[key_row][:n_similar]) + result_scores = scores[key_row][:n_similar] + result = list(zip(result_keys, result_scores)) + result = [(self.strings[k], s) for k, s in result if k not in keys] + return result vecs = numpy.vstack([self[key] for key in keys]) average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( @@ -269,6 +280,8 @@ def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: data = {"vectors": vectors_bytes, "cfg": self.cfg, "freqs": freqs} if "strings" 
not in exclude: data["strings"] = self.strings.to_bytes() + if "cache" not in exclude: + data["cache"] = self.cache return srsly.msgpack_dumps(data) def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): @@ -284,6 +297,8 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: self.strings = StringStore().from_bytes(data["strings"]) + if "cache" not in exclude and "cache" in data: + self.cache = data.get("cache", {}) return self def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): @@ -298,6 +313,8 @@ def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): srsly.write_json(path / "freqs.json", list(self.freqs.items())) if "strings" not in exclude: self.strings.to_disk(path / "strings.json") + if "cache" not in exclude and self.cache: + srsly.write_msgpack(path / "cache", self.cache) def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a directory. @@ -309,10 +326,13 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): path = Path(path) strings_path = path / "strings.json" freqs_path = path / "freqs.json" + cache_path = path / "cache" self.vectors = Vectors().from_disk(path) self.cfg.update(srsly.read_json(path / "cfg")) if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) if "strings" not in exclude and strings_path.exists(): self.strings = StringStore().from_disk(strings_path) + if "cache" not in exclude and cache_path.exists(): + self.cache = srsly.read_msgpack(cache_path) return self From 4e8d85f8b2af4cb0bde831f208a6b481d092a82b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 01:25:31 +0100 Subject: [PATCH 214/297] Fix option shortcut --- scripts/06_precompute_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 0bac236..6ea8da9 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -11,7 +11,7 @@ gpu_id=("GPU device (-1 for CPU)", "option", "g", int), n_neighbors=("Number of neighbors to cache", "option", "n", int), batch_size=("Batch size for to reduce memory usage.", "option", "b", int), - cutoff=("Limit neighbors to this many earliest rows", "option", "C", int,), + cutoff=("Limit neighbors to this many earliest rows", "option", "c", int,), ) def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): """ From a2d4238f3348f820ecfcc92be83c961cc47b26a9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 01:25:41 +0100 Subject: [PATCH 215/297] Update caching logic --- sense2vec/sense2vec.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index c69c55a..6f8671a 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -208,11 +208,9 @@ def most_similar( scores = self.cache.get("scores", []) if len(indices) >= n_similar: key_row = self.vectors.find(key=key) - result_keys = self.vectors.find(rows=indices[key_row][:n_similar]) - result_scores = scores[key_row][:n_similar] - result = list(zip(result_keys, result_scores)) - result = [(self.strings[k], s) for k, s in result if k not in keys] - return result + sim_keys = self.vectors.find(rows=indices[key_row][:n_similar]) + sim_scores = scores[key_row][:n_similar] + return [(self.strings[k], s) for k, s in zip(sim_keys, 
sim_scores)] vecs = numpy.vstack([self[key] for key in keys]) average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( From caaf095253117aeb96a08c644e7f31f87a3f6510 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 01:37:17 +0100 Subject: [PATCH 216/297] Fix precompute_cache script --- scripts/06_precompute_cache.py | 151 ++++++++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 20 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 6ea8da9..ac5958e 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -6,28 +6,24 @@ from pathlib import Path + @plac.annotations( vectors=("Path to sense2vec component directory", "positional", None, str), gpu_id=("GPU device (-1 for CPU)", "option", "g", int), n_neighbors=("Number of neighbors to cache", "option", "n", int), batch_size=("Batch size for to reduce memory usage.", "option", "b", int), cutoff=("Limit neighbors to this many earliest rows", "option", "c", int,), + start=("Index of vectors to start at.", "option", "s", int), + end=("Index of vectors to stop at.", "option", "e", int), ) def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): - """ - Step 6: Precompute nearest-neighbor queries (optional) - - Precompute nearest-neighbor queries for every entry in the vocab to make - Sense2Vec.most_similar faster. The --cutoff option lets you define the - number of earliest rows to limit the neighbors to. For instance, if cutoff - is 100000, no word will have a nearest neighbor outside of the top 100k - vectors. - """ if gpu_id == -1: xp = numpy else: import cupy as xp import cupy.cuda.device + cupy.take_along_axis = take_along_axis + cupy.put_along_axis = put_along_axis device = cupy.cuda.device.Device(gpu_id) device.use() @@ -46,14 +42,18 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): vectors /= norms if cutoff < 1: cutoff = vectors.shape[0] + if end is None: + end = vectors.shape[0] msg.good(f"Normalized (mean {norms.mean():,.2f}, variance {norms.var():,.2f})") msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent") - best_rows = xp.zeros((vectors.shape[0], n_neighbors), dtype="i") - scores = xp.zeros((vectors.shape[0], n_neighbors), dtype="f") + best_rows = xp.zeros((end - start, n_neighbors), dtype="i") + scores = xp.zeros((end - start, n_neighbors), dtype="f") # Pre-allocate this array, so we can use it each time. subset = xp.ascontiguousarray(vectors[:cutoff]) sims = xp.zeros((batch_size, cutoff), dtype="f") - for i in tqdm.tqdm(list(range(0, vectors.shape[0], batch_size))): + n = n_neighbors + indices = xp.arange(cutoff).reshape((-1, 1)) + for i in tqdm.tqdm(list(range(start, end, batch_size))): batch = vectors[i : i + batch_size] # batch e.g. (1024, 300) # vectors e.g. (10000, 300) @@ -64,19 +64,130 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): # In the last batch we'll have a different size. sims = xp.dot(batch, subset.T) size = sims.shape[0] - batch_indices = xp.argpartition(sims, -n_neighbors, axis=1)[:, -n_neighbors:] - # God, I hate numpy. There must be a way to write this without the loop. - batch_scores = xp.zeros((size, n_neighbors), dtype="f") - for i in range(batch_indices.shape[0]): - batch_scores[i] = sims[i, batch_indices[i]] - best_rows[i : i + size] = batch_indices - scores[i : i + size] = batch_scores + # Zero out the self-scores, to avoid returning self as a neighbor. 
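The zeroing described by the comment above broadcasts one shared row of column indices across the whole batch. A toy NumPy sketch of that `put_along_axis` call (the patch routes it through `xp` and adds its own CuPy backport, so the same line also runs on the GPU):

```python
import numpy as np

i, batch_size, cutoff = 2, 4, 10                       # batch starting at global row 2
sims = np.random.rand(batch_size, cutoff).astype("f")
# One shared row of indices, broadcast to every batch row: columns
# i..i+batch_size-1 are zeroed for the whole batch, which covers each
# row's own column (global row i + j).
self_indices = np.arange(i, min(i + batch_size, sims.shape[1])).reshape((1, -1))
np.put_along_axis(sims, self_indices, 0.0, axis=1)
assert all(sims[j, i + j] == 0.0 for j in range(batch_size))
```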
+ self_indices = xp.arange(i, min(i+size, sims.shape[1])).reshape((1, -1)) + xp.put_along_axis(sims, self_indices, 0., axis=1) + # Get the indices and scores for the top N most similar for each in the + # batch. This is a bit complicated, to avoid sorting all of the scores + # -- we only want the top N to be sorted (which we do later). For now, + # we use argpartition to just get the cut point. + neighbors = xp.argpartition(sims, -n, axis=1)[:, -n:] + neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:] + # Can't figure out how to do this without the loop. + for j in range(min(end-i, size)): + best_rows[i+j] = neighbors[j] + scores[i+j] = neighbor_sims[j] + # Sort in reverse order + indices = xp.argsort(scores, axis=1)[:, ::-1] + scores = xp.take_along_axis(scores, indices, axis=1) + best_rows = xp.take_along_axis(best_rows, indices, axis=1) + msg.info("Saving output") + if not isinstance(best_rows, numpy.ndarray): + best_rows = best_rows.get() + if not isinstance(scores, numpy.ndarray): + scores = scores.get() + output = { + "indices": best_rows, + "scores": scores.astype("float16"), + "start": start, + "end": end, + "cutoff": cutoff + } output_file = vectors_dir / "cache" with msg.loading("Saving output..."): - srsly.write_msgpack(output_file, {"indices": best_rows, "scores": scores}) + srsly.write_msgpack(output_file, output) msg.good(f"Saved cache to {output_file}") +# These functions are missing from cupy, but will be supported in cupy 7. +def take_along_axis(a, indices, axis): + """Take values from the input array by matching 1d index and data slices. + + Args: + a (cupy.ndarray): Array to extract elements. + indices (cupy.ndarray): Indices to take along each 1d slice of ``a``. + axis (int): The axis to take 1d slices along. + + Returns: + cupy.ndarray: The indexed result. + + .. seealso:: :func:`numpy.take_along_axis` + """ + import cupy + + if indices.dtype.kind not in ('i', 'u'): + raise IndexError('`indices` must be an integer array') + + if axis is None: + a = a.ravel() + axis = 0 + + ndim = a.ndim + + if not (-ndim <= axis < ndim): + raise _errors._AxisError('Axis overrun') + + axis %= a.ndim + + if ndim != indices.ndim: + raise ValueError( + '`indices` and `a` must have the same number of dimensions') + + fancy_index = [] + for i, n in enumerate(a.shape): + if i == axis: + fancy_index.append(indices) + else: + ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) + fancy_index.append(cupy.arange(n).reshape(ind_shape)) + + return a[fancy_index] + +def put_along_axis(a, indices, value, axis): + """Take values from the input array by matching 1d index and data slices. + + Args: + a (cupy.ndarray): Array to extract elements. + indices (cupy.ndarray): Indices to take along each 1d slice of ``a``. + axis (int): The axis to take 1d slices along. + + Returns: + cupy.ndarray: The indexed result. + + .. 
seealso:: :func:`numpy.take_along_axis` + """ + import cupy + + if indices.dtype.kind not in ('i', 'u'): + raise IndexError('`indices` must be an integer array') + + if axis is None: + a = a.ravel() + axis = 0 + + ndim = a.ndim + + if not (-ndim <= axis < ndim): + raise _errors._AxisError('Axis overrun') + + axis %= a.ndim + + if ndim != indices.ndim: + raise ValueError( + '`indices` and `a` must have the same number of dimensions') + + fancy_index = [] + for i, n in enumerate(a.shape): + if i == axis: + fancy_index.append(indices) + else: + ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) + fancy_index.append(cupy.arange(n).reshape(ind_shape)) + a[fancy_index] = value + + + + if __name__ == "__main__": try: plac.call(main) From b97f0fdccd9e06296f6bad9e8ed5e93731481019 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 01:37:54 +0100 Subject: [PATCH 217/297] Format --- scripts/06_precompute_cache.py | 37 ++++++++++++++++------------------ 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index ac5958e..48f4fd2 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -6,7 +6,6 @@ from pathlib import Path - @plac.annotations( vectors=("Path to sense2vec component directory", "positional", None, str), gpu_id=("GPU device (-1 for CPU)", "option", "g", int), @@ -22,6 +21,7 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): else: import cupy as xp import cupy.cuda.device + cupy.take_along_axis = take_along_axis cupy.put_along_axis = put_along_axis @@ -65,8 +65,8 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): sims = xp.dot(batch, subset.T) size = sims.shape[0] # Zero out the self-scores, to avoid returning self as a neighbor. - self_indices = xp.arange(i, min(i+size, sims.shape[1])).reshape((1, -1)) - xp.put_along_axis(sims, self_indices, 0., axis=1) + self_indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((1, -1)) + xp.put_along_axis(sims, self_indices, 0.0, axis=1) # Get the indices and scores for the top N most similar for each in the # batch. This is a bit complicated, to avoid sorting all of the scores # -- we only want the top N to be sorted (which we do later). For now, @@ -74,9 +74,9 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): neighbors = xp.argpartition(sims, -n, axis=1)[:, -n:] neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:] # Can't figure out how to do this without the loop. 
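For reference, the "sort in reverse order" step that follows this hunk keeps indices and scores aligned by pairing `argsort` with `take_along_axis`. A small NumPy sketch of that idiom (illustrative only, not taken from the patch):

```python
import numpy as np

best_rows = np.array([[7, 3, 9], [1, 4, 2]])
scores = np.array([[0.2, 0.9, 0.5], [0.8, 0.1, 0.4]], dtype="f")

order = np.argsort(scores, axis=1)[:, ::-1]            # descending order per row
scores = np.take_along_axis(scores, order, axis=1)
best_rows = np.take_along_axis(best_rows, order, axis=1)

assert best_rows[0].tolist() == [3, 9, 7]              # rows stay aligned with scores
assert np.allclose(scores[0], [0.9, 0.5, 0.2])
```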
- for j in range(min(end-i, size)): - best_rows[i+j] = neighbors[j] - scores[i+j] = neighbor_sims[j] + for j in range(min(end - i, size)): + best_rows[i + j] = neighbors[j] + scores[i + j] = neighbor_sims[j] # Sort in reverse order indices = xp.argsort(scores, axis=1)[:, ::-1] scores = xp.take_along_axis(scores, indices, axis=1) @@ -90,8 +90,8 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): "indices": best_rows, "scores": scores.astype("float16"), "start": start, - "end": end, - "cutoff": cutoff + "end": end, + "cutoff": cutoff, } output_file = vectors_dir / "cache" with msg.loading("Saving output..."): @@ -115,8 +115,8 @@ def take_along_axis(a, indices, axis): """ import cupy - if indices.dtype.kind not in ('i', 'u'): - raise IndexError('`indices` must be an integer array') + if indices.dtype.kind not in ("i", "u"): + raise IndexError("`indices` must be an integer array") if axis is None: a = a.ravel() @@ -125,13 +125,12 @@ def take_along_axis(a, indices, axis): ndim = a.ndim if not (-ndim <= axis < ndim): - raise _errors._AxisError('Axis overrun') + raise _errors._AxisError("Axis overrun") axis %= a.ndim if ndim != indices.ndim: - raise ValueError( - '`indices` and `a` must have the same number of dimensions') + raise ValueError("`indices` and `a` must have the same number of dimensions") fancy_index = [] for i, n in enumerate(a.shape): @@ -143,6 +142,7 @@ def take_along_axis(a, indices, axis): return a[fancy_index] + def put_along_axis(a, indices, value, axis): """Take values from the input array by matching 1d index and data slices. @@ -158,8 +158,8 @@ def put_along_axis(a, indices, value, axis): """ import cupy - if indices.dtype.kind not in ('i', 'u'): - raise IndexError('`indices` must be an integer array') + if indices.dtype.kind not in ("i", "u"): + raise IndexError("`indices` must be an integer array") if axis is None: a = a.ravel() @@ -168,13 +168,12 @@ def put_along_axis(a, indices, value, axis): ndim = a.ndim if not (-ndim <= axis < ndim): - raise _errors._AxisError('Axis overrun') + raise _errors._AxisError("Axis overrun") axis %= a.ndim if ndim != indices.ndim: - raise ValueError( - '`indices` and `a` must have the same number of dimensions') + raise ValueError("`indices` and `a` must have the same number of dimensions") fancy_index = [] for i, n in enumerate(a.shape): @@ -186,8 +185,6 @@ def put_along_axis(a, indices, value, axis): a[fancy_index] = value - - if __name__ == "__main__": try: plac.call(main) From d1a4562521ad84a3b7b32d649f5d42bf7882e080 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 01:53:30 +0100 Subject: [PATCH 218/297] Fix cache --- scripts/06_precompute_cache.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 48f4fd2..0d6592b 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -15,7 +15,7 @@ start=("Index of vectors to start at.", "option", "s", int), end=("Index of vectors to stop at.", "option", "e", int), ) -def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): +def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None): if gpu_id == -1: xp = numpy else: @@ -44,14 +44,16 @@ def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0): cutoff = vectors.shape[0] if end is None: end = vectors.shape[0] - msg.good(f"Normalized (mean {norms.mean():,.2f}, variance {norms.var():,.2f})") + mean = float(norms.mean()) 
+ var = float(norms.var()) + msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})") msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent") - best_rows = xp.zeros((end - start, n_neighbors), dtype="i") - scores = xp.zeros((end - start, n_neighbors), dtype="f") + n = min(n_neighbors, vectors.shape[0]) + best_rows = xp.zeros((end - start, n), dtype="i") + scores = xp.zeros((end - start, n), dtype="f") # Pre-allocate this array, so we can use it each time. subset = xp.ascontiguousarray(vectors[:cutoff]) sims = xp.zeros((batch_size, cutoff), dtype="f") - n = n_neighbors indices = xp.arange(cutoff).reshape((-1, 1)) for i in tqdm.tqdm(list(range(start, end, batch_size))): batch = vectors[i : i + batch_size] From 4a52c697996cb4f98a02021a97f2eb52479de04d Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 01:58:44 +0100 Subject: [PATCH 219/297] Update 06_precompute_cache.py --- scripts/06_precompute_cache.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 0d6592b..a724b14 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -15,7 +15,18 @@ start=("Index of vectors to start at.", "option", "s", int), end=("Index of vectors to stop at.", "option", "e", int), ) -def main(vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None): +def main( + vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None +): + """ + Step 6: Precompute nearest-neighbor queries (optional) + + Precompute nearest-neighbor queries for every entry in the vocab to make + Sense2Vec.most_similar faster. The --cutoff option lets you define the + number of earliest rows to limit the neighbors to. For instance, if cutoff + is 100000, no word will have a nearest neighbor outside of the top 100k + vectors. 
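Put concretely, a run that caches 100 neighbors among the 100k most frequent rows might look like this (the path is a placeholder; the short flags correspond to the plac annotations above):

```bash
python scripts/06_precompute_cache.py /path/to/s2v_reddit_2015_md -n 100 -c 100000 -b 1024
```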
+ """ if gpu_id == -1: xp = numpy else: @@ -127,7 +138,7 @@ def take_along_axis(a, indices, axis): ndim = a.ndim if not (-ndim <= axis < ndim): - raise _errors._AxisError("Axis overrun") + raise IndexError("Axis overrun") axis %= a.ndim @@ -170,7 +181,7 @@ def put_along_axis(a, indices, value, axis): ndim = a.ndim if not (-ndim <= axis < ndim): - raise _errors._AxisError("Axis overrun") + raise IndexError("Axis overrun") axis %= a.ndim From b80fbb013b769e018a5444505016e71fa4566ea0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 01:59:02 +0100 Subject: [PATCH 220/297] Add cache to test model --- tests/data/cache | Bin 0 -> 270 bytes 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/data/cache diff --git a/tests/data/cache b/tests/data/cache new file mode 100644 index 0000000000000000000000000000000000000000..f801136664af927ac1d71035c743384bcceeb9a3 GIT binary patch literal 270 zcmZo&o|%`DnVeeOdW0!2> zB!z{6fq@x_nShuPh(UnbV#|t?^NY|Om1cy~QAQAFkU}mkE=epZVOX4+m%_R%xwIrd HEsYfbS&K1H literal 0 HcmV?d00001 From e3690e119735620109285ffa2569d7279c54a059 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:42:17 +0100 Subject: [PATCH 221/297] Don't zero out set self scores --- scripts/06_precompute_cache.py | 48 ---------------------------------- sense2vec/sense2vec.py | 3 ++- 2 files changed, 2 insertions(+), 49 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index a724b14..d1e19ea 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -34,11 +34,8 @@ def main( import cupy.cuda.device cupy.take_along_axis = take_along_axis - cupy.put_along_axis = put_along_axis - device = cupy.cuda.device.Device(gpu_id) device.use() - vectors_dir = Path(vectors) vectors_file = vectors_dir / "vectors" if not vectors_dir.is_dir() or not vectors_file.exists(): @@ -77,9 +74,6 @@ def main( # In the last batch we'll have a different size. sims = xp.dot(batch, subset.T) size = sims.shape[0] - # Zero out the self-scores, to avoid returning self as a neighbor. - self_indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((1, -1)) - xp.put_along_axis(sims, self_indices, 0.0, axis=1) # Get the indices and scores for the top N most similar for each in the # batch. This is a bit complicated, to avoid sorting all of the scores # -- we only want the top N to be sorted (which we do later). For now, @@ -156,48 +150,6 @@ def take_along_axis(a, indices, axis): return a[fancy_index] -def put_along_axis(a, indices, value, axis): - """Take values from the input array by matching 1d index and data slices. - - Args: - a (cupy.ndarray): Array to extract elements. - indices (cupy.ndarray): Indices to take along each 1d slice of ``a``. - axis (int): The axis to take 1d slices along. - - Returns: - cupy.ndarray: The indexed result. - - .. 
seealso:: :func:`numpy.take_along_axis` - """ - import cupy - - if indices.dtype.kind not in ("i", "u"): - raise IndexError("`indices` must be an integer array") - - if axis is None: - a = a.ravel() - axis = 0 - - ndim = a.ndim - - if not (-ndim <= axis < ndim): - raise IndexError("Axis overrun") - - axis %= a.ndim - - if ndim != indices.ndim: - raise ValueError("`indices` and `a` must have the same number of dimensions") - - fancy_index = [] - for i, n in enumerate(a.shape): - if i == axis: - fancy_index.append(indices) - else: - ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) - fancy_index.append(cupy.arange(n).reshape(ind_shape)) - a[fancy_index] = value - - if __name__ == "__main__": try: plac.call(main) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 6f8671a..a2d0b3c 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -210,7 +210,8 @@ def most_similar( key_row = self.vectors.find(key=key) sim_keys = self.vectors.find(rows=indices[key_row][:n_similar]) sim_scores = scores[key_row][:n_similar] - return [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)] + result = [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)] + return [(key, score) for key, score in result if key not in keys] vecs = numpy.vstack([self[key] for key in keys]) average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( From 2d106ebc8e9dffff3ccce87a590466f389072a35 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:42:34 +0100 Subject: [PATCH 222/297] Add cache tests --- tests/data/cache | Bin 270 -> 270 bytes tests/test_model.py | 25 ++++++++++++++++++++++--- 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/tests/data/cache b/tests/data/cache index f801136664af927ac1d71035c743384bcceeb9a3..7325e916dadd8f1c6169de4f0f8948e1120ea057 100644 GIT binary patch literal 270 zcmZo&o|%`DnVeeOdW0!2> zB!vM8n1GlWh*^M`5sE=tkCCL)FmHlh~gh9ZX73_xPrE&I&Mfnshz)_h}-n5adc*?yDx#!HJ!5{pV07N_Q= Our5n3Ey+(yV+8=$n?Mo( literal 270 zcmZo&o|%`DnVeeOdW0!2> zB!z{6fq@x_nShuPh(UnbV#|t?^NY|Om1cy~QAQAFkU}mkE=epZVOX4+m%_R%xwIrd HEsYfbS&K1H diff --git a/tests/test_model.py b/tests/test_model.py index 5f54748..2b5d48a 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -1,6 +1,7 @@ import pytest from pathlib import Path from sense2vec import Sense2Vec +import numpy @pytest.fixture @@ -10,10 +11,28 @@ def s2v(): def test_model_most_similar(s2v): + s2v.cache = None assert "beekeepers|NOUN" in s2v - result = s2v.most_similar(["beekeepers|NOUN"], n=2) - assert result[0][0] == "honey_bees|NOUN" - assert result[1][0] == "Beekeepers|NOUN" + ((key1, _), (key2, _)) = s2v.most_similar(["beekeepers|NOUN"], n=2) + assert key1 == "honey_bees|NOUN" + assert key2 == "Beekeepers|NOUN" + + +def test_model_most_similar_cache(s2v): + query = "beekeepers|NOUN" + assert s2v.cache + assert query in s2v + # Modify cache to test that the cache is used and values aren't computed + query_row = s2v.vectors.find(key=s2v.ensure_int_key(query)) + scores = numpy.array(s2v.cache["scores"], copy=True) # otherwise not writable + scores[query_row, 1] = 2.0 + scores[query_row, 2] = 3.0 + s2v.cache["scores"] = scores + ((key1, score1), (key2, score2)) = s2v.most_similar([query], n=2) + assert key1 == "honey_bees|NOUN" + assert score1 == 2.0 + assert key2 == "Beekeepers|NOUN" + assert score2 == 3.0 def test_model_other_senses(s2v): From ce0b233b4c5918629417a1ffa89d30e502ba05b9 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:47:57 +0100 
Subject: [PATCH 223/297] Update README.md [ci skip] --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fa3cc75..db4cbe7 100644 --- a/README.md +++ b/README.md @@ -417,7 +417,9 @@ assert s2v.similarity("machine_learning|NOUN", "machine_learning|NOUN") == 1.0 #### method `Sense2Vec.most_similar` Get the most similar entries in the table. If more than one key is provided, the -average of the vectors is used. +average of the vectors is used. To make this method faster, see the +[script for precomputing a cache](scripts/06_precompute_cache.py) of the nearest +neighbors. | Argument | Type | Description | | ------------ | ------------------------- | ------------------------------------------------------- | From a69d2622e40564cc86f570fbe28fb0d0da26c9c8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:51:23 +0100 Subject: [PATCH 224/297] Update README.md [ci skip] --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index db4cbe7..c7187aa 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,8 @@ models. - spaCy **pipeline component** and **extension attributes**. - Fully **serializable** so you can easily ship your sense2vec vectors with your spaCy model packages. +- Optional **caching of nearest neighbors** for super fast "most similar" + queries. - **Train your own vectors** using a pretrained spaCy model, raw text and [GloVe](https://github.com/stanfordnlp/GloVe) or Word2Vec via [fastText](https://github.com/facebookresearch/fastText) From f9a246552ab4fe4e21f08b8a2774273f3241b05b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:51:48 +0100 Subject: [PATCH 225/297] Increment version [ci skip] --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 2a11bf0..b1773bf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a8 +version = 1.0.0a9 description = Use NLP to go beyond vanilla word2vec url = https://github.com/explosion/sense2vec author = Explosion From 9e9a8d38c9bc09842304d97b372a76b7b230f40f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:55:12 +0100 Subject: [PATCH 226/297] Fix types [ci skip] --- sense2vec/sense2vec.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index a2d0b3c..5dec235 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,4 +1,4 @@ -from typing import Tuple, List, Union, Sequence, Dict, Callable +from typing import Tuple, List, Union, Sequence, Dict, Callable, Any from pathlib import Path from spacy.vectors import Vectors from spacy.strings import StringStore @@ -33,7 +33,11 @@ def __init__( self.strings = StringStore() if strings is None else strings self.freqs: Dict[int, int] = {} self.cache = None - self.cfg = {"senses": senses, "make_key": "default", "split_key": "default"} + self.cfg: Dict[str, Any] = { + "senses": senses, + "make_key": "default", + "split_key": "default", + } self.cfg.update(overrides) @property From b8c440bbb0d28be043a5992916b4019b826730db Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 02:57:10 +0100 Subject: [PATCH 227/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c7187aa..9f29085 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ streamlit run 
https://raw.githubusercontent.com/explosion/sense2vec/master/scrip sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a8 +pip install sense2vec==1.0.0a9 ``` The Reddit vectors model is attached to From 1e6759685f033cb1e0ece11d20ed709422422211 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 03:00:53 +0100 Subject: [PATCH 228/297] Update README.md [ci skip] --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 152672f..a893aa9 100644 --- a/README.md +++ b/README.md @@ -96,10 +96,10 @@ streamlit run https://raw.githubusercontent.com/explosion/sense2vec/master/scrip ### Pretrained vectors -To use the vectors, download the `.tar.gz` archive and pass the extracted -directory to `Sense2Vec.from_disk` or `Sense2VecComponent.from_disk`. The vector -files are **attached to the GitHub release**. Large files have been split into -multi-part downloads. +To use the vectors, download the archive(s) and pass the extracted directory to +`Sense2Vec.from_disk` or `Sense2VecComponent.from_disk`. The vector files are +**attached to the GitHub release**. Large files have been split into multi-part +downloads. | Vectors | Size | Description | 📥 Download (zipped) | | -------------------- | -----: | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | From 8bf0b457f436e4c564dd563254b8dcd5838f1fdb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 03:05:07 +0100 Subject: [PATCH 229/297] Update tagline [ci skip] --- README.md | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9f29085..3accae3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# sense2vec: Use NLP to go beyond vanilla word2vec +# sense2vec: Contextually-keyed word vectors sense2vec ([Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn diff --git a/setup.cfg b/setup.cfg index b1773bf..169bdec 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] version = 1.0.0a9 -description = Use NLP to go beyond vanilla word2vec +description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion author_email = contact@explosion.ai From 31814e5c378dadd8ed65d71fd8e65d2a5fc2f245 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 03:06:15 +0100 Subject: [PATCH 230/297] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0e2ebdb..02bdf6a 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,9 @@ sense2vec ([Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn more interesting and detailed word vectors. For an interactive example of the technology, see our [sense2vec demo](https://demos.explosion.ai/sense2vec) that -lets you explore semantic similarities across all Reddit comments of 2015. This -library is a simple Python implementation for loading and querying sense2vec -models. +lets you explore semantic similarities across all Reddit comments of 2015 +and 2019. 
This library is a simple Python implementation for loading and +querying sense2vec models. 🦆 **Version 1.0 out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) From af9921a7f86c4fba2f5cb2107990707a39a64a62 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 11:19:43 +0100 Subject: [PATCH 231/297] Simplify precompute cache script --- scripts/06_precompute_cache.py | 55 +++------------------------------- 1 file changed, 4 insertions(+), 51 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index d1e19ea..63178df 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -33,7 +33,6 @@ def main( import cupy as xp import cupy.cuda.device - cupy.take_along_axis = take_along_axis device = cupy.cuda.device.Device(gpu_id) device.use() vectors_dir = Path(vectors) @@ -82,12 +81,10 @@ def main( neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:] # Can't figure out how to do this without the loop. for j in range(min(end - i, size)): - best_rows[i + j] = neighbors[j] - scores[i + j] = neighbor_sims[j] - # Sort in reverse order - indices = xp.argsort(scores, axis=1)[:, ::-1] - scores = xp.take_along_axis(scores, indices, axis=1) - best_rows = xp.take_along_axis(best_rows, indices, axis=1) + # Sort in reverse order + indices = xp.argsort(neighbor_sims[j], axis=-1)[::-1] + best_rows[i + j] = xp.take(neighbors[j], indices) + scores[i + j] = xp.take(neighbor_sims[j], indices) msg.info("Saving output") if not isinstance(best_rows, numpy.ndarray): best_rows = best_rows.get() @@ -106,50 +103,6 @@ def main( msg.good(f"Saved cache to {output_file}") -# These functions are missing from cupy, but will be supported in cupy 7. -def take_along_axis(a, indices, axis): - """Take values from the input array by matching 1d index and data slices. - - Args: - a (cupy.ndarray): Array to extract elements. - indices (cupy.ndarray): Indices to take along each 1d slice of ``a``. - axis (int): The axis to take 1d slices along. - - Returns: - cupy.ndarray: The indexed result. - - .. seealso:: :func:`numpy.take_along_axis` - """ - import cupy - - if indices.dtype.kind not in ("i", "u"): - raise IndexError("`indices` must be an integer array") - - if axis is None: - a = a.ravel() - axis = 0 - - ndim = a.ndim - - if not (-ndim <= axis < ndim): - raise IndexError("Axis overrun") - - axis %= a.ndim - - if ndim != indices.ndim: - raise ValueError("`indices` and `a` must have the same number of dimensions") - - fancy_index = [] - for i, n in enumerate(a.shape): - if i == axis: - fancy_index.append(indices) - else: - ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) - fancy_index.append(cupy.arange(n).reshape(ind_shape)) - - return a[fancy_index] - - if __name__ == "__main__": try: plac.call(main) From 1fc5113efd0660182f7ea50c714bdc0c71e934f3 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 11:43:45 +0100 Subject: [PATCH 232/297] Update README.md --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 02bdf6a..1a47fd1 100644 --- a/README.md +++ b/README.md @@ -4,11 +4,12 @@ sense2vec ([Trask et. al](https://arxiv.org/abs/1511.06388), 2015) is a nice twist on [word2vec](https://en.wikipedia.org/wiki/Word2vec) that lets you learn -more interesting and detailed word vectors. 
For an interactive example of the -technology, see our [sense2vec demo](https://demos.explosion.ai/sense2vec) that -lets you explore semantic similarities across all Reddit comments of 2015 -and 2019. This library is a simple Python implementation for loading and -querying sense2vec models. +more interesting and detailed word vectors. This library is a simple Python +implementation for loading, querying and training sense2vec models. For more +details, check out +[our blog post](https://explosion.ai/blog/sense2vec-reloaded). To explore the +semantic similarities across all Reddit comments of 2015 and 2019, see the +[interactive demo](https://demos.explosion.ai/sense2vec). 🦆 **Version 1.0 out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) @@ -20,7 +21,7 @@ querying sense2vec models. ## ✨ Features -![](https://user-images.githubusercontent.com/13643239/68089415-db407800-fe68-11e9-9c45-47338dea49a9.jpg) +![](https://user-images.githubusercontent.com/13643239/69330759-d3981600-0c53-11ea-8f64-e5c075f7ea10.jpg) - Query **vectors for multi-word phrases** based on part-of-speech tags and entity labels. From 83129a1d753d1b3e9263a209b16d611725421beb Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 11:44:05 +0100 Subject: [PATCH 233/297] Update setup.cfg --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 169bdec..ba2399a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a9 +version = 1.0.0 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 5b0d2b9bdd6dc9a96daea2f95554a31dda024c34 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 14:36:11 +0100 Subject: [PATCH 234/297] Revert "Simplify precompute cache script" This reverts commit af9921a7f86c4fba2f5cb2107990707a39a64a62. --- scripts/06_precompute_cache.py | 55 +++++++++++++++++++++++++++++++--- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 63178df..d1e19ea 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -33,6 +33,7 @@ def main( import cupy as xp import cupy.cuda.device + cupy.take_along_axis = take_along_axis device = cupy.cuda.device.Device(gpu_id) device.use() vectors_dir = Path(vectors) @@ -81,10 +82,12 @@ def main( neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:] # Can't figure out how to do this without the loop. for j in range(min(end - i, size)): - # Sort in reverse order - indices = xp.argsort(neighbor_sims[j], axis=-1)[::-1] - best_rows[i + j] = xp.take(neighbors[j], indices) - scores[i + j] = xp.take(neighbor_sims[j], indices) + best_rows[i + j] = neighbors[j] + scores[i + j] = neighbor_sims[j] + # Sort in reverse order + indices = xp.argsort(scores, axis=1)[:, ::-1] + scores = xp.take_along_axis(scores, indices, axis=1) + best_rows = xp.take_along_axis(best_rows, indices, axis=1) msg.info("Saving output") if not isinstance(best_rows, numpy.ndarray): best_rows = best_rows.get() @@ -103,6 +106,50 @@ def main( msg.good(f"Saved cache to {output_file}") +# These functions are missing from cupy, but will be supported in cupy 7. +def take_along_axis(a, indices, axis): + """Take values from the input array by matching 1d index and data slices. + + Args: + a (cupy.ndarray): Array to extract elements. + indices (cupy.ndarray): Indices to take along each 1d slice of ``a``. 
+ axis (int): The axis to take 1d slices along. + + Returns: + cupy.ndarray: The indexed result. + + .. seealso:: :func:`numpy.take_along_axis` + """ + import cupy + + if indices.dtype.kind not in ("i", "u"): + raise IndexError("`indices` must be an integer array") + + if axis is None: + a = a.ravel() + axis = 0 + + ndim = a.ndim + + if not (-ndim <= axis < ndim): + raise IndexError("Axis overrun") + + axis %= a.ndim + + if ndim != indices.ndim: + raise ValueError("`indices` and `a` must have the same number of dimensions") + + fancy_index = [] + for i, n in enumerate(a.shape): + if i == axis: + fancy_index.append(indices) + else: + ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) + fancy_index.append(cupy.arange(n).reshape(ind_shape)) + + return a[fancy_index] + + if __name__ == "__main__": try: plac.call(main) From a2f42829936eccf4edaddaf7dc98295f3e2e3ae9 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 15:55:04 +0100 Subject: [PATCH 235/297] Fix cache precompute script --- scripts/06_precompute_cache.py | 79 ++++++++++++++++++++++------------ 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index d1e19ea..c5a6ae3 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -34,6 +34,7 @@ def main( import cupy.cuda.device cupy.take_along_axis = take_along_axis + cupy.put_along_axis = put_along_axis device = cupy.cuda.device.Device(gpu_id) device.use() vectors_dir = Path(vectors) @@ -57,42 +58,34 @@ def main( msg.good(f"Normalized (mean {mean:,.2f}, variance {var:,.2f})") msg.info(f"Finding {n_neighbors:,} neighbors among {cutoff:,} most frequent") n = min(n_neighbors, vectors.shape[0]) + subset = vectors[:cutoff] best_rows = xp.zeros((end - start, n), dtype="i") scores = xp.zeros((end - start, n), dtype="f") - # Pre-allocate this array, so we can use it each time. - subset = xp.ascontiguousarray(vectors[:cutoff]) - sims = xp.zeros((batch_size, cutoff), dtype="f") - indices = xp.arange(cutoff).reshape((-1, 1)) for i in tqdm.tqdm(list(range(start, end, batch_size))): - batch = vectors[i : i + batch_size] - # batch e.g. (1024, 300) - # vectors e.g. (10000, 300) - # sims e.g. (1024, 10000) - if batch.shape[0] == sims.shape[0]: - xp.dot(batch, subset.T, out=sims) - else: - # In the last batch we'll have a different size. - sims = xp.dot(batch, subset.T) - size = sims.shape[0] - # Get the indices and scores for the top N most similar for each in the - # batch. This is a bit complicated, to avoid sorting all of the scores - # -- we only want the top N to be sorted (which we do later). For now, - # we use argpartition to just get the cut point. - neighbors = xp.argpartition(sims, -n, axis=1)[:, -n:] - neighbor_sims = xp.partition(sims, -n, axis=1)[:, -n:] - # Can't figure out how to do this without the loop. - for j in range(min(end - i, size)): - best_rows[i + j] = neighbors[j] - scores[i + j] = neighbor_sims[j] - # Sort in reverse order - indices = xp.argsort(scores, axis=1)[:, ::-1] - scores = xp.take_along_axis(scores, indices, axis=1) - best_rows = xp.take_along_axis(best_rows, indices, axis=1) + size = min(batch_size, end - i) + batch = vectors[i : i + size] + sims = xp.dot(batch, subset.T) + # Set self-similarities to -inf, so that we don't return them. 
+ indices = xp.arange(i, min(i+size, sims.shape[1])).reshape((1, -1)) + xp.put_along_axis(sims, indices, -xp.inf, axis=1) + # This used to use argpartition, to do a partial sort...But this ended + # up being a ratsnest of terrible numpy crap. Just sorting the whole + # list isn't really slower, and it's much simpler to read. + ranks = xp.argsort(sims, axis=1) + batch_rows = ranks[:, -n:] + # Reverse + batch_rows = batch_rows[:, ::-1] + batch_scores = xp.take_along_axis(sims, batch_rows, axis=1) + best_rows[i:i+size] = batch_rows + scores[i:i+size] = batch_scores msg.info("Saving output") if not isinstance(best_rows, numpy.ndarray): best_rows = best_rows.get() if not isinstance(scores, numpy.ndarray): scores = scores.get() + #for row in range(best_rows.shape[0]): + # assert best_rows[row, 0] == row + # assert abs(scores[row, 0] - 1.0) < 1e-2, scores[row] output = { "indices": best_rows, "scores": scores.astype("float16"), @@ -149,6 +142,36 @@ def take_along_axis(a, indices, axis): return a[fancy_index] +def put_along_axis(a, indices, value, axis): + import cupy + + if indices.dtype.kind not in ("i", "u"): + raise IndexError("`indices` must be an integer array") + + if axis is None: + a = a.ravel() + axis = 0 + + ndim = a.ndim + + if not (-ndim <= axis < ndim): + raise IndexError("Axis overrun") + + axis %= a.ndim + + if ndim != indices.ndim: + raise ValueError("`indices` and `a` must have the same number of dimensions") + + fancy_index = [] + for i, n in enumerate(a.shape): + if i == axis: + fancy_index.append(indices) + else: + ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) + fancy_index.append(cupy.arange(n).reshape(ind_shape)) + + a[fancy_index] = value + if __name__ == "__main__": try: From f35a041059208b124e6db3a0ebabec2bee2397a3 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 15:55:17 +0100 Subject: [PATCH 236/297] Fix cache usage, by remembering row2key map --- sense2vec/sense2vec.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 5dec235..2f0099b 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -30,6 +30,7 @@ def __init__( RETURNS (Sense2Vec): The newly constructed object. """ self.vectors = Vectors(shape=shape, name=vectors_name) + self._row2key = None self.strings = StringStore() if strings is None else strings self.freqs: Dict[int, int] = {} self.cache = None @@ -87,6 +88,7 @@ def __setitem__(self, key: Union[str, int], vector: numpy.ndarray): if key not in self.vectors: raise ValueError(f"Can't find key {key} in table") self.vectors[key] = vector + self._row2key = None def __iter__(self): """YIELDS (tuple): String key and vector pairs in the table.""" @@ -106,6 +108,11 @@ def values(self): """YIELDS (numpy.ndarray): The vectors in the table.""" yield from self.vectors.values() + def row2key(self): + if not self._row2key: + self._row2key = {row: key for row, key in self.vectors.key2row.items()} + return self._row2key + @property def make_key(self) -> Callable: """Get the function to make keys.""" @@ -128,6 +135,7 @@ def add(self, key: Union[str, int], vector: numpy.ndarray, freq: int = None): self.vectors.add(key, vector=vector) if freq is not None: self.set_freq(key, freq) + self._row2key = None def get_freq(self, key: Union[str, int], default=None) -> Union[int, None]: """Get the frequency count for a given key. 
@@ -207,15 +215,15 @@ def most_similar( raise ValueError(f"Can't find key {key} in table") if len(self.vectors) < n_similar: n_similar = len(self.vectors) - if self.cache: - indices = self.cache.get("indices", []) - scores = self.cache.get("scores", []) - if len(indices) >= n_similar: - key_row = self.vectors.find(key=key) - sim_keys = self.vectors.find(rows=indices[key_row][:n_similar]) - sim_scores = scores[key_row][:n_similar] - result = [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)] - return [(key, score) for key, score in result if key not in keys] + if self.cache and self.cache["indices"].shape[1] >= n_similar: + key = self.ensure_int_key(key) + key_row = self.vectors.find(key=key) + rows = self.cache["indices"][key_row, :n_similar] + scores = self.cache["indices"][key_row, :n_similar] + keys = [self.row2key[k] for k in rows] + result = [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)] + if k != key] + return result vecs = numpy.vstack([self[key] for key in keys]) average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( From 3926c2d490707682b84677416bfc3386a9e4b73a Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 15:57:56 +0100 Subject: [PATCH 237/297] Fix syntax --- sense2vec/sense2vec.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 2f0099b..8a9da2d 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -220,10 +220,9 @@ def most_similar( key_row = self.vectors.find(key=key) rows = self.cache["indices"][key_row, :n_similar] scores = self.cache["indices"][key_row, :n_similar] - keys = [self.row2key[k] for k in rows] - result = [(self.strings[k], s) for k, s in zip(sim_keys, sim_scores)] - if k != key] - return result + keys = [self.strings[self.row2key[k]] for k in rows] + assert len(keys) == len(scores) + return list(zip(keys, scores)) vecs = numpy.vstack([self[key] for key in keys]) average = vecs.mean(axis=0, keepdims=True) result_keys, _, scores = self.vectors.most_similar( From 026869fcca512b174c36118b09bb76467d6157e9 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 15:58:02 +0100 Subject: [PATCH 238/297] Format --- scripts/06_precompute_cache.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index c5a6ae3..7393d8f 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -66,7 +66,7 @@ def main( batch = vectors[i : i + size] sims = xp.dot(batch, subset.T) # Set self-similarities to -inf, so that we don't return them. - indices = xp.arange(i, min(i+size, sims.shape[1])).reshape((1, -1)) + indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((1, -1)) xp.put_along_axis(sims, indices, -xp.inf, axis=1) # This used to use argpartition, to do a partial sort...But this ended # up being a ratsnest of terrible numpy crap. 
Just sorting the whole @@ -76,14 +76,14 @@ def main( # Reverse batch_rows = batch_rows[:, ::-1] batch_scores = xp.take_along_axis(sims, batch_rows, axis=1) - best_rows[i:i+size] = batch_rows - scores[i:i+size] = batch_scores + best_rows[i : i + size] = batch_rows + scores[i : i + size] = batch_scores msg.info("Saving output") if not isinstance(best_rows, numpy.ndarray): best_rows = best_rows.get() if not isinstance(scores, numpy.ndarray): scores = scores.get() - #for row in range(best_rows.shape[0]): + # for row in range(best_rows.shape[0]): # assert best_rows[row, 0] == row # assert abs(scores[row, 0] - 1.0) < 1e-2, scores[row] output = { @@ -142,6 +142,7 @@ def take_along_axis(a, indices, axis): return a[fancy_index] + def put_along_axis(a, indices, value, axis): import cupy From 5a27708f6a0fe588faf5a4a02ae586942bcef057 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 16:11:56 +0100 Subject: [PATCH 239/297] Fix sense2vec --- sense2vec/sense2vec.py | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 8a9da2d..5a4c133 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -108,9 +108,10 @@ def values(self): """YIELDS (numpy.ndarray): The vectors in the table.""" yield from self.vectors.values() + @property def row2key(self): if not self._row2key: - self._row2key = {row: key for row, key in self.vectors.key2row.items()} + self._row2key = {row: key for key, row in self.vectors.key2row.items()} return self._row2key @property @@ -208,30 +209,32 @@ def most_similar( """ if isinstance(keys, (str, int)): keys = [keys] - # Always ask for more because we'll always get the keys themselves - n_similar = n + len(keys) for key in keys: if key not in self: raise ValueError(f"Can't find key {key} in table") - if len(self.vectors) < n_similar: - n_similar = len(self.vectors) - if self.cache and self.cache["indices"].shape[1] >= n_similar: + if self.cache and self.cache["indices"].shape[1] >= n: + n = min(len(self.vectors), n) key = self.ensure_int_key(key) key_row = self.vectors.find(key=key) - rows = self.cache["indices"][key_row, :n_similar] - scores = self.cache["indices"][key_row, :n_similar] - keys = [self.strings[self.row2key[k]] for k in rows] + rows = self.cache["indices"][key_row, :n] + scores = self.cache["indices"][key_row, :n] + keys = [self.row2key[r] for r in rows] + keys = [self.strings[k] for k in keys] assert len(keys) == len(scores) return list(zip(keys, scores)) - vecs = numpy.vstack([self[key] for key in keys]) - average = vecs.mean(axis=0, keepdims=True) - result_keys, _, scores = self.vectors.most_similar( - average, n=n_similar, batch_size=batch_size - ) - result = list(zip(result_keys.flatten(), scores.flatten())) - result = [(self.strings[key], score) for key, score in result if key] - result = [(key, score) for key, score in result if key not in keys] - return result + else: + # Always ask for more because we'll always get the keys themselves + n = min(len(self.vectors), n + len(keys)) + rows = numpy.asarray(self.vectors.find(keys=keys)) + vecs = self.vectors.data[rows] + average = vecs.mean(axis=0, keepdims=True) + result_keys, _, scores = self.vectors.most_similar( + average, n=n, batch_size=batch_size + ) + result = list(zip(result_keys.flatten(), scores.flatten())) + result = [(self.strings[key], score) for key, score in result if key] + result = [(key, score) for key, score in result if key not in keys] + return result def 
get_other_senses( self, key: Union[str, int], ignore_case: bool = True @@ -309,6 +312,7 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): self.strings = StringStore().from_bytes(data["strings"]) if "cache" not in exclude and "cache" in data: self.cache = data.get("cache", {}) + self._row2key = None return self def to_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): @@ -345,4 +349,5 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): self.strings = StringStore().from_disk(strings_path) if "cache" not in exclude and cache_path.exists(): self.cache = srsly.read_msgpack(cache_path) + self._row2key = None return self From c336dc4cdf1abfb5b9edc217334efc257999813a Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 16:19:04 +0100 Subject: [PATCH 240/297] Clarify test slightly --- tests/test_model.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 2b5d48a..7dcc478 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -25,9 +25,13 @@ def test_model_most_similar_cache(s2v): # Modify cache to test that the cache is used and values aren't computed query_row = s2v.vectors.find(key=s2v.ensure_int_key(query)) scores = numpy.array(s2v.cache["scores"], copy=True) # otherwise not writable - scores[query_row, 1] = 2.0 - scores[query_row, 2] = 3.0 + honey_bees_row = s2v.vectors.find(key="honey_bees|NOUN") + scores[query_row, honey_bees_row] = 2.0 + beekeepers_row = s2v.vectors.find(key="Beekepers|NOUN") + scores[query_row, 1] = 3.0 s2v.cache["scores"] = scores + print(scores) + print(s2v.cache["indices"]) ((key1, score1), (key2, score2)) = s2v.most_similar([query], n=2) assert key1 == "honey_bees|NOUN" assert score1 == 2.0 From 862023ae0963c79c860a615babf8452094e5dd60 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 16:19:14 +0100 Subject: [PATCH 241/297] Update cache in test --- tests/data/cache | Bin 270 -> 270 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/tests/data/cache b/tests/data/cache index 7325e916dadd8f1c6169de4f0f8948e1120ea057..658d70456861ed0defc7888c63ba3aa33e267602 100644 GIT binary patch literal 270 zcmZo&o|%`DnVeeOdW0!2> zB!!8Afq@x_S%8=kh(Q3v28n|JNDd~BtPUm*qG5VrYGHCP^~;Kr^NY|Om1cy~QAQC~ zZdUtEL<|}Jh+32ziWpuqVE7YZwcWDMtQ;uj24u}Q28oGU^qK89nQzSSXK8UsVo?dh R;?%qp)@8}1CHZM-tN?>HL(%{M literal 270 zcmZo&o|%`DnVeeOdW0!2> zB!vM8n1GlWh*^M`5sE=tkCCL)FmHlh~gh9ZX73_xPrE&I&Mfnshz)_h}-n5adc*?yDx#!HJ!5{pV07N_Q= Our5n3Ey+(yV+8=$n?Mo( From a1edcbfaae7290e6a743f8feafdd803ab0d2ca1c Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 16:19:26 +0100 Subject: [PATCH 242/297] Fix precompute cache --- scripts/06_precompute_cache.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 7393d8f..62fa6bb 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -66,7 +66,7 @@ def main( batch = vectors[i : i + size] sims = xp.dot(batch, subset.T) # Set self-similarities to -inf, so that we don't return them. - indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((1, -1)) + indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((-1, 1)) xp.put_along_axis(sims, indices, -xp.inf, axis=1) # This used to use argpartition, to do a partial sort...But this ended # up being a ratsnest of terrible numpy crap. 
Just sorting the whole From d21c29e5101eb557e3eabf97a2bea3c0d287053f Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 16:22:14 +0100 Subject: [PATCH 243/297] Remove commented code --- scripts/06_precompute_cache.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 62fa6bb..6e6c97b 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -83,9 +83,6 @@ def main( best_rows = best_rows.get() if not isinstance(scores, numpy.ndarray): scores = scores.get() - # for row in range(best_rows.shape[0]): - # assert best_rows[row, 0] == row - # assert abs(scores[row, 0] - 1.0) < 1e-2, scores[row] output = { "indices": best_rows, "scores": scores.astype("float16"), From b274eaaedb8b0bfec110fe02bf46549402b690e5 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 21 Nov 2019 16:23:18 +0100 Subject: [PATCH 244/297] Fix test --- tests/test_model.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 7dcc478..830fe5f 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -28,10 +28,8 @@ def test_model_most_similar_cache(s2v): honey_bees_row = s2v.vectors.find(key="honey_bees|NOUN") scores[query_row, honey_bees_row] = 2.0 beekeepers_row = s2v.vectors.find(key="Beekepers|NOUN") - scores[query_row, 1] = 3.0 + scores[query_row, beekeepers_row] = 3.0 s2v.cache["scores"] = scores - print(scores) - print(s2v.cache["indices"]) ((key1, score1), (key2, score2)) = s2v.most_similar([query], n=2) assert key1 == "honey_bees|NOUN" assert score1 == 2.0 From da6b2ce97c0d978bdae224e05ba2f01baef402b8 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 16:38:24 +0100 Subject: [PATCH 245/297] Update README.md --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 1a47fd1..66eeac1 100644 --- a/README.md +++ b/README.md @@ -722,6 +722,10 @@ This package also seamlessly integrates with the [Prodigy](https://prodi.gy) annotation tool and exposes recipes for using sense2vec vectors to quickly generate lists of multi-word phrases and bootstrap NER annotations. To use a recipe, `sense2vec` needs to be installed in the same environment as Prodigy. +For an example of a real-world use case, check out this +[NER project](https://github.com/explosion/projects/tree/master/ner-fashion-brands) +with downloadable datasets. + The following recipes are available – see below for more detailed docs. 
| Recipe | Description | From bc52035b993f6c6c8794c9a23ab61f2c24fbf062 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 16:39:27 +0100 Subject: [PATCH 246/297] Update spaCy pin --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index ba2399a..54c68f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,7 +27,7 @@ zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = - spacy>=2.2.2,<3.0.0 + spacy>=2.2.3,<3.0.0 srsly>=0.2.0 catalogue>=0.0.4 wasabi>=0.4.0,<1.1.0 From 9c06eb8dd1a0dc3097fa3f73f7eef0510ab50624 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 18:01:14 +0100 Subject: [PATCH 247/297] Update to spaCy dev version --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a7827ff..b9228b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our packages -spacy>=2.2.2,<3.0.0 +spacy==2.2.3.dev0 srsly>=0.2.0 catalogue>=0.0.4 # Third-party dependencies diff --git a/setup.cfg b/setup.cfg index 54c68f4..8f09062 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,7 +27,7 @@ zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = - spacy>=2.2.3,<3.0.0 + spacy==2.2.3.dev0 srsly>=0.2.0 catalogue>=0.0.4 wasabi>=0.4.0,<1.1.0 From 9402bcb8f03c1e38a727a19f8fc1d0ee40b36766 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 19:22:58 +0100 Subject: [PATCH 248/297] Update README.md --- README.md | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 66eeac1..ef08789 100644 --- a/README.md +++ b/README.md @@ -102,10 +102,16 @@ To use the vectors, download the archive(s) and pass the extracted directory to **attached to the GitHub release**. Large files have been split into multi-part downloads. 
-| Vectors | Size | Description | 📥 Download (zipped) | -| -------------------- | -----: | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `s2v_reddit_2019_lg` | 4 GB | Reddit comments 2019 (01-07) | [part 1](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.zip), [part 2](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.z01), [part 3](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.z02) | -| `s2v_reddit_2015_md` | 573 MB | Reddit comments 2015 | [part 1](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz) | +| Vectors | Size | Description | 📥 Download (zipped) | +| -------------------- | -----: | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `s2v_reddit_2019_lg` | 4 GB | Reddit comments 2019 (01-07) | [part 1](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.tar.gz.001), [part 2](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.tar.gz.002), [part 3](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2019_lg.tar.gz.003) | +| `s2v_reddit_2015_md` | 573 MB | Reddit comments 2015 | [part 1](https://github.com/explosion/sense2vec/releases/download/v1.0.0/s2v_reddit_2015_md.tar.gz) | + +To merge the multi-part archives, you can run the following: + +```bash +cat s2v_reddit_2019_lg.tar.gz.* > s2v_reddit_2019_lg.tar.gz +``` ## ⏳ Installation & Setup From 78cbbf61d3bd581cf0b73bde352ba9f4ebfc4077 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 19:42:32 +0100 Subject: [PATCH 249/297] Update spaCy pin --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index b9228b9..9103469 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our packages -spacy==2.2.3.dev0 +spacy>=2.2.3,<3.0.0 srsly>=0.2.0 catalogue>=0.0.4 # Third-party dependencies diff --git a/setup.cfg b/setup.cfg index 8f09062..54c68f4 100644 --- a/setup.cfg +++ b/setup.cfg @@ -27,7 +27,7 @@ zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = - spacy==2.2.3.dev0 + spacy>=2.2.3,<3.0.0 srsly>=0.2.0 catalogue>=0.0.4 wasabi>=0.4.0,<1.1.0 From 01a700cf1c69beb0c6b59f3d0653168188421621 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 21 Nov 2019 20:59:50 +0100 Subject: [PATCH 250/297] Increment version [ci skip] --- README.md | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 3accae3..b5afd88 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ streamlit run https://raw.githubusercontent.com/explosion/sense2vec/master/scrip sense2vec releases are available on pip: ```bash -pip install sense2vec==1.0.0a9 +pip install sense2vec==1.0.0a10 ``` The Reddit vectors model is attached to diff --git a/setup.cfg b/setup.cfg 
index 169bdec..6756173 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0a9 +version = 1.0.0a10 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 9f1157583c63da86b500458dcbb62b0a5f1f92ec Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Fri, 22 Nov 2019 16:33:31 +0100 Subject: [PATCH 251/297] Fix cache --- sense2vec/sense2vec.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 5a4c133..4f0e43b 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -217,7 +217,7 @@ def most_similar( key = self.ensure_int_key(key) key_row = self.vectors.find(key=key) rows = self.cache["indices"][key_row, :n] - scores = self.cache["indices"][key_row, :n] + scores = self.cache["scores"][key_row, :n] keys = [self.row2key[r] for r in rows] keys = [self.strings[k] for k in keys] assert len(keys) == len(scores) From 521ea401e0e4af2215a84a9a0a9e4330a5a5a4aa Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Nov 2019 16:34:56 +0100 Subject: [PATCH 252/297] Increment version [ci skip] --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 54c68f4..4b08230 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.0 +version = 1.0.1 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From a7680d242310b4b4bff2464881899df7131fa766 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Nov 2019 16:42:59 +0100 Subject: [PATCH 253/297] Update README.md [ci skip] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ef08789..761b237 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ semantic similarities across all Reddit comments of 2015 and 2019, see the [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) -[![Current Release Version](https://img.shields.io/github/v/release/explosion/sense2vec.svg?style=flat-square&include_prereleases&logo=github)](https://github.com/explosion/sense2vec/releases) +[![Current Release Version](https://img.shields.io/github/v/release/explosion/sense2vec.svg?style=flat-square&logo=github)](https://github.com/explosion/sense2vec/releases) [![pypi Version](https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/sense2vec/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) From 274d917cbe475e6fc0ca824d5fdd3bd58ba62998 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Fri, 22 Nov 2019 16:47:17 +0100 Subject: [PATCH 254/297] Fix cache test --- tests/test_model.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tests/test_model.py b/tests/test_model.py index 830fe5f..45e29ea 100644 --- a/tests/test_model.py +++ b/tests/test_model.py @@ -22,13 +22,18 @@ def test_model_most_similar_cache(s2v): query = "beekeepers|NOUN" assert s2v.cache assert query in s2v + indices = s2v.cache["indices"] # Modify cache to test that the cache is used and values aren't computed query_row = 
s2v.vectors.find(key=s2v.ensure_int_key(query)) scores = numpy.array(s2v.cache["scores"], copy=True) # otherwise not writable honey_bees_row = s2v.vectors.find(key="honey_bees|NOUN") - scores[query_row, honey_bees_row] = 2.0 - beekeepers_row = s2v.vectors.find(key="Beekepers|NOUN") - scores[query_row, beekeepers_row] = 3.0 + beekeepers_row = s2v.vectors.find(key="Beekeepers|NOUN") + for i in range(indices.shape[0]): + for j in range(indices.shape[1]): + if indices[i, j] == honey_bees_row: + scores[i, j] = 2.0 + elif indices[i, j] == beekeepers_row: + scores[i, j] = 3.0 s2v.cache["scores"] = scores ((key1, score1), (key2, score2)) = s2v.most_similar([query], n=2) assert key1 == "honey_bees|NOUN" From 28f0ed70568e90ea1b4b19f0462e89ca1b3d5ea1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Nov 2019 18:26:28 +0100 Subject: [PATCH 255/297] Add defaults in components if saved out with old cfg --- sense2vec/component.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 6744081..95c37d2 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -89,7 +89,8 @@ def __call__(self, doc: Doc) -> Doc: # Store reference to s2v object on Doc to make sure it's right doc._._s2v = self.s2v if self.merge_phrases: - merge_phrases = registry.merge_phrases.get(doc._._s2v.cfg["merge_phrases"]) + merge_phrases_id = doc._._s2v.cfg.get("merge_phrases", "default") + merge_phrases = registry.merge_phrases.get(merge_phrases_id) doc = merge_phrases(doc) return doc @@ -117,7 +118,7 @@ def get_phrases(self, doc: Doc) -> List[Span]: doc (Doc): The Doc to get phrases from. RETURNS (list): The phrases as a list of Span objects. """ - func = registry.get_phrases.get(doc._._s2v.cfg["get_phrases"]) + func = registry.get_phrases.get(doc._._s2v.cfg.get("get_phrases", "default")) return func(doc) def in_s2v(self, obj: Union[Token, Span]) -> bool: @@ -151,9 +152,8 @@ def s2v_key(self, obj: Union[Token, Span]) -> str: obj (Token / Span): The object to create the key for. RETURNS (unicode): The key. """ - make_spacy_key = registry.make_spacy_key.get( - obj.doc._._s2v.cfg["make_spacy_key"] - ) + make_space_key_id = obj.doc._._s2v.cfg.get("make_spacy_key", "default") + make_spacy_key = registry.make_spacy_key.get(make_space_key_id) if obj.doc._._s2v.cfg.get("lemmatize", False): lemma = make_spacy_key(obj, prefer_ents=self.merge_phrases, lemmatize=True) lemma_key = obj.doc._._s2v.make_key(*lemma) From f9aeb222c534378cbd063c5b85cf5f2fd3a2cdb0 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Nov 2019 18:38:48 +0100 Subject: [PATCH 256/297] Fix serialization / deserialization of string store --- sense2vec/component.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 95c37d2..469ea10 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -55,8 +55,7 @@ def __init__( """ self.first_run = True self.merge_phrases = merge_phrases - strings = vocab.strings if vocab is not None else None - self.s2v = Sense2Vec(shape=shape, strings=strings) + self.s2v = Sense2Vec(shape=shape) cfg = { "make_spacy_key": "default", "get_phrases": "default", @@ -212,7 +211,7 @@ def from_bytes(self, bytes_data: bytes): bytes_data (bytes): The data to load. RETURNS (Sense2VecComponent): The loaded object. 
""" - self.s2v = Sense2Vec().from_bytes(bytes_data, exclude=["strings"]) + self.s2v = Sense2Vec().from_bytes(bytes_data) return self def to_disk(self, path: Union[str, Path]): @@ -228,5 +227,5 @@ def from_disk(self, path: Union[str, Path]): path (unicode / Path): The path to load from. RETURNS (Sense2VecComponent): The loaded object. """ - self.s2v = Sense2Vec().from_disk(path, exclude=["strings"]) + self.s2v = Sense2Vec().from_disk(path) return self From 61eb3a7ec080b36f52352f700e442f2b83b0a565 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Fri, 22 Nov 2019 18:39:07 +0100 Subject: [PATCH 257/297] Increment version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 4b08230..ae54e1a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.1 +version = 1.0.2 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 19d3c3eede633fad723a0705677c9f43375fac90 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Wed, 27 Nov 2019 16:46:49 +0100 Subject: [PATCH 258/297] Update precompute cache --- scripts/06_precompute_cache.py | 37 +++------------------------------- 1 file changed, 3 insertions(+), 34 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 6e6c97b..fb19845 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -34,7 +34,6 @@ def main( import cupy.cuda.device cupy.take_along_axis = take_along_axis - cupy.put_along_axis = put_along_axis device = cupy.cuda.device.Device(gpu_id) device.use() vectors_dir = Path(vectors) @@ -66,8 +65,9 @@ def main( batch = vectors[i : i + size] sims = xp.dot(batch, subset.T) # Set self-similarities to -inf, so that we don't return them. - indices = xp.arange(i, min(i + size, sims.shape[1])).reshape((-1, 1)) - xp.put_along_axis(sims, indices, -xp.inf, axis=1) + for j in range(size): + if i+j < sims.shape[1]: + sims[j, i+j] = -xp.inf # This used to use argpartition, to do a partial sort...But this ended # up being a ratsnest of terrible numpy crap. Just sorting the whole # list isn't really slower, and it's much simpler to read. 
@@ -140,37 +140,6 @@ def take_along_axis(a, indices, axis): return a[fancy_index] -def put_along_axis(a, indices, value, axis): - import cupy - - if indices.dtype.kind not in ("i", "u"): - raise IndexError("`indices` must be an integer array") - - if axis is None: - a = a.ravel() - axis = 0 - - ndim = a.ndim - - if not (-ndim <= axis < ndim): - raise IndexError("Axis overrun") - - axis %= a.ndim - - if ndim != indices.ndim: - raise ValueError("`indices` and `a` must have the same number of dimensions") - - fancy_index = [] - for i, n in enumerate(a.shape): - if i == axis: - fancy_index.append(indices) - else: - ind_shape = (1,) * i + (-1,) + (1,) * (ndim - i - 1) - fancy_index.append(cupy.arange(n).reshape(ind_shape)) - - a[fancy_index] = value - - if __name__ == "__main__": try: plac.call(main) From fca865e2c9928834137420be188819b79bb3251c Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:29:20 +0100 Subject: [PATCH 259/297] Make cosine_similarity util function --- sense2vec/util.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sense2vec/util.py b/sense2vec/util.py index 496a7b4..9c0db4b 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -2,6 +2,7 @@ import re from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans +from thinc.neural.util import get_array_module import catalogue try: @@ -167,6 +168,18 @@ def merge_phrases(doc: Doc) -> Doc: return doc +def cosine_similarity(vec1, vec2) -> float: + """Compute the cosine similarity of two vectors.""" + if vec1.all() == 0 or vec2.all() == 0: + return 0.0 + xp = get_array_module(vec1) + norm1 = xp.linalg.norm(vec1) + norm2 = xp.linalg.norm(vec2) + if norm1 == norm2: + return 1.0 + return xp.dot(vec1, vec2) / (norm1 * norm2) + + class SimpleFrozenDict(dict): """Simplified implementation of a frozen dict, mainly used as default function or method argument (for arguments that should default to empty From d7305b9dd247e14abe210bc29314768c9517f48d Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:29:59 +0100 Subject: [PATCH 260/297] Use cosine_similarity util function --- sense2vec/sense2vec.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 4f0e43b..46ccdbc 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -5,7 +5,7 @@ import numpy import srsly -from .util import registry, SimpleFrozenDict +from .util import registry, cosine_similarity, SimpleFrozenDict class Sense2Vec(object): @@ -185,14 +185,8 @@ def similarity( keys_b = [keys_b] average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0) average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0) - if average_a.all() == 0 or average_b.all() == 0: - return 0.0 - norm_a = numpy.linalg.norm(average_a) - norm_b = numpy.linalg.norm(average_b) - if norm_a == norm_b: - return 1.0 - return numpy.dot(average_a, average_b) / (norm_a * norm_b) - + return cosine_similarity(average_a, average_b) + def most_similar( self, keys: Union[Sequence[Union[str, int]], str, int], From b68c85dd6a8f38a172dd5db98944d05dea8779c8 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:31:27 +0100 Subject: [PATCH 261/297] Fix sense2vec most_similar if cache too small --- sense2vec/sense2vec.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 46ccdbc..af2bca8 100644 --- 
a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -210,25 +210,25 @@ def most_similar( n = min(len(self.vectors), n) key = self.ensure_int_key(key) key_row = self.vectors.find(key=key) - rows = self.cache["indices"][key_row, :n] - scores = self.cache["scores"][key_row, :n] - keys = [self.row2key[r] for r in rows] - keys = [self.strings[k] for k in keys] - assert len(keys) == len(scores) - return list(zip(keys, scores)) - else: - # Always ask for more because we'll always get the keys themselves - n = min(len(self.vectors), n + len(keys)) - rows = numpy.asarray(self.vectors.find(keys=keys)) - vecs = self.vectors.data[rows] - average = vecs.mean(axis=0, keepdims=True) - result_keys, _, scores = self.vectors.most_similar( - average, n=n, batch_size=batch_size - ) - result = list(zip(result_keys.flatten(), scores.flatten())) - result = [(self.strings[key], score) for key, score in result if key] - result = [(key, score) for key, score in result if key not in keys] - return result + if key_row < self.cache["indices"].shape[0]: + rows = self.cache["indices"][key_row, :n] + scores = self.cache["scores"][key_row, :n] + entries = zip(rows, scores) + entries = [(self.strings[self.row2key[r]], score) for r, score in entries + if r in self.row2key] + return entries + # Always ask for more because we'll always get the keys themselves + n = min(len(self.vectors), n + len(keys)) + rows = numpy.asarray(self.vectors.find(keys=keys)) + vecs = self.vectors.data[rows] + average = vecs.mean(axis=0, keepdims=True) + result_keys, _, scores = self.vectors.most_similar( + average, n=n, batch_size=batch_size + ) + result = list(zip(result_keys.flatten(), scores.flatten())) + result = [(self.strings[key], score) for key, score in result if key] + result = [(key, score) for key, score in result if key not in keys] + return result def get_other_senses( self, key: Union[str, int], ignore_case: bool = True From d6cab3f6e7f3efb5e5c8c308dfe3aeddca0e1f34 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:32:14 +0100 Subject: [PATCH 262/297] Add cutoffs to export script --- scripts/05_export.py | 97 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 79 insertions(+), 18 deletions(-) diff --git a/scripts/05_export.py b/scripts/05_export.py index f7eee1c..dc1d90c 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -1,6 +1,7 @@ #!/usr/bin/env python +from collections import OrderedDict, defaultdict from sense2vec import Sense2Vec -from sense2vec.util import split_key +from sense2vec.util import split_key, cosine_similarity from pathlib import Path import plac from wasabi import msg @@ -22,12 +23,75 @@ def _get_shape(file_): return shape, file_ +def read_vocab(vocab_file): + freqs = OrderedDict() + for line in vocab_file: + item = line.rstrip() + if item.endswith(" word"): # for fastText vocabs + item = item[:-5] + try: + key, freq = item.rsplit(" ", 1) + except ValueError: + continue + freqs[key] = int(freq) + return freqs + + +def get_minority_keys(freqs, min_ratio): + """Remove keys that are too infrequent relative to a main sense.""" + by_word = defaultdict(list) + for key, freq in freqs.items(): + try: + term, sense = split_key(key) + except ValueError: + continue + if freq: + by_word[term.lower()].append((freq, key)) + discarded = [] + for values in by_word.values(): + if len(values) >= 2: + values.sort(reverse=True) + freq1, key1 = values[0] + for freq2, key2 in values[1:]: + ratio = freq2 / freq1 + if ratio < min_ratio: + discarded.append(key2) + return discarded + + 
+def get_redundant_keys(vocab, vectors, min_distance): + if min_distance <= 0.0: + return [] + by_word = defaultdict(list) + for key, freq in vocab.items(): + try: + term, sense = split_key(key) + except ValueError: + continue + term = term.split("_")[-1] + by_word[term.lower()].append((freq, key)) + too_similar = [] + for values in by_word.values(): + if len(values) >= 2: + values.sort(reverse=True) + freq1, key1 = values[0] + vector1 = vectors[key1] + for freq2, key2 in values[1:]: + vector2 = vectors[key2] + sim = cosine_similarity(vector1, vector2) + if sim >= (1-min_distance): + too_similar.append(key2) + return too_similar + + @plac.annotations( in_file=("Vectors file (text-based)", "positional", None, str), vocab_file=("Vocabulary file", "positional", None, str), out_dir=("Path to output directory", "positional", None, str), + min_freq_ratio=("Frequency ratio threshold for discarding minority senses or casings.", "option", "r", float), + min_distance=("Similarity threshold for discarding redundant keys.", "option", "s", float) ) -def main(in_file, vocab_file, out_dir): +def main(in_file, vocab_file, out_dir, min_freq_ratio=0.0, min_distance=0.0): """ Step 5: Export a sense2vec component @@ -50,8 +114,8 @@ def main(in_file, vocab_file, out_dir): (n_vectors, vector_size), f = _get_shape(f) vectors_data = f.readlines() with vocab_path.open("r", encoding="utf8") as f: - vocab_data = f.readlines() - data = [] + vocab = read_vocab(f) + vectors = {} all_senses = set() for item in vectors_data: item = item.rstrip().rsplit(" ", vector_size) @@ -64,21 +128,18 @@ def main(in_file, vocab_file, out_dir): if len(vec) != vector_size: msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1) all_senses.add(sense) - data.append((key, numpy.asarray(vec, dtype=numpy.float32))) - s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses) - for key, vector in data: - s2v.add(key, vector) - for item in vocab_data: - item = item.rstrip() - if item.endswith(" word"): # for fastText vocabs - item = item[:-5] - try: - key, freq = item.rsplit(" ", 1) - except ValueError: - continue - s2v.set_freq(key, int(freq)) + vectors[key] = numpy.asarray(vec, dtype=numpy.float32) + discarded = set() + discarded.update(get_minority_keys(vocab, min_freq_ratio)) + discarded.update(get_redundant_keys(vocab, vectors, min_distance)) + n_vectors = len(vectors) - len(discarded) + s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses) + for key, vector in vectors.items(): + if key not in discarded: + s2v.add(key, vector) + s2v.set_freq(key, vocab[key]) msg.good("Created the sense2vec model") - msg.info(f"{len(data)} vectors, {len(all_senses)} total senses") + msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses") s2v.to_disk(output_path) msg.good("Saved model to directory", out_dir) From 9a005fd78446abd8c9f60aac6f17ddc004e44919 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:33:13 +0100 Subject: [PATCH 263/297] Fix enabling GPU in precompute cache --- scripts/06_precompute_cache.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index fb19845..7ed302f 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -32,9 +32,9 @@ def main( else: import cupy as xp import cupy.cuda.device - - cupy.take_along_axis = take_along_axis + xp.take_along_axis = take_along_axis device = cupy.cuda.device.Device(gpu_id) + cupy.cuda.get_cublas_handle() device.use() 
vectors_dir = Path(vectors) vectors_file = vectors_dir / "vectors" From 1388bdffd320adb5ba7db2d66676ed3d58b19da1 Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:33:49 +0100 Subject: [PATCH 264/297] Format --- scripts/05_export.py | 18 ++++++++++++++---- scripts/06_precompute_cache.py | 7 ++++--- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/scripts/05_export.py b/scripts/05_export.py index dc1d90c..922bb16 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -35,7 +35,7 @@ def read_vocab(vocab_file): continue freqs[key] = int(freq) return freqs - + def get_minority_keys(freqs, min_ratio): """Remove keys that are too infrequent relative to a main sense.""" @@ -79,7 +79,7 @@ def get_redundant_keys(vocab, vectors, min_distance): for freq2, key2 in values[1:]: vector2 = vectors[key2] sim = cosine_similarity(vector1, vector2) - if sim >= (1-min_distance): + if sim >= (1 - min_distance): too_similar.append(key2) return too_similar @@ -88,8 +88,18 @@ def get_redundant_keys(vocab, vectors, min_distance): in_file=("Vectors file (text-based)", "positional", None, str), vocab_file=("Vocabulary file", "positional", None, str), out_dir=("Path to output directory", "positional", None, str), - min_freq_ratio=("Frequency ratio threshold for discarding minority senses or casings.", "option", "r", float), - min_distance=("Similarity threshold for discarding redundant keys.", "option", "s", float) + min_freq_ratio=( + "Frequency ratio threshold for discarding minority senses or casings.", + "option", + "r", + float, + ), + min_distance=( + "Similarity threshold for discarding redundant keys.", + "option", + "s", + float, + ), ) def main(in_file, vocab_file, out_dir, min_freq_ratio=0.0, min_distance=0.0): """ diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 7ed302f..55a9601 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -11,7 +11,7 @@ gpu_id=("GPU device (-1 for CPU)", "option", "g", int), n_neighbors=("Number of neighbors to cache", "option", "n", int), batch_size=("Batch size for to reduce memory usage.", "option", "b", int), - cutoff=("Limit neighbors to this many earliest rows", "option", "c", int,), + cutoff=("Limit neighbors to this many earliest rows", "option", "c", int), start=("Index of vectors to start at.", "option", "s", int), end=("Index of vectors to stop at.", "option", "e", int), ) @@ -32,6 +32,7 @@ def main( else: import cupy as xp import cupy.cuda.device + xp.take_along_axis = take_along_axis device = cupy.cuda.device.Device(gpu_id) cupy.cuda.get_cublas_handle() @@ -66,8 +67,8 @@ def main( sims = xp.dot(batch, subset.T) # Set self-similarities to -inf, so that we don't return them. for j in range(size): - if i+j < sims.shape[1]: - sims[j, i+j] = -xp.inf + if i + j < sims.shape[1]: + sims[j, i + j] = -xp.inf # This used to use argpartition, to do a partial sort...But this ended # up being a ratsnest of terrible numpy crap. Just sorting the whole # list isn't really slower, and it's much simpler to read. 
From a178a21f4755bfe697f3f193bf1536a38d3c391a Mon Sep 17 00:00:00 2001 From: Matthw Honnibal Date: Thu, 28 Nov 2019 18:34:03 +0100 Subject: [PATCH 265/297] Format --- sense2vec/sense2vec.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index af2bca8..7a7170d 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -186,7 +186,7 @@ def similarity( average_a = numpy.vstack([self[key] for key in keys_a]).mean(axis=0) average_b = numpy.vstack([self[key] for key in keys_b]).mean(axis=0) return cosine_similarity(average_a, average_b) - + def most_similar( self, keys: Union[Sequence[Union[str, int]], str, int], @@ -214,8 +214,11 @@ def most_similar( rows = self.cache["indices"][key_row, :n] scores = self.cache["scores"][key_row, :n] entries = zip(rows, scores) - entries = [(self.strings[self.row2key[r]], score) for r, score in entries - if r in self.row2key] + entries = [ + (self.strings[self.row2key[r]], score) + for r, score in entries + if r in self.row2key + ] return entries # Always ask for more because we'll always get the keys themselves n = min(len(self.vectors), n + len(keys)) From abb811a7c4e0fea7f29d0e7bc54ef7fd1670353b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 2 Dec 2019 02:02:41 +0100 Subject: [PATCH 266/297] Fix case-sensitive pattern creation --- sense2vec/prodigy_recipes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 270f354..8be9759 100644 --- a/sense2vec/prodigy_recipes.py +++ b/sense2vec/prodigy_recipes.py @@ -193,7 +193,7 @@ def to_patterns( examples = DB.get_dataset(dataset) terms = set([eg["word"] for eg in examples if eg["answer"] == "accept"]) if case_sensitive: - patterns = [[{"text": t.lower_} for t in nlp.make_doc(term)] for term in terms] + patterns = [[{"text": t.text} for t in nlp.make_doc(term)] for term in terms] else: terms = set([word.lower() for word in terms]) patterns = [[{"lower": t.lower_} for t in nlp.make_doc(term)] for term in terms] From 955eda735bb59c98c8e7bfac067d5e8e00a91bbe Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 14 Apr 2020 14:14:25 +0200 Subject: [PATCH 267/297] Update from macOS-10.13 to macOS-10.14 (#101) --- azure-pipelines.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 281f514..80506e1 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -15,7 +15,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.6' Python36Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.6' Python38Linux: imageName: 'ubuntu-16.04' @@ -24,7 +24,7 @@ jobs: imageName: 'vs2017-win2016' python.version: '3.8' Python38Mac: - imageName: 'macos-10.13' + imageName: 'macos-10.14' python.version: '3.8' maxParallel: 4 pool: From 48b03f107237309ce0d3f7ef27998ff33f8ef11b Mon Sep 17 00:00:00 2001 From: cerules Date: Tue, 14 Apr 2020 08:14:54 -0400 Subject: [PATCH 268/297] fix missing f before format string (#98) --- scripts/03_glove_build_counts.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/03_glove_build_counts.py b/scripts/03_glove_build_counts.py index eb1b923..5fb05b6 100644 --- a/scripts/03_glove_build_counts.py +++ b/scripts/03_glove_build_counts.py @@ -61,7 +61,7 @@ def main( cmd = ( f"cat {' '.join(input_files)} | {glove_dir}/cooccur -memory {memory} " f"-vocab-file {vocab_file} -verbose {verbose} " - "-window-size {window_size} 
> {cooc_file}" + f"-window-size {window_size} > {cooc_file}" ) print(cmd) cooccur_cmd = os.system(cmd) From b330903fbb3e5ec65d5c27a4cec93cb5b2abde98 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 14 Apr 2020 14:20:04 +0200 Subject: [PATCH 269/297] fix few small typo's in readme (#97) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 761b237..6ceefee 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ semantic similarities across all Reddit comments of 2015 and 2019, see the ([details](#-training-your-own-sense2vec-vectors)). - [Prodigy](https://prodi.gy) **annotation recipes** for evaluating models, creating lists of similar multi-word phrases and converting them to match - patterns, e.g. for rule-based NER or to boostrap NER annotation + patterns, e.g. for rule-based NER or to bootstrap NER annotation ([details & examples](#-prodigy-recipes)). ## 🚀 Quickstart @@ -151,7 +151,7 @@ s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/s2v_reddit_2015_md") nlp.add_pipe(s2v) ``` -The component will add serveral +The component will add several [extension attributes and methods](https://spacy.io/usage/processing-pipelines#custom-components-attributes) to spaCy's `Token` and `Span` objects that let you retrieve vectors and frequencies, as well as most similar terms. @@ -401,7 +401,7 @@ assert "VERB" in s2v.senses #### property `Sense2vec.frequencies` -The frequencies of they keys in the table, in descending order. +The frequencies of the keys in the table, in descending order. | Argument | Type | Description | | ----------- | ---- | -------------------------------------------------- | @@ -745,7 +745,7 @@ The following recipes are available – see below for more detailed docs. ### recipe `sense2vec.teach` Bootstrap a terminology list using sense2vec. Prodigy will suggest similar terms -based on the the most similar phrases from sense2vec, and the suggestions will +based on the most similar phrases from sense2vec, and the suggestions will be adjusted as you annotate and accept similar phrases. For each seed term, the best matching sense according to the sense2vec vectors will be used. 
From 87a8d62d40e4b2982638fd2ab862dc3edb0038d7 Mon Sep 17 00:00:00 2001 From: Andy Date: Tue, 14 Apr 2020 15:58:39 -0400 Subject: [PATCH 270/297] Split up spacy bin files --- scripts/01_parse.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/scripts/01_parse.py b/scripts/01_parse.py index e9b79fe..1f7e5bd 100644 --- a/scripts/01_parse.py +++ b/scripts/01_parse.py @@ -12,8 +12,9 @@ out_dir=("Path to output directory", "positional", None, str), spacy_model=("Name of spaCy model to use", "positional", None, str), n_process=("Number of processes (multiprocessing)", "option", "n", int), + max_docs=("Maximum docs per batch", "option", "m", int), ) -def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): +def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**6): """ Step 1: Parse raw text with spaCy @@ -31,17 +32,30 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): msg.info(f"Using spaCy model {spacy_model}") doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]) msg.text("Preprocessing text...") + count = 0 + batch_num = 0 with input_path.open("r", encoding="utf8") as texts: docs = nlp.pipe(texts, n_process=n_process) for doc in tqdm.tqdm(docs, desc="Docs", unit=""): - doc_bin.add(doc) - msg.good(f"Processed {len(doc_bin)} docs") - doc_bin_bytes = doc_bin.to_bytes() - output_file = output_path / f"{input_path.stem}.spacy" - with output_file.open("wb") as f: - f.write(doc_bin_bytes) - msg.good(f"Saved parsed docs to file", output_file.resolve()) - + if count < max_docs: + doc_bin.add(doc) + count += 1 + else: + batch_num += 1 + count = 0 + msg.good(f"Processed {len(doc_bin)} docs") + doc_bin_bytes = doc_bin.to_bytes() + output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" + with output_file.open("wb") as f: + f.write(doc_bin_bytes) + msg.good(f"Saved parsed docs to file", output_file.resolve()) + doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]) + with output_file.open("wb") as f: + batch_num += 1 + output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" + doc_bin_bytes = doc_bin.to_bytes() + f.write(doc_bin_bytes) + msg.good(f"Complete. 
Saved final parsed docs to file", output_file.resolve()) if __name__ == "__main__": plac.call(main) From 0c1221eb45f4ad95d8eefec07861ae68feb62443 Mon Sep 17 00:00:00 2001 From: "Sheffield, David" Date: Mon, 20 Apr 2020 16:38:22 -0700 Subject: [PATCH 271/297] fix: make 04_fasttext_train_vectors.py and 05_export.py compatible with Windows 10 docs: update requirements.txt with fastText library reference and version feat: add functionality to save fastText model to disk to avoid retraining the model feat: give control to user to choose # of epochs feat: add functionality to load saved fastText model from disk to create vocab.txt and vectors.txt feat: add more warning, fail, and good messages for user --- scripts/04_fasttext_train_vectors.py | 122 ++++++++++++++++++--------- scripts/05_export.py | 2 +- scripts/requirements.txt | 1 + 3 files changed, 83 insertions(+), 42 deletions(-) diff --git a/scripts/04_fasttext_train_vectors.py b/scripts/04_fasttext_train_vectors.py index 44376d7..4800e2f 100644 --- a/scripts/04_fasttext_train_vectors.py +++ b/scripts/04_fasttext_train_vectors.py @@ -1,26 +1,32 @@ #!/usr/bin/env python import plac -import os from pathlib import Path from wasabi import msg +import fasttext +from errno import EPIPE +# python 04_fasttext_train_vectors.py /path/to/output/director/ -in /path/to/input/directory @plac.annotations( - fasttext_bin=("Path to the fasttext binary", "positional", None, str), - in_dir=("Directory with preprocessed .s2v files", "positional", None, str), out_dir=("Path to output directory", "positional", None, str), + in_dir=("Path to directory with preprocessed .s2v file(s)", "option", "in", str), n_threads=("Number of threads", "option", "t", int), min_count=("Minimum count for inclusion in vocab", "option", "c", int), vector_size=("Dimension of word vector representations", "option", "s", int), + epoch=("Number of times the fastText model will loop over your data", "option", "e", int), + save_fasttext_model=("Save fastText model to output directory as a binary file to avoid retraining", "flag", "sv"), + fasttext_filepath=("Path to saved fastText model .bin file", "option", "ft", str), verbose=("Set verbosity: 0, 1, or 2", "option", "v", int), ) def main( - fasttext_bin, - in_dir, out_dir, + in_dir=None, n_threads=10, min_count=50, vector_size=300, + epoch=5, + save_fasttext_model=False, + fasttext_filepath=None, verbose=2, ): """ @@ -35,51 +41,85 @@ def main( built fasttext binary. The command will also be printed if you want to run it separately. 
""" - input_path = Path(in_dir) + output_path = Path(out_dir) - if not Path(fasttext_bin).exists(): - msg.fail("Can't find fastText binary", fasttext_bin, exits=1) - if not input_path.exists() or not input_path.is_dir(): - msg.fail("Not a valid input directory", in_dir, exits=1) if not output_path.exists(): output_path.mkdir(parents=True) msg.good(f"Created output directory {out_dir}") - output_file = output_path / f"vectors_w2v_{vector_size}dim" - # fastText expects only one input file and only reads from disk and not - # stdin, so we need to create a temporary file that concatenates the inputs - tmp_path = input_path / "s2v_input.tmp" - input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"] - if not input_files: - msg.fail("Input directory contains no .s2v files", in_dir, exits=1) - with tmp_path.open("a", encoding="utf8") as tmp_file: - for input_file in input_files: - with input_file.open("r", encoding="utf8") as f: - tmp_file.write(f.read()) - msg.info("Created temporary merged input file", tmp_path) - msg.info("Training vectors") - cmd = ( - f"{fasttext_bin} skipgram -thread {n_threads} -input {tmp_path} " - f"-output {output_file} -dim {vector_size} -minn 0 -maxn 0 " - f"-minCount {min_count} -verbose {verbose}" - ) - print(cmd) - train_cmd = os.system(cmd) - tmp_path.unlink() - msg.good("Deleted temporary input file", tmp_path) - if train_cmd != 0: - msg.fail("Failed training vectors", exits=1) - msg.good("Successfully trained vectors", out_dir) + if fasttext_filepath: + msg.info("Loading fastText model vectors from .bin file") + if in_dir: + msg.warn(f"Warning: Providing a fastText filepath overrides fastText vector training") + fasttext_filepath = Path(fasttext_filepath) + if not fasttext_filepath.exists() or not fasttext_filepath.is_file() or not (fasttext_filepath.suffix == '.bin'): + msg.fail("Error: fasttext_filepath expects a fastText model .bin file", exits=1) + fasttext_model = fasttext.load_model(str(fasttext_filepath)) + msg.good("Successfully loaded fastText model") + elif in_dir: + msg.info("Training fastText model vectors") + input_path = Path(in_dir) + # Check to see if fasttext_filepath exists + if not input_path.exists() or not input_path.is_dir(): + msg.fail("Not a valid input directory", in_dir, exits=1) + tmp_path = input_path / "s2v_input.tmp" + input_files = [p for p in input_path.iterdir() if p.suffix == ".s2v"] + if not input_files: + msg.fail("Input directory contains no .s2v files", in_dir, exits=1) + # fastText expects only one input file and only reads from disk and not + # stdin, so we need to create a temporary file that concatenates the inputs + with tmp_path.open("a", encoding="utf8") as tmp_file: + for input_file in input_files: + with input_file.open("r", encoding="utf8") as f: + tmp_file.write(f.read()) + msg.info("Created temporary merged input file", tmp_path) + fasttext_model = fasttext.train_unsupervised(str(tmp_path), thread=n_threads, epoch=epoch, dim=vector_size, + minn=0, maxn=0, minCount=min_count, verbose=verbose) + msg.good("Successfully trained fastText model vectors") + + tmp_path.unlink() + msg.good("Deleted temporary input file", tmp_path) + output_file = output_path / f"vectors_w2v_{vector_size}dim.bin" + if save_fasttext_model: + fasttext_model.save_model(str(output_file)) + if not output_file.exists() or not output_file.is_file(): + msg.fail("Failed to save fastText model to disk", output_file, exits=1) + msg.good("Successfully saved fastText model to disk", output_file) + else: + fasttext_model = None + 
msg.fail("Must provide an input directory or fastText binary filepath", exits=1) - msg.info("Creating vocabulary") + msg.info("Creating vocabulary file") vocab_file = output_path / "vocab.txt" - cmd = f"{fasttext_bin} dump {output_file.with_suffix('.bin')} dict > {vocab_file}" - print(cmd) - vocab_cmd = os.system(cmd) - if vocab_cmd != 0: - msg.fail("Failed creating vocabulary", exits=1) + words, freqs = fasttext_model.get_words(include_freq=True) + with vocab_file.open('w', encoding='utf8') as f: + for i in range(len(words)): + f.write(words[i] + " " + str(freqs[i]) + " word\n") + if not vocab_file.exists() or not vocab_file.is_file(): + msg.fail("Failed to create vocabulary", vocab_file, exits=1) msg.good("Successfully created vocabulary file", vocab_file) + msg.info("Creating vectors file") + vectors_file = output_path / "vectors.txt" + # Adapted from https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py#L31 + with vectors_file.open('w', encoding='utf-8') as file_out: + # the first line must contain the number of total words and vector dimension + file_out.write(str(len(words)) + " " + str(fasttext_model.get_dimension()) + '\n') + # line by line, append vector to vectors file + for w in words: + v = fasttext_model.get_word_vector(w) + vstr = "" + for vi in v: + vstr += " " + str(vi) + try: + file_out.write(w + vstr + '\n') + except IOError as e: + if e.errno == EPIPE: + pass + if not vectors_file.exists() or not vectors_file.is_file(): + msg.fail("Failed to create vectors file", vectors_file, exits=1) + msg.good("Successfully created vectors file", vectors_file) + if __name__ == "__main__": plac.call(main) diff --git a/scripts/05_export.py b/scripts/05_export.py index 922bb16..6ea5c0e 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -12,7 +12,7 @@ def _get_shape(file_): """Return a tuple with (number of entries, vector dimensions). 
Handle both word2vec/FastText format, which has a header with this, or GloVe's format, which doesn't.""" - first_line = next(file_).split() + first_line = next(file_).replace('\ufeff','').split() if len(first_line) == 2: return tuple(int(size) for size in first_line), file_ count = 1 diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 7c35914..3e67665 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,2 +1,3 @@ plac>=0.9.6,<1.2.0 tqdm>=4.36.1,<5.0.0 +fasttext>=0.9.1 \ No newline at end of file From 65542e98e0fcf2b3ed5cad81c24f6852d5e1a9d5 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 22 Apr 2020 12:42:09 +0200 Subject: [PATCH 272/297] align package versions with the current spacy defaults --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9103469..31ec30d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ # Our packages spacy>=2.2.3,<3.0.0 -srsly>=0.2.0 -catalogue>=0.0.4 +srsly>=1.0.2,<1.1.0 +catalogue>=0.0.7,<1.1.0 # Third-party dependencies numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" From 17ebaf33fa208b9fb42962a790e18b7a5c3ca0ac Mon Sep 17 00:00:00 2001 From: svlandeg Date: Wed, 22 Apr 2020 12:42:43 +0200 Subject: [PATCH 273/297] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6ceefee..00dfac2 100644 --- a/README.md +++ b/README.md @@ -705,7 +705,7 @@ To train your own sense2vec vectors, you'll need the following: The training process is split up into several steps to allow you to resume at any given point. Processing scripts are designed to operate on single files, -making it easy to paralellize the work. The scripts in this repo require either +making it easy to parallellize the work. The scripts in this repo require either [Glove](https://github.com/stanfordnlp/GloVe) or [fastText](https://github.com/facebookresearch/fastText), which you need to clone and `make`. From b95564038b4d7f3d63c659cb015a49fd8e4b205d Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 24 Apr 2020 13:03:21 +0200 Subject: [PATCH 274/297] ignore Pycharm project files --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a1eb014..bd0e174 100644 --- a/.gitignore +++ b/.gitignore @@ -70,3 +70,6 @@ target/ #Ipython Notebook .ipynb_checkpoints + +# Pycharm project files +/.idea/ From d22b3bceee448685bf40a291c01db9b3b9847a2c Mon Sep 17 00:00:00 2001 From: svlandeg Date: Fri, 24 Apr 2020 14:23:52 +0200 Subject: [PATCH 275/297] add readme part for Fasttext Windows users --- README.md | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 00dfac2..4a8eecf 100644 --- a/README.md +++ b/README.md @@ -707,8 +707,15 @@ The training process is split up into several steps to allow you to resume at any given point. Processing scripts are designed to operate on single files, making it easy to parallellize the work. The scripts in this repo require either [Glove](https://github.com/stanfordnlp/GloVe) or -[fastText](https://github.com/facebookresearch/fastText), which you need to -clone and `make`. +[fastText](https://github.com/facebookresearch/fastText) which you need to +clone and `make`. + +For Fasttext, the scripts will require the path to the created binary file. 
+If you're working on Windows, you can build with `cmake`, or alternatively +use the `.exe` file from this **unofficial** +repo with FastText binary builds for Windows: +https://github.com/xiamx/fastText/releases. + | | Script | Description | | ------ | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | From a1b77729bf12f9f81886f84fff1ece52e8dc40eb Mon Sep 17 00:00:00 2001 From: Anxo06 Date: Thu, 10 Dec 2020 16:53:20 +0100 Subject: [PATCH 276/297] Fix #118 --- sense2vec/component.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index 469ea10..f9634a9 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -196,7 +196,7 @@ def s2v_other_senses(self, obj: Union[Token, Span]) -> List[str]: RETURNS (list): A list of other senses. """ key = self.s2v_key(obj) - return obj._._s2v.get_other_senses(key) + return obj.doc._._s2v.get_other_senses(key) def to_bytes(self) -> bytes: """Serialize the component to a bytestring. From 9f7ea477c55f24f78e6bc46846186aaf642e0286 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 15:03:12 +1100 Subject: [PATCH 277/297] Update for spaCy v3 --- requirements.txt | 7 ++++--- sense2vec/component.py | 25 +++++++++++++++++++++---- sense2vec/sense2vec.py | 3 ++- sense2vec/util.py | 25 ++----------------------- setup.cfg | 12 ++++++------ tests/test_component.py | 11 +++++++---- 6 files changed, 42 insertions(+), 41 deletions(-) diff --git a/requirements.txt b/requirements.txt index 31ec30d..c20e484 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,8 @@ # Our packages -spacy>=2.2.3,<3.0.0 -srsly>=1.0.2,<1.1.0 -catalogue>=0.0.7,<1.1.0 +spacy>=3.0.0,<4.0.0 +wasabi>=0.8.1,<1.1.0 +srsly>=2.4.0,<3.0.0 +catalogue>=2.0.1,<2.1.0 # Third-party dependencies numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" diff --git a/sense2vec/component.py b/sense2vec/component.py index 469ea10..a44a6a3 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -1,16 +1,16 @@ from typing import Tuple, Union, List, Dict -from spacy import component +from spacy.language import Language from spacy.tokens import Doc, Token, Span from spacy.vocab import Vocab -from spacy.language import Language +from spacy.util import SimpleFrozenDict from pathlib import Path import numpy from .sense2vec import Sense2Vec -from .util import registry, SimpleFrozenDict +from .util import registry -@component( +@Language.factory( "sense2vec", requires=["token.pos", "token.dep", "token.ent_type", "token.ent_iob", "doc.ents"], assigns=[ @@ -32,6 +32,23 @@ "span._.s2v_similarity", ], ) +def make_sense2vec( + nlp: Language, + name: str, + shape: Tuple[int, int] = (100, 128), + merge_phrases: bool = False, + lemmatize: bool = False, + overrides: Dict[str, str] = SimpleFrozenDict(), +): + return Sense2VecComponent( + nlp.vocab, + shape=shape, + merge_phrases=merge_phrases, + lemmatize=lemmatize, + overrides=overrides, + ) + + class Sense2VecComponent(object): def __init__( self, diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 7a7170d..9067ef6 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -2,10 +2,11 @@ from pathlib import Path from spacy.vectors import Vectors from spacy.strings 
import StringStore +from spacy.util import SimpleFrozenDict import numpy import srsly -from .util import registry, cosine_similarity, SimpleFrozenDict +from .util import registry, cosine_similarity class Sense2Vec(object): diff --git a/sense2vec/util.py b/sense2vec/util.py index 9c0db4b..e5862fe 100644 --- a/sense2vec/util.py +++ b/sense2vec/util.py @@ -2,7 +2,7 @@ import re from spacy.tokens import Doc, Token, Span from spacy.util import filter_spans -from thinc.neural.util import get_array_module +from thinc.api import get_array_module import catalogue try: @@ -113,7 +113,7 @@ def get_noun_phrases(doc: Doc) -> List[Span]: """ trim_labels = ("advmod", "amod", "compound") spans = [] - if doc.is_parsed: + if doc.has_annotation("DEP"): for np in doc.noun_chunks: while len(np) > 1 and np[0].dep_ not in trim_labels: np = np[1:] @@ -178,24 +178,3 @@ def cosine_similarity(vec1, vec2) -> float: if norm1 == norm2: return 1.0 return xp.dot(vec1, vec2) / (norm1 * norm2) - - -class SimpleFrozenDict(dict): - """Simplified implementation of a frozen dict, mainly used as default - function or method argument (for arguments that should default to empty - dictionary). Will raise an error if user or spaCy attempts to add to dict. - """ - - err = ( - "Can't write to frozen dictionary. This is likely an internal error. " - "Are you writing to a default function argument?" - ) - - def __setitem__(self, key, value): - raise NotImplementedError(self.err) - - def pop(self, key, default=None): - raise NotImplementedError(self.err) - - def update(self, other): - raise NotImplementedError(self.err) diff --git a/setup.cfg b/setup.cfg index ae54e1a..90b7df7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.2 +version = 2.0.0 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion @@ -27,16 +27,16 @@ zip_safe = true include_package_data = true python_requires = >=3.6 install_requires = - spacy>=2.2.3,<3.0.0 - srsly>=0.2.0 - catalogue>=0.0.4 - wasabi>=0.4.0,<1.1.0 + spacy>=3.0.0,<4.0.0 + wasabi>=0.8.1,<1.1.0 + srsly>=2.4.0,<3.0.0 + catalogue>=2.0.1,<2.1.0 numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" [options.entry_points] spacy_factories = - sense2vec = sense2vec:Sense2VecComponent.from_nlp + sense2vec = sense2vec:make_sense2vec prodigy_recipes = sense2vec.teach = sense2vec:prodigy_recipes.teach sens2vec.to-patterns = sense2vec:prodigy_recipes.to_patterns diff --git a/tests/test_component.py b/tests/test_component.py index 7612bee..4541097 100644 --- a/tests/test_component.py +++ b/tests/test_component.py @@ -57,26 +57,29 @@ def test_component_similarity(doc): def test_component_lemmatize(doc): - lookups = doc.vocab.lookups.add_table("lemma_lookup") - lookups["world"] = "wrld" + def lemmatize(doc, lookups): + for token in doc: + token.lemma_ = lookups.get(token.text, token.text) + return doc + s2v = Sense2VecComponent(doc.vocab, shape=(4, 4), lemmatize=True) s2v.first_run = False vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32) s2v.s2v.add("hello|INTJ", vector) s2v.s2v.add("world|NOUN", vector) s2v.s2v.add("wrld|NOUN", vector) + doc = lemmatize(doc, {"world": "wrld"}) doc = s2v(doc) assert doc[0]._.s2v_key == "hello|INTJ" assert doc[1].lemma_ == "wrld" assert doc[1]._.s2v_key == "wrld|NOUN" - lookups["hello"] = "hll" + doc = lemmatize(doc, {"hello": "hll"}) assert doc[0].lemma_ == "hll" assert doc[0]._.s2v_key == "hello|INTJ" s2v.s2v.add("hll|INTJ", vector) assert doc[0]._.s2v_key == "hll|INTJ" new_s2v = 
Sense2VecComponent().from_bytes(s2v.to_bytes()) assert new_s2v.s2v.cfg["lemmatize"] is True - doc.vocab.lookups.remove_table("lemma_lookup") def test_component_to_from_bytes(doc): From 97f9ad8bf61b6767cf8255553edbd96f360af60a Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 15:10:02 +1100 Subject: [PATCH 278/297] Update version --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index 90b7df7..a748103 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 2.0.0 +version = 2.0.0a0 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 2b88fc325f25012da31ac22363918799b581de4b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 15:12:15 +1100 Subject: [PATCH 279/297] Move tests into package and update CI --- azure-pipelines.yml | 33 +++++++++++++++---- {tests => sense2vec/tests}/__init__.py | 0 {tests => sense2vec/tests}/data/cache | Bin {tests => sense2vec/tests}/data/cfg | 0 {tests => sense2vec/tests}/data/freqs.json | 0 {tests => sense2vec/tests}/data/key2row | Bin {tests => sense2vec/tests}/data/strings.json | 0 {tests => sense2vec/tests}/data/vectors | Bin {tests => sense2vec/tests}/test_component.py | 0 {tests => sense2vec/tests}/test_model.py | 0 {tests => sense2vec/tests}/test_sense2vec.py | 0 {tests => sense2vec/tests}/test_util.py | 0 12 files changed, 27 insertions(+), 6 deletions(-) rename {tests => sense2vec/tests}/__init__.py (100%) rename {tests => sense2vec/tests}/data/cache (100%) rename {tests => sense2vec/tests}/data/cfg (100%) rename {tests => sense2vec/tests}/data/freqs.json (100%) rename {tests => sense2vec/tests}/data/key2row (100%) rename {tests => sense2vec/tests}/data/strings.json (100%) rename {tests => sense2vec/tests}/data/vectors (100%) rename {tests => sense2vec/tests}/test_component.py (100%) rename {tests => sense2vec/tests}/test_model.py (100%) rename {tests => sense2vec/tests}/test_sense2vec.py (100%) rename {tests => sense2vec/tests}/test_util.py (100%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 80506e1..bf7b252 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -36,11 +36,32 @@ jobs: versionSpec: '$(python.version)' architecture: 'x64' - - script: pip install -r requirements.txt - displayName: 'Install dependencies' + - script: | + python -m pip install -U setuptools + pip install -r requirements.txt + displayName: "Install dependencies" - - script: pip install -e . 
- displayName: 'Build and install' + - script: | + python setup.py build_ext --inplace + python setup.py sdist --formats=gztar + displayName: "Compile and build sdist" - - script: python -m pytest tests - displayName: 'Run tests' + - task: DeleteFiles@1 + inputs: + contents: "sense2vec" + displayName: "Delete source directory" + + - script: | + pip freeze > installed.txt + pip uninstall -y -r installed.txt + displayName: "Uninstall all packages" + + - bash: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + pip install dist/$SDIST + displayName: "Install from sdist" + + - script: | + pip install -r requirements.txt + python -m pytest --pyargs sense2vec + displayName: "Run tests" diff --git a/tests/__init__.py b/sense2vec/tests/__init__.py similarity index 100% rename from tests/__init__.py rename to sense2vec/tests/__init__.py diff --git a/tests/data/cache b/sense2vec/tests/data/cache similarity index 100% rename from tests/data/cache rename to sense2vec/tests/data/cache diff --git a/tests/data/cfg b/sense2vec/tests/data/cfg similarity index 100% rename from tests/data/cfg rename to sense2vec/tests/data/cfg diff --git a/tests/data/freqs.json b/sense2vec/tests/data/freqs.json similarity index 100% rename from tests/data/freqs.json rename to sense2vec/tests/data/freqs.json diff --git a/tests/data/key2row b/sense2vec/tests/data/key2row similarity index 100% rename from tests/data/key2row rename to sense2vec/tests/data/key2row diff --git a/tests/data/strings.json b/sense2vec/tests/data/strings.json similarity index 100% rename from tests/data/strings.json rename to sense2vec/tests/data/strings.json diff --git a/tests/data/vectors b/sense2vec/tests/data/vectors similarity index 100% rename from tests/data/vectors rename to sense2vec/tests/data/vectors diff --git a/tests/test_component.py b/sense2vec/tests/test_component.py similarity index 100% rename from tests/test_component.py rename to sense2vec/tests/test_component.py diff --git a/tests/test_model.py b/sense2vec/tests/test_model.py similarity index 100% rename from tests/test_model.py rename to sense2vec/tests/test_model.py diff --git a/tests/test_sense2vec.py b/sense2vec/tests/test_sense2vec.py similarity index 100% rename from tests/test_sense2vec.py rename to sense2vec/tests/test_sense2vec.py diff --git a/tests/test_util.py b/sense2vec/tests/test_util.py similarity index 100% rename from tests/test_util.py rename to sense2vec/tests/test_util.py From a563624209c8d2bb1ed9b3fbe9399cb3415a99b1 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 15:25:38 +1100 Subject: [PATCH 280/297] Include test data --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index c1a7121..9017ba7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,3 @@ include LICENSE include README.md +include sense2vec/tests/data/* From b4db740bef28cb94c9bfb03801010d8339d4e588 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 16:02:08 +1100 Subject: [PATCH 281/297] Update scripts --- scripts/01_parse.py | 24 ++--- scripts/02_preprocess.py | 19 ++-- scripts/03_glove_build_counts.py | 23 +++-- scripts/04_fasttext_train_vectors.py | 74 ++++++++------- scripts/04_glove_train_vectors.py | 35 +++---- scripts/05_export.py | 137 +++++++++++++-------------- scripts/06_precompute_cache.py | 28 +++--- scripts/requirements.txt | 4 +- 8 files changed, 168 insertions(+), 176 deletions(-) diff --git a/scripts/01_parse.py b/scripts/01_parse.py index 1f7e5bd..b5e63da 100644 --- a/scripts/01_parse.py 
+++ b/scripts/01_parse.py @@ -1,20 +1,19 @@ #!/usr/bin/env python import spacy from spacy.tokens import DocBin -import plac from wasabi import msg from pathlib import Path import tqdm +import typer -@plac.annotations( - in_file=("Path to input file", "positional", None, str), - out_dir=("Path to output directory", "positional", None, str), - spacy_model=("Name of spaCy model to use", "positional", None, str), - n_process=("Number of processes (multiprocessing)", "option", "n", int), - max_docs=("Maximum docs per batch", "option", "m", int), -) -def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**6): +def main( + in_file: str = typer.Argument(..., help="Path to input file"), + out_dir: str = typer.Argument(..., help="Path to output directory"), + spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"), + n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"), + max_docs: int = typer.Option(10 ** 6, "--max-docs", "-m", help="Maximum docs per batch"), +): """ Step 1: Parse raw text with spaCy @@ -55,7 +54,10 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=1 output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" doc_bin_bytes = doc_bin.to_bytes() f.write(doc_bin_bytes) - msg.good(f"Complete. Saved final parsed docs to file", output_file.resolve()) + msg.good( + f"Complete. Saved final parsed docs to file", output_file.resolve() + ) + if __name__ == "__main__": - plac.call(main) + typer.run(main) diff --git a/scripts/02_preprocess.py b/scripts/02_preprocess.py index c0cacae..b61eeb0 100644 --- a/scripts/02_preprocess.py +++ b/scripts/02_preprocess.py @@ -2,19 +2,20 @@ from sense2vec.util import make_key, make_spacy_key, merge_phrases import spacy from spacy.tokens import DocBin -import plac from wasabi import msg from pathlib import Path import tqdm +import typer -@plac.annotations( - in_file=("Path to input file", "positional", None, str), - out_dir=("Path to output directory", "positional", None, str), - spacy_model=("Name of spaCy model to use", "positional", None, str), - n_process=("Number of processes (multiprocessing)", "option", "n", int), -) -def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): +def main( + # fmt: off + in_file: str = typer.Argument(..., help="Path to input file"), + out_dir: str = typer.Argument(..., help="Path to output directory"), + spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"), + n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"), + # fmt: on +): """ Step 2: Preprocess text in sense2vec's format @@ -65,4 +66,4 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1): if __name__ == "__main__": - plac.call(main) + typer.run(main) diff --git a/scripts/03_glove_build_counts.py b/scripts/03_glove_build_counts.py index 5fb05b6..9e093e3 100644 --- a/scripts/03_glove_build_counts.py +++ b/scripts/03_glove_build_counts.py @@ -1,21 +1,20 @@ #!/usr/bin/env python -import plac import os from pathlib import Path from wasabi import msg +import typer -@plac.annotations( - glove_dir=("Directory containing the GloVe build", "positional", None, str), - in_dir=("Directory with preprocessed .s2v files", "positional", None, str), - out_dir=("Path to output directory", "positional", None, str), - min_count=("Minimum count for inclusion in vocab", "option", "c", int), - memory=("Soft limit for memory consumption, 
in GB", "option", "m", float), - window_size=("Number of context words on either side", "option", "w", int), - verbose=("Set verbosity: 0, 1, or 2", "option", "v", int), -) def main( - glove_dir, in_dir, out_dir, min_count=5, memory=4.0, window_size=15, verbose=2 + # fmt: off + glove_dir: str = typer.Argument(..., help="Directory containing the GloVe build"), + in_dir: str = typer.Argument(..., help="Directory with preprocessed .s2v files"), + out_dir: str = typer.Argument(..., help="Path to output directory"), + min_count: int = typer.Option(5, "--min-count", "-c", help="Minimum count for inclusion in vocab"), + memory: float = typer.Option(4.0, "--memory", "-m", help="Soft limit for memory consumption, in GB"), + window_size: int = typer.Option(15, "--window-size", "-w", help="Number of context words on either side"), + verbose: int = typer.Option(2, "--verbose", "-v", help="Set verbosity: 0, 1, or 2"), + # fmt: on ): """ Step 3: Build vocabulary and frequency counts @@ -82,4 +81,4 @@ def main( if __name__ == "__main__": - plac.call(main) + typer.run(main) diff --git a/scripts/04_fasttext_train_vectors.py b/scripts/04_fasttext_train_vectors.py index 4800e2f..b0cc1a3 100644 --- a/scripts/04_fasttext_train_vectors.py +++ b/scripts/04_fasttext_train_vectors.py @@ -1,33 +1,26 @@ #!/usr/bin/env python -import plac +from typing import Optional from pathlib import Path from wasabi import msg import fasttext from errno import EPIPE +import typer + # python 04_fasttext_train_vectors.py /path/to/output/director/ -in /path/to/input/directory -@plac.annotations( - out_dir=("Path to output directory", "positional", None, str), - in_dir=("Path to directory with preprocessed .s2v file(s)", "option", "in", str), - n_threads=("Number of threads", "option", "t", int), - min_count=("Minimum count for inclusion in vocab", "option", "c", int), - vector_size=("Dimension of word vector representations", "option", "s", int), - epoch=("Number of times the fastText model will loop over your data", "option", "e", int), - save_fasttext_model=("Save fastText model to output directory as a binary file to avoid retraining", "flag", "sv"), - fasttext_filepath=("Path to saved fastText model .bin file", "option", "ft", str), - verbose=("Set verbosity: 0, 1, or 2", "option", "v", int), -) def main( - out_dir, - in_dir=None, - n_threads=10, - min_count=50, - vector_size=300, - epoch=5, - save_fasttext_model=False, - fasttext_filepath=None, - verbose=2, + # fmt: off + out_dir: str = typer.Argument(..., help="Path to output directory"), + in_dir: Optional[str] = typer.Argument(None, help="Path to directory with preprocessed .s2v file(s)"), + n_threads: int = typer.Option(10, "--n-threads", "-t", help="Number of threads"), + min_count: int = typer.Option(50, "--min-count", "-c", help="Minimum count for inclusion in vocab"), + vector_size: int = typer.Option(300, "--vector-size", "-s", help="Dimension of word vector representations"), + epoch: int = typer.Option(5, "--epoch", "-e", help="Number of times the fastText model will loop over your data"), + save_fasttext_model: bool = typer.Option(False, "--save-fasttext-model", "-sv", help="Save fastText model to output directory as a binary file to avoid retraining"), + fasttext_filepath: Optional[str] = typer.Option(None, "--fasttext-filepath", "-ft", help="Path to saved fastText model .bin file"), + verbose: int = typer.Option(2, "--verbose", "-v", help="Set verbosity: 0, 1, or 2"), + # fmt: on ): """ Step 4: Train the vectors @@ -41,7 +34,6 @@ def main( built fasttext binary. 
The command will also be printed if you want to run it separately. """ - output_path = Path(out_dir) if not output_path.exists(): output_path.mkdir(parents=True) @@ -50,10 +42,18 @@ def main( if fasttext_filepath: msg.info("Loading fastText model vectors from .bin file") if in_dir: - msg.warn(f"Warning: Providing a fastText filepath overrides fastText vector training") + msg.warn( + f"Warning: Providing a fastText filepath overrides fastText vector training" + ) fasttext_filepath = Path(fasttext_filepath) - if not fasttext_filepath.exists() or not fasttext_filepath.is_file() or not (fasttext_filepath.suffix == '.bin'): - msg.fail("Error: fasttext_filepath expects a fastText model .bin file", exits=1) + if ( + not fasttext_filepath.exists() + or not fasttext_filepath.is_file() + or not (fasttext_filepath.suffix == ".bin") + ): + msg.fail( + "Error: fasttext_filepath expects a fastText model .bin file", exits=1 + ) fasttext_model = fasttext.load_model(str(fasttext_filepath)) msg.good("Successfully loaded fastText model") elif in_dir: @@ -73,8 +73,16 @@ def main( with input_file.open("r", encoding="utf8") as f: tmp_file.write(f.read()) msg.info("Created temporary merged input file", tmp_path) - fasttext_model = fasttext.train_unsupervised(str(tmp_path), thread=n_threads, epoch=epoch, dim=vector_size, - minn=0, maxn=0, minCount=min_count, verbose=verbose) + fasttext_model = fasttext.train_unsupervised( + str(tmp_path), + thread=n_threads, + epoch=epoch, + dim=vector_size, + minn=0, + maxn=0, + minCount=min_count, + verbose=verbose, + ) msg.good("Successfully trained fastText model vectors") tmp_path.unlink() @@ -92,7 +100,7 @@ def main( msg.info("Creating vocabulary file") vocab_file = output_path / "vocab.txt" words, freqs = fasttext_model.get_words(include_freq=True) - with vocab_file.open('w', encoding='utf8') as f: + with vocab_file.open("w", encoding="utf8") as f: for i in range(len(words)): f.write(words[i] + " " + str(freqs[i]) + " word\n") if not vocab_file.exists() or not vocab_file.is_file(): @@ -102,9 +110,11 @@ def main( msg.info("Creating vectors file") vectors_file = output_path / "vectors.txt" # Adapted from https://github.com/facebookresearch/fastText/blob/master/python/doc/examples/bin_to_vec.py#L31 - with vectors_file.open('w', encoding='utf-8') as file_out: + with vectors_file.open("w", encoding="utf-8") as file_out: # the first line must contain the number of total words and vector dimension - file_out.write(str(len(words)) + " " + str(fasttext_model.get_dimension()) + '\n') + file_out.write( + str(len(words)) + " " + str(fasttext_model.get_dimension()) + "\n" + ) # line by line, append vector to vectors file for w in words: v = fasttext_model.get_word_vector(w) @@ -112,7 +122,7 @@ def main( for vi in v: vstr += " " + str(vi) try: - file_out.write(w + vstr + '\n') + file_out.write(w + vstr + "\n") except IOError as e: if e.errno == EPIPE: pass @@ -122,4 +132,4 @@ def main( if __name__ == "__main__": - plac.call(main) + typer.run(main) diff --git a/scripts/04_glove_train_vectors.py b/scripts/04_glove_train_vectors.py index d39ea63..7a659ed 100644 --- a/scripts/04_glove_train_vectors.py +++ b/scripts/04_glove_train_vectors.py @@ -1,31 +1,22 @@ #!/usr/bin/env python -import plac import os from pathlib import Path from wasabi import msg +import typer -@plac.annotations( - glove_dir=("Directory containing the GloVe build", "positional", None, str), - in_file=("Input file (shuffled cooccurrences)", "positional", None, str), - vocab_file=("Vocabulary file", "positional", None, 
str), - out_dir=("Path to output directory", "positional", None, str), - n_threads=("Number of threads", "option", "t", int), - n_iter=("Number of iterations", "option", "n", int), - x_max=("Parameter specifying cutoff in weighting function", "option", "x", int), - vector_size=("Dimension of word vector representations", "option", "s", int), - verbose=("Set verbosity: 0, 1, or 2", "option", "v", int), -) def main( - glove_dir, - in_file, - vocab_file, - out_dir, - n_threads=8, - n_iter=15, - x_max=10, - vector_size=128, - verbose=2, + # fmt: off + glove_dir: str = typer.Argument(..., help="Directory containing the GloVe build"), + in_file: str = typer.Argument(..., help="Input file (shuffled cooccurrences)"), + vocab_file: str = typer.Argument(..., help="Vocabulary file"), + out_dir: str = typer.Argument(..., help="Path to output directory"), + n_threads: int = typer.Option(8, "--n-threads", "-t", help="Number of threads"), + n_iter: int = typer.Option(15, "--n-iter", "-n", help="Number of iterations"), + x_max: int = typer.Option(10, "--x-max", "-x", help="Parameter specifying cutoff in weighting function"), + vector_size: int = typer.Option(128, "--vector-size", "-s", help="Dimension of word vector representations"), + verbose: int = typer.Option(2, "--verbose", "-v", help="Set verbosity: 0, 1, or 2"), + # fmt: on ): """ Step 4: Train the vectors @@ -63,4 +54,4 @@ def main( if __name__ == "__main__": - plac.call(main) + typer.run(main) diff --git a/scripts/05_export.py b/scripts/05_export.py index 6ea5c0e..2302501 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -3,16 +3,77 @@ from sense2vec import Sense2Vec from sense2vec.util import split_key, cosine_similarity from pathlib import Path -import plac from wasabi import msg import numpy +import typer + + +def main( + # fmt: off + in_file: str = typer.Argument(..., help="Vectors file (text-based)"), + vocab_file: str = typer.Argument(..., help="Vocabulary file"), + out_dir: str = typer.Argument(..., help="Path to output directory"), + min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"), + min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"), + # fmt: on +): + """ + Step 5: Export a sense2vec component + + Expects a vectors.txt and a vocab file trained with GloVe and exports + a component that can be loaded with Sense2vec.from_disk. 
+ """ + input_path = Path(in_file) + vocab_path = Path(vocab_file) + output_path = Path(out_dir) + if not input_path.exists(): + msg.fail("Can't find input file", in_file, exits=1) + if input_path.suffix == ".bin": + msg.fail("Need text-based vectors file, not binary", in_file, exits=1) + if not vocab_path.exists(): + msg.fail("Can't find vocab file", vocab_file, exits=1) + if not output_path.exists(): + output_path.mkdir(parents=True) + msg.good(f"Created output directory {out_dir}") + with input_path.open("r", encoding="utf8") as f: + (n_vectors, vector_size), f = _get_shape(f) + vectors_data = f.readlines() + with vocab_path.open("r", encoding="utf8") as f: + vocab = read_vocab(f) + vectors = {} + all_senses = set() + for item in vectors_data: + item = item.rstrip().rsplit(" ", vector_size) + key = item[0] + try: + _, sense = split_key(key) + except ValueError: + continue + vec = item[1:] + if len(vec) != vector_size: + msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1) + all_senses.add(sense) + vectors[key] = numpy.asarray(vec, dtype=numpy.float32) + discarded = set() + discarded.update(get_minority_keys(vocab, min_freq_ratio)) + discarded.update(get_redundant_keys(vocab, vectors, min_distance)) + n_vectors = len(vectors) - len(discarded) + s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses) + for key, vector in vectors.items(): + if key not in discarded: + s2v.add(key, vector) + s2v.set_freq(key, vocab[key]) + msg.good("Created the sense2vec model") + msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses") + s2v.to_disk(output_path) + msg.good("Saved model to directory", out_dir) def _get_shape(file_): """Return a tuple with (number of entries, vector dimensions). Handle both word2vec/FastText format, which has a header with this, or GloVe's format, which doesn't.""" - first_line = next(file_).replace('\ufeff','').split() + first_line = next(file_).replace("\ufeff", "").split() if len(first_line) == 2: return tuple(int(size) for size in first_line), file_ count = 1 @@ -84,75 +145,5 @@ def get_redundant_keys(vocab, vectors, min_distance): return too_similar -@plac.annotations( - in_file=("Vectors file (text-based)", "positional", None, str), - vocab_file=("Vocabulary file", "positional", None, str), - out_dir=("Path to output directory", "positional", None, str), - min_freq_ratio=( - "Frequency ratio threshold for discarding minority senses or casings.", - "option", - "r", - float, - ), - min_distance=( - "Similarity threshold for discarding redundant keys.", - "option", - "s", - float, - ), -) -def main(in_file, vocab_file, out_dir, min_freq_ratio=0.0, min_distance=0.0): - """ - Step 5: Export a sense2vec component - - Expects a vectors.txt and a vocab file trained with GloVe and exports - a component that can be loaded with Sense2vec.from_disk. 
- """ - input_path = Path(in_file) - vocab_path = Path(vocab_file) - output_path = Path(out_dir) - if not input_path.exists(): - msg.fail("Can't find input file", in_file, exits=1) - if input_path.suffix == ".bin": - msg.fail("Need text-based vectors file, not binary", in_file, exits=1) - if not vocab_path.exists(): - msg.fail("Can't find vocab file", vocab_file, exits=1) - if not output_path.exists(): - output_path.mkdir(parents=True) - msg.good(f"Created output directory {out_dir}") - with input_path.open("r", encoding="utf8") as f: - (n_vectors, vector_size), f = _get_shape(f) - vectors_data = f.readlines() - with vocab_path.open("r", encoding="utf8") as f: - vocab = read_vocab(f) - vectors = {} - all_senses = set() - for item in vectors_data: - item = item.rstrip().rsplit(" ", vector_size) - key = item[0] - try: - _, sense = split_key(key) - except ValueError: - continue - vec = item[1:] - if len(vec) != vector_size: - msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1) - all_senses.add(sense) - vectors[key] = numpy.asarray(vec, dtype=numpy.float32) - discarded = set() - discarded.update(get_minority_keys(vocab, min_freq_ratio)) - discarded.update(get_redundant_keys(vocab, vectors, min_distance)) - n_vectors = len(vectors) - len(discarded) - s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses) - for key, vector in vectors.items(): - if key not in discarded: - s2v.add(key, vector) - s2v.set_freq(key, vocab[key]) - msg.good("Created the sense2vec model") - msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses") - s2v.to_disk(output_path) - msg.good("Saved model to directory", out_dir) - - if __name__ == "__main__": - plac.call(main) + typer.run(main) diff --git a/scripts/06_precompute_cache.py b/scripts/06_precompute_cache.py index 55a9601..2272cf1 100644 --- a/scripts/06_precompute_cache.py +++ b/scripts/06_precompute_cache.py @@ -1,22 +1,23 @@ -import plac +#!/usr/bin/env python +from typing import Optional import tqdm import numpy import srsly from wasabi import msg from pathlib import Path +import typer -@plac.annotations( - vectors=("Path to sense2vec component directory", "positional", None, str), - gpu_id=("GPU device (-1 for CPU)", "option", "g", int), - n_neighbors=("Number of neighbors to cache", "option", "n", int), - batch_size=("Batch size for to reduce memory usage.", "option", "b", int), - cutoff=("Limit neighbors to this many earliest rows", "option", "c", int), - start=("Index of vectors to start at.", "option", "s", int), - end=("Index of vectors to stop at.", "option", "e", int), -) def main( - vectors, gpu_id=-1, n_neighbors=100, batch_size=1024, cutoff=0, start=0, end=None + # fmt: off + vectors: str = typer.Argument(..., help="Path to sense2vec component directory"), + gpu_id: int = typer.Option(-1, "--gpu-id", "-g", help="GPU device (-1 for CPU)"), + n_neighbors: int = typer.Option(100, "--n-neighbors", "-n", help="Number of neighbors to cache"), + batch_size: int = typer.Option(1024, "--batch-size", "-b", help="Batch size for to reduce memory usage"), + cutoff: int = typer.Option(0, "--cutoff", "-c", help="Limit neighbors to this many earliest rows"), + start: int = typer.Option(0, "--start", "-s", help="Index of vectors to start at"), + end: Optional[int] = typer.Option(None, "--end", "-e", help="Index of vectors to stop at"), + # fmt: on ): """ Step 6: Precompute nearest-neighbor queries (optional) @@ -142,7 +143,4 @@ def take_along_axis(a, indices, axis): if __name__ == "__main__": - try: - plac.call(main) - 
except KeyboardInterrupt: - msg.warn("Cancelled.") + typer.run(main) diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 3e67665..4f348f6 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1,3 +1,3 @@ -plac>=0.9.6,<1.2.0 +typer>=0.3.0 tqdm>=4.36.1,<5.0.0 -fasttext>=0.9.1 \ No newline at end of file +fasttext>=0.9.1 From 1de569fd0890e07b83b9ce1c017331bffd71731b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 16:31:43 +1100 Subject: [PATCH 282/297] Update version [ci skip] --- setup.cfg | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.cfg b/setup.cfg index a748103..90b7df7 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 2.0.0a0 +version = 2.0.0 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 56f87dd01c012a0c8d0fef0faefabcb4977e2d34 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 16:35:13 +1100 Subject: [PATCH 283/297] Update script [ci skip] --- scripts/01_parse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/01_parse.py b/scripts/01_parse.py index b5e63da..f0b3e2f 100644 --- a/scripts/01_parse.py +++ b/scripts/01_parse.py @@ -8,11 +8,13 @@ def main( + # fmt: off in_file: str = typer.Argument(..., help="Path to input file"), out_dir: str = typer.Argument(..., help="Path to output directory"), spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"), n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"), max_docs: int = typer.Option(10 ** 6, "--max-docs", "-m", help="Maximum docs per batch"), + # fmt: on ): """ Step 1: Parse raw text with spaCy From 2012112c6faacc38af4ba24152585c8445cf7c0b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 16:49:12 +1100 Subject: [PATCH 284/297] Add initialize method --- sense2vec/sense2vec.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 9067ef6..e286878 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,5 +1,7 @@ -from typing import Tuple, List, Union, Sequence, Dict, Callable, Any +from typing import Tuple, List, Union, Sequence, Dict, Callable, Any, Iterable +from typing import Optional from pathlib import Path +from spacy.language import Language from spacy.vectors import Vectors from spacy.strings import StringStore from spacy.util import SimpleFrozenDict @@ -295,6 +297,24 @@ def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: data["cache"] = self.cache return srsly.msgpack_dumps(data) + def initialize( + self, + get_examples: Callable[[], Iterable], + *, + nlp: Optional[Language] = None, + data_path: Optional[str] = None + ): + """Initialize the component and load in data. Can be used to add the + component with vectors to a pipeline before training. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + data_path (Optional[str]): Optional path to sense2vec model. + """ + if data_path is not None: + self.from_disk(data_path) + def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a bytestring. 
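PATCH 271 (later reformatted in PATCH 281) replaces the call to an external fastText binary with the `fasttext` Python package. A condensed sketch of the training and export flow, assuming the preprocessed `.s2v` files have already been concatenated into one temporary file; the file names below are placeholders:

```python
import fasttext

# Train skipgram vectors on the merged sense2vec-formatted corpus
model = fasttext.train_unsupervised(
    "s2v_input.tmp",      # merged .s2v input, produced by the preprocessing step
    model="skipgram",
    dim=300,              # vector_size
    minn=0, maxn=0,       # disable subword n-grams for multi-word keys
    minCount=50,
    epoch=5,
    thread=10,
)
model.save_model("vectors_w2v_300dim.bin")  # optional: avoids retraining later

# Export vocab.txt ("term freq word") and vectors.txt (word2vec text format)
words, freqs = model.get_words(include_freq=True)
with open("vocab.txt", "w", encoding="utf8") as f:
    for word, freq in zip(words, freqs):
        f.write(f"{word} {freq} word\n")
with open("vectors.txt", "w", encoding="utf8") as f:
    f.write(f"{len(words)} {model.get_dimension()}\n")
    for word in words:
        vector = " ".join(str(v) for v in model.get_word_vector(word))
        f.write(f"{word} {vector}\n")
```

The two output files follow the formats the later export step expects, so the rest of the pipeline is unchanged.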
From 7586325a41d8e081b02575f1d4a0c3344b6bd10b Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 16:49:15 +1100 Subject: [PATCH 285/297] Update README.md --- README.md | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 4a8eecf..56bb0ae 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ details, check out semantic similarities across all Reddit comments of 2015 and 2019, see the [interactive demo](https://demos.explosion.ai/sense2vec). -🦆 **Version 1.0 out now!** +🦆 **Version 2.0 (for spaCy v3) out now!** [Read the release notes here.](https://github.com/explosion/sense2vec/releases/) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) @@ -59,13 +59,14 @@ most_similar = s2v.most_similar(query, n=3) ### Usage as a spaCy pipeline component +> ⚠️ Note that this example describes usage with [spaCy v3](https://spacy.io/usage/v3). For usage with spaCy v2, download `sense2vec==1.0.3` and check out the [`v1.x`](https://github.com/explosion/sense2vec/tree/v1.x) branch of this repo. + ```python import spacy -from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/s2v_reddit_2015_md") -nlp.add_pipe(s2v) +s2v = nlp.add_pipe("sense2vec") +s2v.from_disk("/path/to/s2v_reddit_2015_md") doc = nlp("A sentence about natural language processing.") assert doc[3:6].text == "natural language processing" @@ -132,7 +133,7 @@ s2v = Sense2Vec().from_disk("/path/to/s2v_reddit_2015_md") ## 👩‍💻 Usage -### Usage with spaCy v2.2+ +### Usage with spaCy v3 The easiest way to use the library and vectors is to plug it into your spaCy pipeline. The `sense2vec` package exposes a `Sense2VecComponent`, which can be @@ -147,8 +148,8 @@ import spacy from sense2vec import Sense2VecComponent nlp = spacy.load("en_core_web_sm") -s2v = Sense2VecComponent(nlp.vocab).from_disk("/path/to/s2v_reddit_2015_md") -nlp.add_pipe(s2v) +s2v = nlp.add_pipe("sense2vec") +s2v.from_disk("/path/to/s2v_reddit_2015_md") ``` The component will add several @@ -204,6 +205,17 @@ The following attributes are available via the `._` property of `Token` and > have a part-of-speech tag, so if no entity label is present, the "sense" > defaults to the root's part-of-speech tag. +#### Adding sense2vec to a trained pipeline + +If you're training and packaging a spaCy pipeline and want to include a sense2vec component in it, you can load in the data via the [`[initialize]` block](https://spacy.io/usage/training#config-lifecycle) of the training config: + +```ini +[initialize.components] + +[initialize.components.sense2vec] +data_path = "/path/to/s2v_reddit_2015_md" +``` + ### Standalone usage You can also use the underlying `Sense2Vec` class directly and load in the @@ -708,12 +720,12 @@ any given point. Processing scripts are designed to operate on single files, making it easy to parallellize the work. The scripts in this repo require either [Glove](https://github.com/stanfordnlp/GloVe) or [fastText](https://github.com/facebookresearch/fastText) which you need to -clone and `make`. +clone and `make`. -For Fasttext, the scripts will require the path to the created binary file. 
-If you're working on Windows, you can build with `cmake`, or alternatively -use the `.exe` file from this **unofficial** -repo with FastText binary builds for Windows: +For Fasttext, the scripts will require the path to the created binary file. +If you're working on Windows, you can build with `cmake`, or alternatively +use the `.exe` file from this **unofficial** +repo with FastText binary builds for Windows: https://github.com/xiamx/fastText/releases. From b80070cd9de13be86732947ca9f78524c6668019 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 7 Feb 2021 17:06:59 +1100 Subject: [PATCH 286/297] Fix initialize and update tests --- sense2vec/component.py | 21 +++++++++++++++++++- sense2vec/sense2vec.py | 22 +-------------------- sense2vec/tests/test_component.py | 32 +++++++++++++++++++++++++++++++ setup.cfg | 2 +- 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/sense2vec/component.py b/sense2vec/component.py index ff7b683..6f74cdd 100644 --- a/sense2vec/component.py +++ b/sense2vec/component.py @@ -1,6 +1,7 @@ -from typing import Tuple, Union, List, Dict +from typing import Tuple, Union, List, Dict, Callable, Iterable, Optional from spacy.language import Language from spacy.tokens import Doc, Token, Span +from spacy.training import Example from spacy.vocab import Vocab from spacy.util import SimpleFrozenDict from pathlib import Path @@ -215,6 +216,24 @@ def s2v_other_senses(self, obj: Union[Token, Span]) -> List[str]: key = self.s2v_key(obj) return obj.doc._._s2v.get_other_senses(key) + def initialize( + self, + get_examples: Callable[[], Iterable[Example]], + *, + nlp: Optional[Language] = None, + data_path: Optional[str] = None + ): + """Initialize the component and load in data. Can be used to add the + component with vectors to a pipeline before training. + + get_examples (Callable[[], Iterable[Example]]): Function that + returns a representative sample of gold-standard Example objects. + nlp (Language): The current nlp object the component is part of. + data_path (Optional[str]): Optional path to sense2vec model. + """ + if data_path is not None: + self.from_disk(data_path) + def to_bytes(self) -> bytes: """Serialize the component to a bytestring. diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index e286878..9067ef6 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -1,7 +1,5 @@ -from typing import Tuple, List, Union, Sequence, Dict, Callable, Any, Iterable -from typing import Optional +from typing import Tuple, List, Union, Sequence, Dict, Callable, Any from pathlib import Path -from spacy.language import Language from spacy.vectors import Vectors from spacy.strings import StringStore from spacy.util import SimpleFrozenDict @@ -297,24 +295,6 @@ def to_bytes(self, exclude: Sequence[str] = tuple()) -> bytes: data["cache"] = self.cache return srsly.msgpack_dumps(data) - def initialize( - self, - get_examples: Callable[[], Iterable], - *, - nlp: Optional[Language] = None, - data_path: Optional[str] = None - ): - """Initialize the component and load in data. Can be used to add the - component with vectors to a pipeline before training. - - get_examples (Callable[[], Iterable[Example]]): Function that - returns a representative sample of gold-standard Example objects. - nlp (Language): The current nlp object the component is part of. - data_path (Optional[str]): Optional path to sense2vec model. 
- """ - if data_path is not None: - self.from_disk(data_path) - def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """Load a Sense2Vec object from a bytestring. diff --git a/sense2vec/tests/test_component.py b/sense2vec/tests/test_component.py index 4541097..9d62836 100644 --- a/sense2vec/tests/test_component.py +++ b/sense2vec/tests/test_component.py @@ -1,8 +1,10 @@ import pytest import numpy +import spacy from spacy.vocab import Vocab from spacy.tokens import Doc, Span from sense2vec import Sense2VecComponent +from pathlib import Path @pytest.fixture @@ -103,3 +105,33 @@ def test_component_to_from_bytes(doc): assert doc[0]._.in_s2v is False new_doc = new_s2v(doc) assert new_doc[0]._.in_s2v is True + + +def test_component_initialize(): + data_path = Path(__file__).parent / "data" + # With from_disk + nlp = spacy.blank("en") + s2v = nlp.add_pipe("sense2vec") + if Doc.has_extension("s2v_phrases"): + s2v.first_run = False # don't set up extensions again + s2v.from_disk(data_path) + doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"]) + s2v(doc) + assert doc[0]._.s2v_key == "beekeepers|NOUN" + most_similar = [item for item, score in doc[0]._.s2v_most_similar(2)] + assert most_similar[0] == ("honey bees", "NOUN") + assert most_similar[1] == ("Beekeepers", "NOUN") + + # With initialize + nlp = spacy.blank("en") + s2v = nlp.add_pipe("sense2vec") + s2v.first_run = False # don't set up extensions again + init_cfg = {"sense2vec": {"data_path": str(data_path)}} + nlp.config["initialize"]["components"] = init_cfg + nlp.initialize() + doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"]) + s2v(doc) + assert doc[0]._.s2v_key == "beekeepers|NOUN" + most_similar = [item for item, score in doc[0]._.s2v_most_similar(2)] + assert most_similar[0] == ("honey bees", "NOUN") + assert most_similar[1] == ("Beekeepers", "NOUN") diff --git a/setup.cfg b/setup.cfg index 90b7df7..d06338c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,7 @@ install_requires = [options.entry_points] spacy_factories = - sense2vec = sense2vec:make_sense2vec + sense2vec = sense2vec:component.make_sense2vec prodigy_recipes = sense2vec.teach = sense2vec:prodigy_recipes.teach sens2vec.to-patterns = sense2vec:prodigy_recipes.to_patterns From ba85ecfec329e06871162b571a4a6942dc902950 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 3 Mar 2021 10:08:52 +0100 Subject: [PATCH 287/297] fix senses type (#135) --- scripts/05_export.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/05_export.py b/scripts/05_export.py index 2302501..450a283 100644 --- a/scripts/05_export.py +++ b/scripts/05_export.py @@ -58,7 +58,7 @@ def main( discarded.update(get_minority_keys(vocab, min_freq_ratio)) discarded.update(get_redundant_keys(vocab, vectors, min_distance)) n_vectors = len(vectors) - len(discarded) - s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses) + s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=list(all_senses)) for key, vector in vectors.items(): if key not in discarded: s2v.add(key, vector) From 3c191aee178f4cbb0314b616622189c1d1c45876 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 18 Apr 2021 11:50:23 +1000 Subject: [PATCH 288/297] sense2vec.teach: only fail if no seeds are available --- sense2vec/prodigy_recipes.py | 10 +++++++++- setup.cfg | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/sense2vec/prodigy_recipes.py b/sense2vec/prodigy_recipes.py index 8be9759..64bee9d 100644 --- a/sense2vec/prodigy_recipes.py +++ 
b/sense2vec/prodigy_recipes.py @@ -60,7 +60,8 @@ def teach( for seed in seeds: key = s2v.get_best_sense(seed) if key is None: - msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1) + msg.warn(f"Can't find seed term '{seed}' in vectors") + continue accept_keys.append(key) best_word, best_sense = s2v.split_key(key) seen.add(best_word if case_sensitive else best_word.lower()) @@ -72,6 +73,13 @@ def teach( "answer": "accept", } seed_tasks.append(set_hashes(task)) + if len(accept_keys) == 0: + msg.fail( + "No seeds available. This typically happens if none of your seed " + "terms are found in the vectors. Try using more generic terms or " + "different vectors that cover the expressions you're looking for.", + exits=1, + ) print(f"Starting with seed keys: {accept_keys}") DB = connect() if dataset not in DB: diff --git a/setup.cfg b/setup.cfg index d06338c..580eb91 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 2.0.0 +version = 2.0.1 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 95d1790849d97fb6531c5c45271d0fa58b1d3f08 Mon Sep 17 00:00:00 2001 From: David Chanin Date: Mon, 16 Aug 2021 12:38:44 +0100 Subject: [PATCH 289/297] test for lowercase when ignoring case in get_best_sense and get_other_senses (#139) --- sense2vec/sense2vec.py | 4 ++-- sense2vec/tests/test_sense2vec.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index 9067ef6..bb157f5 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -247,7 +247,7 @@ def get_other_senses( result = [] key = key if isinstance(key, str) else self.strings[key] word, orig_sense = self.split_key(key) - versions = [word, word.upper(), word.title()] if ignore_case else [word] + versions = set([word, word.lower(), word.upper(), word.title()]) if ignore_case else [word] for text in versions: for sense in self.senses: new_key = self.make_key(text, sense) @@ -270,7 +270,7 @@ def get_best_sense( sense_options = senses or self.senses if not sense_options: return None - versions = [word, word.upper(), word.title()] if ignore_case else [word] + versions = set([word, word.lower(), word.upper(), word.title()]) if ignore_case else [word] freqs = [] for text in versions: for sense in sense_options: diff --git a/sense2vec/tests/test_sense2vec.py b/sense2vec/tests/test_sense2vec.py index fbf4657..298d32c 100644 --- a/sense2vec/tests/test_sense2vec.py +++ b/sense2vec/tests/test_sense2vec.py @@ -47,6 +47,8 @@ def test_sense2vec_other_senses(): assert sorted(others) == ["a|B", "a|C"] others = s2v.get_other_senses("b|C") assert others == ["b|A"] + others = s2v.get_other_senses("B|C") + assert others == ["b|A"] others = s2v.get_other_senses("c|A") assert others == [] @@ -57,6 +59,7 @@ def test_sense2vec_best_sense(): for key, freq in [("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1), ("B|C", 2)]: s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq) assert s2v.get_best_sense("a") == "a|A" + assert s2v.get_best_sense("A") == "a|A" assert s2v.get_best_sense("b") == "B|C" assert s2v.get_best_sense("b", ignore_case=False) == "b|A" assert s2v.get_best_sense("c") is None From d689bb65ce0f6c597c891cea3ba279ad1f92916f Mon Sep 17 00:00:00 2001 From: Mukesh Kr Mehta Date: Mon, 16 Aug 2021 17:14:51 +0530 Subject: [PATCH 290/297] added code to create new file, earlier, it used to overwrite the penultimate file. 
(#137) --- scripts/01_parse.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/01_parse.py b/scripts/01_parse.py index f0b3e2f..2733286 100644 --- a/scripts/01_parse.py +++ b/scripts/01_parse.py @@ -51,9 +51,9 @@ def main( f.write(doc_bin_bytes) msg.good(f"Saved parsed docs to file", output_file.resolve()) doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]) + batch_num += 1 + output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" with output_file.open("wb") as f: - batch_num += 1 - output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" doc_bin_bytes = doc_bin.to_bytes() f.write(doc_bin_bytes) msg.good( From 60ac0d0e4e89d57cb953c73377e4929dd3be3a00 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 7 Dec 2022 17:18:06 +0100 Subject: [PATCH 291/297] Update setup, CI, wasabi (#152) * Update setup, CI, wasabi * Temporarily test with latest wasabi * Extend pytest requirement * Revert "Temporarily test with latest wasabi" This reverts commit ad42fcd3b89e59db4508df99ba8cffa7e9a95377. --- azure-pipelines.yml | 21 +++++++++------------ requirements.txt | 4 ++-- setup.cfg | 5 ++++- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bf7b252..80bfac9 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -9,23 +9,20 @@ jobs: strategy: matrix: Python36Linux: - imageName: 'ubuntu-16.04' + imageName: 'ubuntu-20.04' python.version: '3.6' Python36Windows: - imageName: 'vs2017-win2016' - python.version: '3.6' - Python36Mac: - imageName: 'macos-10.14' + imageName: 'windows-2019' python.version: '3.6' Python38Linux: - imageName: 'ubuntu-16.04' - python.version: '3.8' - Python38Windows: - imageName: 'vs2017-win2016' - python.version: '3.8' - Python38Mac: - imageName: 'macos-10.14' + imageName: 'ubuntu-latest' python.version: '3.8' + Python39Windows: + imageName: 'windows-latest' + python.version: '3.9' + Python311Mac: + imageName: 'macos-latest' + python.version: '3.11' maxParallel: 4 pool: vmImage: $(imageName) diff --git a/requirements.txt b/requirements.txt index c20e484..093de33 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,10 @@ # Our packages spacy>=3.0.0,<4.0.0 -wasabi>=0.8.1,<1.1.0 +wasabi>=0.8.1,<1.2.0 srsly>=2.4.0,<3.0.0 catalogue>=2.0.1,<2.1.0 # Third-party dependencies numpy>=1.15.0 importlib_metadata>=0.20; python_version < "3.8" # Development requirements -pytest>=5.2.0,<6.0.0 +pytest>=5.2.0,!=7.1.0 diff --git a/setup.cfg b/setup.cfg index 580eb91..ea2bed5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,6 +20,9 @@ classifiers = Programming Language :: Python :: 3.6 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 Topic :: Scientific/Engineering [options] @@ -28,7 +31,7 @@ include_package_data = true python_requires = >=3.6 install_requires = spacy>=3.0.0,<4.0.0 - wasabi>=0.8.1,<1.1.0 + wasabi>=0.8.1,<1.2.0 srsly>=2.4.0,<3.0.0 catalogue>=2.0.1,<2.1.0 numpy>=1.15.0 From 82ef0301e8f2917192577f52a15643cad3fb4bc0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 8 Dec 2022 10:17:58 +0100 Subject: [PATCH 292/297] Add pyproject.toml for setuptools (#153) --- pyproject.toml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 pyproject.toml diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..40810cc --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,5 @@ 
+[build-system] +requires = [ + "setuptools", +] +build-backend = "setuptools.build_meta" From eb53bf467ee1a02b333ca6f43afeb542fa58a49f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 5 Apr 2023 13:50:10 +0200 Subject: [PATCH 293/297] CI: Add GHA tests (#156) --- .github/workflows/tests.yml | 82 +++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000..9ed9138 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,82 @@ +name: tests + +on: + push: + paths-ignore: + - "*.md" + pull_request: + types: [opened, synchronize, reopened, edited] + paths-ignore: + - "*.md" + +env: + MODULE_NAME: 'sense2vec' + RUN_MYPY: 'false' + +jobs: + tests: + name: Test + if: github.repository_owner == 'explosion' + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python_version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + include: + - os: windows-2019 + python_version: "3.6" + - os: ubuntu-20.04 + python_version: "3.6" + runs-on: ${{ matrix.os }} + + steps: + - name: Check out repo + uses: actions/checkout@v3 + + - name: Configure Python version + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python_version }} + architecture: x64 + + - name: Build sdist + run: | + python -m pip install -U build pip setuptools + python -m pip install -U -r requirements.txt + python -m build --sdist + + - name: Run mypy + shell: bash + if: ${{ env.RUN_MYPY == 'true' }} + run: | + python -m mypy $MODULE_NAME + + - name: Delete source directory + shell: bash + run: | + rm -rf $MODULE_NAME + + - name: Uninstall all packages + run: | + python -m pip freeze > installed.txt + python -m pip uninstall -y -r installed.txt + + - name: Install from sdist + shell: bash + run: | + SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) + pip install dist/$SDIST + + - name: Test import + shell: bash + run: | + python -c "import $MODULE_NAME" -Werror + + - name: Install test requirements + run: | + python -m pip install -U -r requirements.txt + + - name: Run tests + shell: bash + run: | + python -m pytest --pyargs $MODULE_NAME -Werror From ab97146ab74c51f95f238fe21d42a26772ffbc4d Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 17 Apr 2023 14:26:30 +0200 Subject: [PATCH 294/297] Pin vectors to the CPU after deserialization (#157) * Pin vectors to the CPU after deserialization * Restore CPU ops after regression test * Skip test if GPU support is not present * Use `use_ops` context manager in test * Typo --- sense2vec/sense2vec.py | 19 +++++++++++++++++-- sense2vec/tests/test_issue155.py | 13 +++++++++++++ 2 files changed, 30 insertions(+), 2 deletions(-) create mode 100644 sense2vec/tests/test_issue155.py diff --git a/sense2vec/sense2vec.py b/sense2vec/sense2vec.py index bb157f5..1e1cf8f 100644 --- a/sense2vec/sense2vec.py +++ b/sense2vec/sense2vec.py @@ -3,6 +3,7 @@ from spacy.vectors import Vectors from spacy.strings import StringStore from spacy.util import SimpleFrozenDict +from thinc.api import NumpyOps import numpy import srsly @@ -247,7 +248,11 @@ def get_other_senses( result = [] key = key if isinstance(key, str) else self.strings[key] word, orig_sense = self.split_key(key) - versions = set([word, word.lower(), word.upper(), word.title()]) if ignore_case else [word] + versions = ( + set([word, word.lower(), word.upper(), word.title()]) + if ignore_case + else [word] + ) for text in 
versions: for sense in self.senses: new_key = self.make_key(text, sense) @@ -270,7 +275,11 @@ def get_best_sense( sense_options = senses or self.senses if not sense_options: return None - versions = set([word, word.lower(), word.upper(), word.title()]) if ignore_case else [word] + versions = ( + set([word, word.lower(), word.upper(), word.title()]) + if ignore_case + else [word] + ) freqs = [] for text in versions: for sense in sense_options: @@ -304,6 +313,9 @@ def from_bytes(self, bytes_data: bytes, exclude: Sequence[str] = tuple()): """ data = srsly.msgpack_loads(bytes_data) self.vectors = Vectors().from_bytes(data["vectors"]) + # Pin vectors to the CPU so that we don't end up comparing + # numpy and cupy arrays. + self.vectors.to_ops(NumpyOps()) self.freqs = dict(data.get("freqs", [])) self.cfg.update(data.get("cfg", {})) if "strings" not in exclude and "strings" in data: @@ -340,6 +352,9 @@ def from_disk(self, path: Union[Path, str], exclude: Sequence[str] = tuple()): freqs_path = path / "freqs.json" cache_path = path / "cache" self.vectors = Vectors().from_disk(path) + # Pin vectors to the CPU so that we don't end up comparing + # numpy and cupy arrays. + self.vectors.to_ops(NumpyOps()) self.cfg.update(srsly.read_json(path / "cfg")) if freqs_path.exists(): self.freqs = dict(srsly.read_json(freqs_path)) diff --git a/sense2vec/tests/test_issue155.py b/sense2vec/tests/test_issue155.py new file mode 100644 index 0000000..546734d --- /dev/null +++ b/sense2vec/tests/test_issue155.py @@ -0,0 +1,13 @@ +from pathlib import Path +import pytest +from sense2vec.sense2vec import Sense2Vec +from thinc.api import use_ops +from thinc.util import has_cupy_gpu + + +@pytest.mark.skipif(not has_cupy_gpu, reason="requires Cupy/GPU") +def test_issue155(): + data_path = Path(__file__).parent / "data" + with use_ops("cupy"): + s2v = Sense2Vec().from_disk(data_path) + s2v.most_similar("beekeepers|NOUN") From 2fd2d4c45e42749176bc46e145f20331fb63824d Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Mon, 17 Apr 2023 14:54:39 +0200 Subject: [PATCH 295/297] Set version to 2.0.2 (#158) * Set version to `2.0.2` * Update license year --- LICENSE | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/LICENSE b/LICENSE index b8ba168..d78a24b 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (C) 2019 ExplosionAI GmbH +Copyright (C) 2019-2023 ExplosionAI GmbH Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/setup.cfg b/setup.cfg index ea2bed5..44a4c07 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 2.0.1 +version = 2.0.2 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion From 323b6a37ad4b644aced71fffdd8537d9d249d4a4 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 Apr 2023 16:53:46 +0200 Subject: [PATCH 296/297] CI: Switch from Azure to GHA (#159) --- README.md | 43 ++++++++++++++++-------------- azure-pipelines.yml | 64 --------------------------------------------- 2 files changed, 24 insertions(+), 83 deletions(-) delete mode 100644 azure-pipelines.yml diff --git a/README.md b/README.md index 56bb0ae..736d5a6 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ semantic similarities across all Reddit comments of 2015 and 2019, see the 🦆 **Version 2.0 (for spaCy v3) out now!** [Read the release notes 
here.](https://github.com/explosion/sense2vec/releases/) -[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/12/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=12) +[![tests](https://github.com/explosion/sense2vec/actions/workflows/tests.yml/badge.svg)](https://github.com/explosion/sense2vec/actions/workflows/tests.yml) [![Current Release Version](https://img.shields.io/github/v/release/explosion/sense2vec.svg?style=flat-square&logo=github)](https://github.com/explosion/sense2vec/releases) [![pypi Version](https://img.shields.io/pypi/v/sense2vec.svg?style=flat-square&logo=pypi&logoColor=white)](https://pypi.org/project/sense2vec/) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg?style=flat-square)](https://github.com/ambv/black) @@ -59,7 +59,11 @@ most_similar = s2v.most_similar(query, n=3) ### Usage as a spaCy pipeline component -> ⚠️ Note that this example describes usage with [spaCy v3](https://spacy.io/usage/v3). For usage with spaCy v2, download `sense2vec==1.0.3` and check out the [`v1.x`](https://github.com/explosion/sense2vec/tree/v1.x) branch of this repo. +> ⚠️ Note that this example describes usage with +> [spaCy v3](https://spacy.io/usage/v3). For usage with spaCy v2, download +> `sense2vec==1.0.3` and check out the +> [`v1.x`](https://github.com/explosion/sense2vec/tree/v1.x) branch of this +> repo. ```python import spacy @@ -188,12 +192,12 @@ The following attributes are available via the `._` property of `Token` and `Span` objects – for example `token._.in_s2v`: | Name | Attribute Type | Return Type | Description | -| ------------------ | -------------- | ------------------ | ---------------------------------------------------------------------------------- | +| ------------------ | -------------- | ------------------ | ---------------------------------------------------------------------------------- | --------------- | ------- | | `in_s2v` | property | bool | Whether a key exists in the vector map. | -| `s2v_key` | property | unicode | The sense2vec key of the given object, e.g. `"duck|NOUN"`. | +| `s2v_key` | property | unicode | The sense2vec key of the given object, e.g. `"duck | NOUN"`. | | `s2v_vec` | property | `ndarray[float32]` | The vector of the given key. | | `s2v_freq` | property | int | The frequency of the given key. | -| `s2v_other_senses` | property | list | Available other senses, e.g. `"duck|VERB"` for `"duck|NOUN"`. | +| `s2v_other_senses` | property | list | Available other senses, e.g. `"duck | VERB"`for`"duck | NOUN"`. | | `s2v_most_similar` | method | list | Get the `n` most similar terms. Returns a list of `((word, sense), score)` tuples. | | `s2v_similarity` | method | float | Get the similarity to another `Token` or `Span`. 
| @@ -207,7 +211,10 @@ The following attributes are available via the `._` property of `Token` and #### Adding sense2vec to a trained pipeline -If you're training and packaging a spaCy pipeline and want to include a sense2vec component in it, you can load in the data via the [`[initialize]` block](https://spacy.io/usage/training#config-lifecycle) of the training config: +If you're training and packaging a spaCy pipeline and want to include a +sense2vec component in it, you can load in the data via the +[`[initialize]` block](https://spacy.io/usage/training#config-lifecycle) of the +training config: ```ini [initialize.components] @@ -655,10 +662,10 @@ custom functions, swap them out and serialize the custom names when you save out the model. The following registry options are available: | Name | Description | -| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `registry.make_key` | Given a `word` and `sense`, return a string of the key, e.g. `"word|sense".` | +| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | +| `registry.make_key` | Given a `word` and `sense`, return a string of the key, e.g. `"word | sense".` | | `registry.split_key` | Given a string key, return a `(word, sense)` tuple. | -| `registry.make_spacy_key` | Given a spaCy object (`Token` or `Span`) and a boolean `prefer_ents` keyword argument (whether to prefer the entity label for single tokens), return a `(word, sense)` tuple. Used in extension attributes to generate a key for tokens and spans. | | +| `registry.make_spacy_key` | Given a spaCy object (`Token` or `Span`) and a boolean `prefer_ents` keyword argument (whether to prefer the entity label for single tokens), return a `(word, sense)` tuple. Used in extension attributes to generate a key for tokens and spans. | | | `registry.get_phrases` | Given a spaCy `Doc`, return a list of `Span` objects used for sense2vec phrases (typically noun phrases and named entities). | | `registry.merge_phrases` | Given a spaCy `Doc`, get all sense2vec phrases and merge them into single tokens.  | @@ -719,15 +726,13 @@ The training process is split up into several steps to allow you to resume at any given point. Processing scripts are designed to operate on single files, making it easy to parallellize the work. The scripts in this repo require either [Glove](https://github.com/stanfordnlp/GloVe) or -[fastText](https://github.com/facebookresearch/fastText) which you need to -clone and `make`. - -For Fasttext, the scripts will require the path to the created binary file. -If you're working on Windows, you can build with `cmake`, or alternatively -use the `.exe` file from this **unofficial** -repo with FastText binary builds for Windows: -https://github.com/xiamx/fastText/releases. +[fastText](https://github.com/facebookresearch/fastText) which you need to clone +and `make`. +For Fasttext, the scripts will require the path to the created binary file. 
If +you're working on Windows, you can build with `cmake`, or alternatively use the +`.exe` file from this **unofficial** repo with FastText binary builds for +Windows: https://github.com/xiamx/fastText/releases. | | Script | Description | | ------ | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -764,8 +769,8 @@ The following recipes are available – see below for more detailed docs. ### recipe `sense2vec.teach` Bootstrap a terminology list using sense2vec. Prodigy will suggest similar terms -based on the most similar phrases from sense2vec, and the suggestions will -be adjusted as you annotate and accept similar phrases. For each seed term, the +based on the most similar phrases from sense2vec, and the suggestions will be +adjusted as you annotate and accept similar phrases. For each seed term, the best matching sense according to the sense2vec vectors will be used. ```bash diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 80bfac9..0000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,64 +0,0 @@ -trigger: - batch: true - branches: - include: - - '*' - -jobs: -- job: 'Test' - strategy: - matrix: - Python36Linux: - imageName: 'ubuntu-20.04' - python.version: '3.6' - Python36Windows: - imageName: 'windows-2019' - python.version: '3.6' - Python38Linux: - imageName: 'ubuntu-latest' - python.version: '3.8' - Python39Windows: - imageName: 'windows-latest' - python.version: '3.9' - Python311Mac: - imageName: 'macos-latest' - python.version: '3.11' - maxParallel: 4 - pool: - vmImage: $(imageName) - - steps: - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python.version)' - architecture: 'x64' - - - script: | - python -m pip install -U setuptools - pip install -r requirements.txt - displayName: "Install dependencies" - - - script: | - python setup.py build_ext --inplace - python setup.py sdist --formats=gztar - displayName: "Compile and build sdist" - - - task: DeleteFiles@1 - inputs: - contents: "sense2vec" - displayName: "Delete source directory" - - - script: | - pip freeze > installed.txt - pip uninstall -y -r installed.txt - displayName: "Uninstall all packages" - - - bash: | - SDIST=$(python -c "import os;print(os.listdir('./dist')[-1])" 2>&1) - pip install dist/$SDIST - displayName: "Install from sdist" - - - script: | - pip install -r requirements.txt - python -m pytest --pyargs sense2vec - displayName: "Run tests" From 24715918c38ae4293e7bb9a5e7bb20091a4f4fdc Mon Sep 17 00:00:00 2001 From: Elias <38086802+HandcartCactus@users.noreply.github.com> Date: Wed, 23 Apr 2025 09:04:26 -0400 Subject: [PATCH 297/297] Add escaped pipes to fix tables in README.md (#166) --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 736d5a6..94c6a1e 100644 --- a/README.md +++ b/README.md @@ -192,12 +192,12 @@ The following attributes are available via the `._` property of `Token` and `Span` objects – for example `token._.in_s2v`: | Name | Attribute Type | Return Type | Description | -| ------------------ | -------------- | ------------------ | ---------------------------------------------------------------------------------- | --------------- | ------- | +| ------------------ | -------------- | ------------------ | 
---------------------------------------------------------------------------------- | | `in_s2v` | property | bool | Whether a key exists in the vector map. | -| `s2v_key` | property | unicode | The sense2vec key of the given object, e.g. `"duck | NOUN"`. | +| `s2v_key` | property | unicode | The sense2vec key of the given object, e.g. `"duck NOUN"`. | | `s2v_vec` | property | `ndarray[float32]` | The vector of the given key. | | `s2v_freq` | property | int | The frequency of the given key. | -| `s2v_other_senses` | property | list | Available other senses, e.g. `"duck | VERB"`for`"duck | NOUN"`. | +| `s2v_other_senses` | property | list | Available other senses, e.g. `"duck\|VERB"` for `"duck\|NOUN"`. | | `s2v_most_similar` | method | list | Get the `n` most similar terms. Returns a list of `((word, sense), score)` tuples. | | `s2v_similarity` | method | float | Get the similarity to another `Token` or `Span`. | @@ -662,10 +662,10 @@ custom functions, swap them out and serialize the custom names when you save out the model. The following registry options are available: | Name | Description | -| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------- | -| `registry.make_key` | Given a `word` and `sense`, return a string of the key, e.g. `"word | sense".` | +| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `registry.make_key` | Given a `word` and `sense`, return a string of the key, e.g. `"word\|sense".` | | `registry.split_key` | Given a string key, return a `(word, sense)` tuple. | -| `registry.make_spacy_key` | Given a spaCy object (`Token` or `Span`) and a boolean `prefer_ents` keyword argument (whether to prefer the entity label for single tokens), return a `(word, sense)` tuple. Used in extension attributes to generate a key for tokens and spans. | | +| `registry.make_spacy_key` | Given a spaCy object (`Token` or `Span`) and a boolean `prefer_ents` keyword argument (whether to prefer the entity label for single tokens), return a `(word, sense)` tuple. Used in extension attributes to generate a key for tokens and spans. | | `registry.get_phrases` | Given a spaCy `Doc`, return a list of `Span` objects used for sense2vec phrases (typically noun phrases and named entities). | | `registry.merge_phrases` | Given a spaCy `Doc`, get all sense2vec phrases and merge them into single tokens.  |
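
For readers following the `initialize` changes introduced in patches 285 and 286 above, the end-to-end flow can be exercised roughly as follows. This is a minimal sketch that mirrors `test_component_initialize` from patch 286; it assumes spaCy v3 and this package are installed, and the vector path is purely illustrative (substitute the directory of an unpacked sense2vec vector package such as `s2v_reddit_2015_md`).

```python
import spacy
from spacy.tokens import Doc

# Minimal sketch of config-driven loading, mirroring test_component_initialize.
# The data path below is illustrative; point it at an unpacked sense2vec
# vector package (e.g. s2v_reddit_2015_md).
nlp = spacy.blank("en")
s2v = nlp.add_pipe("sense2vec")

# Equivalent to the [initialize.components.sense2vec] block in a training
# config: the component loads its vectors when nlp.initialize() runs.
nlp.config["initialize"]["components"] = {
    "sense2vec": {"data_path": "/path/to/s2v_reddit_2015_md"}
}
nlp.initialize()

# A blank pipeline has no tagger, so set the part-of-speech tag by hand,
# exactly as the test does.
doc = Doc(nlp.vocab, words=["beekeepers"], pos=["NOUN"])
s2v(doc)
print(doc[0]._.s2v_key)              # "beekeepers|NOUN"
print(doc[0]._.s2v_most_similar(2))  # nearest neighbours, if the key is in the vectors
```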