diff --git a/scripts/01_parse.py b/scripts/01_parse.py index 1f7e5bd..700abd0 100644 --- a/scripts/01_parse.py +++ b/scripts/01_parse.py @@ -12,9 +12,9 @@ out_dir=("Path to output directory", "positional", None, str), spacy_model=("Name of spaCy model to use", "positional", None, str), n_process=("Number of processes (multiprocessing)", "option", "n", int), - max_docs=("Maximum docs per batch", "option", "m", int), + max_docs=("Maximum docs per batch", "option", "m", int), ) -def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**6): +def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10 ** 6): """ Step 1: Parse raw text with spaCy @@ -50,12 +50,13 @@ def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=1 f.write(doc_bin_bytes) msg.good(f"Saved parsed docs to file", output_file.resolve()) doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]) + batch_num += 1 + output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" + doc_bin_bytes = doc_bin.to_bytes() with output_file.open("wb") as f: - batch_num += 1 - output_file = output_path / f"{input_path.stem}-{batch_num}.spacy" - doc_bin_bytes = doc_bin.to_bytes() f.write(doc_bin_bytes) - msg.good(f"Complete. Saved final parsed docs to file", output_file.resolve()) + msg.good(f"Complete. Saved final parsed docs to file", output_file.resolve()) + if __name__ == "__main__": plac.call(main) diff --git a/setup.cfg b/setup.cfg index ae54e1a..b7e6296 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [metadata] -version = 1.0.2 +version = 1.0.3 description = Contextually-keyed word vectors url = https://github.com/explosion/sense2vec author = Explosion