From 74092ada819db18e7d4d943b6dd1c05f4a420d18 Mon Sep 17 00:00:00 2001
From: andyjessen <andy.jessen@gmail.com>
Date: Thu, 30 Mar 2023 19:44:41 -0600
Subject: [PATCH 01/26] Update EntityLinker docstring

This commit updates the docstring to include current supported linkers.
---
 scispacy/linking.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scispacy/linking.py b/scispacy/linking.py
index 1e247bd..a1c11de 100644
--- a/scispacy/linking.py
+++ b/scispacy/linking.py
@@ -11,10 +11,12 @@ class EntityLinker:
     A spacy pipeline component which identifies entities in text which appear
     in a knowledge base.
 
-    Currently, there are two defaults: the Unified Medical Language System (UMLS) and
-    the Medical Subject Headings (MESH) dictionary.
+    Currently, there are five defaults: the Unified Medical Language System (UMLS),
+    the Medical Subject Headings (MeSH) dictionary, the RxNorm ontology, the Gene
+    Ontology, and the Human Phenotype Ontology.
 
-    To use these configured default KBs, pass the `name` parameter, either 'umls' or 'mesh'.
+    To use these configured default KBs, pass the `name` parameter ('umls','mesh',
+    'rxnorm','go','hpo').
 
     Currently this implementation just compares string similarity, returning
     entities above a given threshold.

From 9bf618a0caf2ccf018efac281346333d648e2759 Mon Sep 17 00:00:00 2001
From: Daniel King <43149077+dakinggg@users.noreply.github.com>
Date: Sat, 29 Apr 2023 14:40:19 -0700
Subject: [PATCH 02/26] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0066f0d..041dcd1 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@ In particular, there is a custom tokenizer that adds tokenization rules on top o
 rule-based tokenizer, a POS tagger and syntactic parser trained on biomedical data and
 an entity span detection model. Separately, there are also NER models for more specific tasks.
 
-**Just looking to test out the models on your data? Check out our [demo](https://scispacy.apps.allenai.org)**.
+**Just looking to test out the models on your data? Check out our [demo](https://scispacy.apps.allenai.org)** (Note: this demo is running an older version of scispaCy and may produce different results than the latest version).
 
 
 ## Installation

From d16887bb716483af74789ce89bd3761df5a0be5d Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 14:09:00 +0200
Subject: [PATCH 03/26] Add lang and non_suppressed options to UMLS reader

---
 scispacy/umls_utils.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py
index 07be1b9..c037288 100644
--- a/scispacy/umls_utils.py
+++ b/scispacy/umls_utils.py
@@ -38,7 +38,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
 
 
 def read_umls_concepts(
-    meta_path: str, concept_details: Dict, source: Optional[str] = None
+    meta_path: str, concept_details: Dict, source: str = None, lang: str = None, non_suppressed : bool= True
 ):
     """
     Read the concepts file MRCONSO.RRF from a UMLS release and store it in
@@ -58,6 +58,8 @@ def read_umls_concepts(
         concept_details: a dictionary to be filled with concept informations
         source: An optional source identifier, used as a filter to extract only a
                 specific source from UMLS.
+        lang: An optional language identifier, used to filter terms by language
+        non_suppressed: flag to indicate whether only non-suppressed concepts should be kept
     """
     concepts_filename = "MRCONSO.RRF"
     headers = read_umls_file_headers(meta_path, concepts_filename)
@@ -66,8 +68,8 @@ def read_umls_concepts(
             splits = line.strip().split("|")
             assert len(headers) == len(splits), (headers, splits)
             concept = dict(zip(headers, splits))
-            if concept["LAT"] != "ENG" or concept["SUPPRESS"] != "N":
-                continue  # Keep English non-suppressed concepts only
+            if (lang is not None and concept["LAT"] != lang) or (non_suppressed and concept["SUPPRESS"] != "N"):
+                continue  # Keep non-suppressed concepts in target language only
 
             if source is not None:
                 if concept["SAB"] != source:

From 8a72058ce12727eb5dfd1e3d4e3fda2ca2bfc4ac Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 14:12:29 +0200
Subject: [PATCH 04/26] Support multiple lang exporting UMLS .jsons

---
 scripts/export_umls_json.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py
index 5b1192e..19c558a 100644
--- a/scripts/export_umls_json.py
+++ b/scripts/export_umls_json.py
@@ -7,7 +7,7 @@
 import argparse
 from scispacy import umls_utils
 
-def main(meta_path: str, output_path: str, source: str = None):
+def main(meta_path: str, output_path: str, lang: str, source: str = None):
 
     concept_details = {}  # dictionary of concept_id -> {
                           #                 'concept_id': str,
@@ -18,7 +18,7 @@ def main(meta_path: str, output_path: str, source: str = None):
                           # }
 
     print('Reading concepts ... ')
-    umls_utils.read_umls_concepts(meta_path, concept_details, source)
+    umls_utils.read_umls_concepts(meta_path, concept_details, lang, source)
 
     print('Reading types ... ')
     umls_utils.read_umls_types(meta_path, concept_details)
@@ -95,6 +95,10 @@ def main(meta_path: str, output_path: str, source: str = None):
         '--output_path',
         help="Path to the output jsonl file"
     )
+    parser.add_argument(
+        '--lang',
+        help="Language subset of UMLS"
+    )
     parser.add_argument(
         '--source',
         type=str,
@@ -102,4 +106,4 @@ def main(meta_path: str, output_path: str, source: str = None):
         help="Whether to filter for a only a single UMLS source."
     )
     args = parser.parse_args()
-    main(args.meta_path, args.output_path, args.source)
+    main(args.meta_path, args.output_path, args.lang, args.source)

From 5f25b3d33a0b2e81a0ec50c6e5502267bf20fb6a Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 14:15:36 +0200
Subject: [PATCH 05/26] Add default to lang

To avoid breaking current implementations
---
 scripts/export_umls_json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py
index 19c558a..976c382 100644
--- a/scripts/export_umls_json.py
+++ b/scripts/export_umls_json.py
@@ -7,7 +7,7 @@
 import argparse
 from scispacy import umls_utils
 
-def main(meta_path: str, output_path: str, lang: str, source: str = None):
+def main(meta_path: str, output_path: str, lang: str = None, source: str = None):
 
     concept_details = {}  # dictionary of concept_id -> {
                           #                 'concept_id': str,

From 8c130ea37460ac9fcd960b85c50013752c32c2f8 Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 14:24:20 +0200
Subject: [PATCH 06/26] Correct source type to Optional[str]

---
 scispacy/umls_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py
index c037288..3f26c7b 100644
--- a/scispacy/umls_utils.py
+++ b/scispacy/umls_utils.py
@@ -38,7 +38,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
 
 
 def read_umls_concepts(
-    meta_path: str, concept_details: Dict, source: str = None, lang: str = None, non_suppressed : bool= True
+    meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = None, non_suppressed : bool= True
 ):
     """
     Read the concepts file MRCONSO.RRF from a UMLS release and store it in

From 1fd7b99cbe37073317ef16f053ef1360035b75fd Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 15:19:51 +0200
Subject: [PATCH 07/26] Set English as default language

to maintain default behavior
---
 scispacy/umls_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py
index 3f26c7b..f3f90c6 100644
--- a/scispacy/umls_utils.py
+++ b/scispacy/umls_utils.py
@@ -38,7 +38,7 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
 
 
 def read_umls_concepts(
-    meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = None, non_suppressed : bool= True
+    meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = "ENG", non_suppressed : bool= True
 ):
     """
     Read the concepts file MRCONSO.RRF from a UMLS release and store it in

From abdd0fe874ad6912f0c64740ad7fcfbdacaef3c0 Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 15:23:48 +0200
Subject: [PATCH 08/26] Include --non_supressed

And correct lang default to "ENG"
---
 scripts/export_umls_json.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py
index 976c382..e98ff18 100644
--- a/scripts/export_umls_json.py
+++ b/scripts/export_umls_json.py
@@ -93,11 +93,12 @@ def main(meta_path: str, output_path: str, lang: str = None, source: str = None)
     )
     parser.add_argument(
         '--output_path',
-        help="Path to the output jsonl file"
+        help="Path to the output jsonl file."
     )
     parser.add_argument(
         '--lang',
-        help="Language subset of UMLS"
+        default="ENG",
+        help="Language subset of UMLS."
     )
     parser.add_argument(
         '--source',
@@ -105,5 +106,10 @@ def main(meta_path: str, output_path: str, lang: str = None, source: str = None)
         default=None,
         help="Whether to filter for a only a single UMLS source."
     )
+    parser.add_argument(
+        '--non_supressed',
+        default=True,
+        help="Whether to include non supressed terms."
+    )
     args = parser.parse_args()
     main(args.meta_path, args.output_path, args.lang, args.source)

From ea1fcc11c847d31b60e924a9ea92371750e98480 Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Sun, 7 May 2023 16:50:45 +0200
Subject: [PATCH 09/26] Include `args.non_suppressed` in main call

and correct a typo
---
 scripts/export_umls_json.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/export_umls_json.py b/scripts/export_umls_json.py
index e98ff18..e3acd74 100644
--- a/scripts/export_umls_json.py
+++ b/scripts/export_umls_json.py
@@ -107,9 +107,9 @@ def main(meta_path: str, output_path: str, lang: str = None, source: str = None)
         help="Whether to filter for a only a single UMLS source."
     )
     parser.add_argument(
-        '--non_supressed',
+        '--non_suppressed',
         default=True,
         help="Whether to include non supressed terms."
     )
     args = parser.parse_args()
-    main(args.meta_path, args.output_path, args.lang, args.source)
+    main(args.meta_path, args.output_path, args.lang, args.source, args.non_suppressed)

From 93a77264a1ed6a554a185a50b4a5b6aca8a41d26 Mon Sep 17 00:00:00 2001
From: nachollorca <madwayesp@gmail.com>
Date: Mon, 8 May 2023 18:08:30 +0200
Subject: [PATCH 10/26] Correct linting errors in `read_umls_concepts()`

---
 scispacy/umls_utils.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py
index f3f90c6..943faa5 100644
--- a/scispacy/umls_utils.py
+++ b/scispacy/umls_utils.py
@@ -38,7 +38,11 @@ def read_umls_file_headers(meta_path: str, filename: str) -> List[str]:
 
 
 def read_umls_concepts(
-    meta_path: str, concept_details: Dict, source: Optional[str] = None, lang: str = "ENG", non_suppressed : bool= True
+    meta_path: str,
+    concept_details: Dict,
+    source: Optional[str] = None,
+    lang: str = "ENG",
+    non_suppressed: bool = True
 ):
     """
     Read the concepts file MRCONSO.RRF from a UMLS release and store it in

From a4a9141b38c60ddcb569a5fe3af2bbaec699dda9 Mon Sep 17 00:00:00 2001
From: illorca <ignacio.rodriguez@access2.hpc.dhclab.i.hpi.de>
Date: Thu, 11 May 2023 18:54:45 +0200
Subject: [PATCH 11/26] Line length 88

---
 scispacy/umls_utils.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scispacy/umls_utils.py b/scispacy/umls_utils.py
index 943faa5..1506bc9 100644
--- a/scispacy/umls_utils.py
+++ b/scispacy/umls_utils.py
@@ -42,7 +42,7 @@ def read_umls_concepts(
     concept_details: Dict,
     source: Optional[str] = None,
     lang: str = "ENG",
-    non_suppressed: bool = True
+    non_suppressed: bool = True,
 ):
     """
     Read the concepts file MRCONSO.RRF from a UMLS release and store it in
@@ -72,7 +72,9 @@ def read_umls_concepts(
             splits = line.strip().split("|")
             assert len(headers) == len(splits), (headers, splits)
             concept = dict(zip(headers, splits))
-            if (lang is not None and concept["LAT"] != lang) or (non_suppressed and concept["SUPPRESS"] != "N"):
+            if (lang is not None and concept["LAT"] != lang) or (
+                non_suppressed and concept["SUPPRESS"] != "N"
+            ):
                 continue  # Keep non-suppressed concepts in target language only
 
             if source is not None:

From 6aabe589d88e7bedcab2987e6d8461514c98d71c Mon Sep 17 00:00:00 2001
From: John Giorgi <johnmgiorgi@gmail.com>
Date: Sun, 25 Jun 2023 10:59:49 -0700
Subject: [PATCH 12/26] Add a note about make_serializable argument

---
 README.md | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/README.md b/README.md
index 041dcd1..c74af2c 100644
--- a/README.md
+++ b/README.md
@@ -123,6 +123,10 @@ for abrv in doc._.abbreviations:
 >>> SBMA 	   	 (6, 7)     Spinal and bulbar muscular atrophy
 >>> AR   		 (29, 30)   androgen receptor
 ```
+
+> **Note**
+> If you want to be able to [serialize your `doc` objects](https://spacy.io/usage/saving-loading), load the abbreviation detector with `make_serializable=True`, e.g. `nlp.add_pipe("abbreviation_detector", config={"make_serializable": True})`
+
 ### EntityLinker
 
 The `EntityLinker` is a SpaCy component which performs linking to a knowledge base. The linker simply performs

From a88bfa4068506a079dcb08cc72a3f127c1a2707c Mon Sep 17 00:00:00 2001
From: John Giorgi <johnmgiorgi@gmail.com>
Date: Thu, 27 Jul 2023 12:03:40 -0400
Subject: [PATCH 13/26] Drop umls and umls_ents attributes in linker

Drop the umls and umls_ents attributes in the linker that were there for backwards compatibility
---
 scispacy/linking.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/scispacy/linking.py b/scispacy/linking.py
index b2bb117..641f419 100644
--- a/scispacy/linking.py
+++ b/scispacy/linking.py
@@ -80,8 +80,6 @@ def __init__(
         max_entities_per_mention: int = 5,
         linker_name: Optional[str] = None,
     ):
-        # TODO(Mark): Remove in scispacy v1.0.
-        Span.set_extension("umls_ents", default=[], force=True)
         Span.set_extension("kb_ents", default=[], force=True)
 
         self.candidate_generator = candidate_generator or CandidateGenerator(
@@ -95,9 +93,6 @@ def __init__(
         self.filter_for_definitions = filter_for_definitions
         self.max_entities_per_mention = max_entities_per_mention
 
-        # TODO(Mark): Remove in scispacy v1.0. This is for backward compatability only.
-        self.umls = self.kb
-
     def __call__(self, doc: Doc) -> Doc:
         mention_strings = []
         if self.resolve_abbreviations and Doc.has_extension("abbreviations"):
@@ -131,7 +126,6 @@ def __call__(self, doc: Doc) -> Doc:
                 if score > self.threshold:
                     predicted.append((cand.concept_id, score))
             sorted_predicted = sorted(predicted, reverse=True, key=lambda x: x[1])
-            mention._.umls_ents = sorted_predicted[: self.max_entities_per_mention]
             mention._.kb_ents = sorted_predicted[: self.max_entities_per_mention]
 
         return doc

From e905961f94277336293f59917d85f032736dfc49 Mon Sep 17 00:00:00 2001
From: Kaushik Acharya <acharya.kaushik@gmail.com>
Date: Thu, 24 Aug 2023 12:05:17 +0530
Subject: [PATCH 14/26] Updating nmslib hyperparameters guide url

---
 scispacy/candidate_generation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py
index a408a6e..f11a721 100644
--- a/scispacy/candidate_generation.py
+++ b/scispacy/candidate_generation.py
@@ -383,7 +383,7 @@ def create_tfidf_ann_index(
     kb = kb or UmlsKnowledgeBase()
 
     # nmslib hyperparameters (very important)
-    # guide: https://github.com/nmslib/nmslib/blob/master/python_bindings/parameters.md
+    # guide: https://github.com/nmslib/nmslib/blob/master/manual/methods.md
     # Default values resulted in very low recall.
 
     # set to the maximum recommended value. Improves recall at the expense of longer indexing time.

From 9bdc0f848a2caa4e4e5dd4f598623f098f7bc18d Mon Sep 17 00:00:00 2001
From: Kaushik Acharya <acharya.kaushik@gmail.com>
Date: Thu, 24 Aug 2023 12:07:04 +0530
Subject: [PATCH 15/26] Updating UMLS concept alias path variable with
 appropriate name

---
 scispacy/candidate_generation.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scispacy/candidate_generation.py b/scispacy/candidate_generation.py
index f11a721..7e9ae89 100644
--- a/scispacy/candidate_generation.py
+++ b/scispacy/candidate_generation.py
@@ -378,7 +378,7 @@ def create_tfidf_ann_index(
     tfidf_vectorizer_path = f"{out_path}/tfidf_vectorizer.joblib"
     ann_index_path = f"{out_path}/nmslib_index.bin"
     tfidf_vectors_path = f"{out_path}/tfidf_vectors_sparse.npz"
-    uml_concept_aliases_path = f"{out_path}/concept_aliases.json"
+    umls_concept_aliases_path = f"{out_path}/concept_aliases.json"
 
     kb = kb or UmlsKnowledgeBase()
 
@@ -445,9 +445,9 @@ def create_tfidf_ann_index(
     assert len(concept_aliases) == numpy.size(concept_alias_tfidfs, 0)
 
     print(
-        f"Saving list of concept ids and tfidfs vectors to {uml_concept_aliases_path} and {tfidf_vectors_path}"
+        f"Saving list of concept ids and tfidfs vectors to {umls_concept_aliases_path} and {tfidf_vectors_path}"
     )
-    json.dump(concept_aliases, open(uml_concept_aliases_path, "w"))
+    json.dump(concept_aliases, open(umls_concept_aliases_path, "w"))
     scipy.sparse.save_npz(
         tfidf_vectors_path, concept_alias_tfidfs.astype(numpy.float16)
     )

From 36528a96ba5844a0024cfc4ed2ece61a4f52cb5b Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sat, 26 Aug 2023 00:03:43 -0700
Subject: [PATCH 16/26] update requirements

---
 requirements.in | 2 +-
 setup.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.in b/requirements.in
index dc2f1f2..cb24b3e 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,5 +1,5 @@
 numpy
-spacy>=3.4.0,<3.5.0
+spacy>=3.6.0,<3.7.0
 spacy-lookups-data
 pandas
 requests>=2.0.0,<3.0.0
diff --git a/setup.py b/setup.py
index 91373dc..da52d2c 100644
--- a/setup.py
+++ b/setup.py
@@ -41,7 +41,7 @@
     packages=find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
     license="Apache",
     install_requires=[
-        "spacy>=3.4.0,<3.5.0",
+        "spacy>=3.6.0,<3.7.0",
         "requests>=2.0.0,<3.0.0",
         "conllu",
         "numpy",

From 2474bd6b1d703e6f681af351e2638087a701de83 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sat, 26 Aug 2023 00:13:35 -0700
Subject: [PATCH 17/26] minor config changes based on latest spacy

---
 configs/base_ner.cfg           | 4 ++--
 configs/base_ner_scibert.cfg   | 4 ++--
 configs/base_parser_tagger.cfg | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/configs/base_ner.cfg b/configs/base_ner.cfg
index 00b0506..bdf0ff7 100644
--- a/configs/base_ner.cfg
+++ b/configs/base_ner.cfg
@@ -48,8 +48,8 @@ nO = null
 [components.ner.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 96
-attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"]
-rows = [5000, 2500, 2500, 2500, 100]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 1000, 2500, 2500]
 include_static_vectors = ${vars.include_static_vectors}
 
 [components.ner.model.tok2vec.encode]
diff --git a/configs/base_ner_scibert.cfg b/configs/base_ner_scibert.cfg
index c8b7371..7e3a839 100644
--- a/configs/base_ner_scibert.cfg
+++ b/configs/base_ner_scibert.cfg
@@ -45,8 +45,8 @@ nO = null
 [components.ner.model.tok2vec.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = 96
-attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"]
-rows = [5000, 2500, 2500, 2500, 100]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"]
+rows = [5000, 1000, 2500, 2500]
 include_static_vectors = false
 
 [components.ner.model.tok2vec.encode]
diff --git a/configs/base_parser_tagger.cfg b/configs/base_parser_tagger.cfg
index 738c85c..6801de5 100644
--- a/configs/base_parser_tagger.cfg
+++ b/configs/base_parser_tagger.cfg
@@ -73,8 +73,8 @@ factory = "tok2vec"
 [components.tok2vec.model.embed]
 @architectures = "spacy.MultiHashEmbed.v2"
 width = ${components.tok2vec.model.encode.width}
-attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY"]
-rows = [5000, 2500, 2500, 2500, 100]
+attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE", "SPACY", "IS_SPACE"]
+rows = [5000, 1000, 2500, 2500, 50, 50]
 include_static_vectors = ${vars.include_static_vectors}
 
 [components.tok2vec.model.encode]

From 8e72a496de0987ba1e17c903d2828ac4395481ba Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sat, 26 Aug 2023 00:55:39 -0700
Subject: [PATCH 18/26] pin scipy

---
 requirements.in | 1 +
 setup.py        | 1 +
 2 files changed, 2 insertions(+)

diff --git a/requirements.in b/requirements.in
index cb24b3e..df702bf 100644
--- a/requirements.in
+++ b/requirements.in
@@ -1,4 +1,5 @@
 numpy
+scipy<1.11
 spacy>=3.6.0,<3.7.0
 spacy-lookups-data
 pandas
diff --git a/setup.py b/setup.py
index da52d2c..c43dca7 100644
--- a/setup.py
+++ b/setup.py
@@ -42,6 +42,7 @@
     license="Apache",
     install_requires=[
         "spacy>=3.6.0,<3.7.0",
+        "scipy<1.11",
         "requests>=2.0.0,<3.0.0",
         "conllu",
         "numpy",

From 81df1dde2a653ff431cc04e2837ab3d1617f9e05 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sun, 27 Aug 2023 12:08:08 -0700
Subject: [PATCH 19/26] update release instructions

---
 RELEASE.md | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 65b2e95..d59211c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -15,16 +15,11 @@ Update the version in version.py.
 
 #### Training new models
 
-For the release, new models should be trained using the `scripts/pipeline.sh` and `scripts/ner_pipeline.sh` scripts, for the small, medium and large models, and specialized NER models. Remember to export the `ONTONOTES_PATH` and `ONTONOTES_PERCENT` environment variables to mix in the ontonotes training data.
+The entire pipeline can be run using `spacy project run all`. This will train and package all the models.
 
-```
-bash scripts/pipeline.sh small
-bash scripts/pipeline.sh medium
-bash scripts/pipeline.sh large
-bash scripts/ner_pipeline.sh <path to medium base model>
-```
+The packages should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g in the readme and in the docs) should be updated. You can find all these places using `git grep <previous version>`.
 
-these should then be uploaded to the `https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/{VERSION}` S3 bucket, and references to previous models (e.g in the readme and in the docs) should be updated. You can find all these places using `git grep <previous version>`.
+The scripts `install_local_packages.py`, `install_remote_packages.py`, `print_out_metrics.py`, `smoke_test.py`, and `uninstall_local_packages.py` are useful for testing at each step of the process. Before uploading, `install_local_packages.py` and `smoke_test.py` can be used to make sure the packages are installable and do a quick check of output. `print_out_metrics.py` can then be used to easily get the metrics that need to be updated in the README. Once the packages have been uploaded, `uninstall_local_packages.py`, `install_remote_packages.py`, and `smoke_test.py` can be used to ensure everything was uploaded correctly.
 
 #### Merge a PR with the above changes
 Merge a PR with the above changes, and publish a release with a tag corresponding to the commit from the merged PR. This should trigger the publish github action, which will create the `scispacy` package and publish it to pypi.

From cd7dfcde871f39df63fca768f0cf937ee22f4624 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sun, 27 Aug 2023 12:08:17 -0700
Subject: [PATCH 20/26] update version

---
 scispacy/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scispacy/version.py b/scispacy/version.py
index 7b6fdbf..cb96d45 100644
--- a/scispacy/version.py
+++ b/scispacy/version.py
@@ -1,6 +1,6 @@
 _MAJOR = "0"
 _MINOR = "5"
-_REVISION = "2"
+_REVISION = "3"
 
 VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR)
 VERSION = "{0}.{1}.{2}".format(_MAJOR, _MINOR, _REVISION)

From ccf9e9b511955915b9244a47e0ff9de0e3d363cd Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Mon, 4 Sep 2023 00:20:23 -0700
Subject: [PATCH 21/26] update version again

---
 project.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/project.yml b/project.yml
index 91eae4e..ce7190b 100644
--- a/project.yml
+++ b/project.yml
@@ -2,7 +2,7 @@ title: "scispaCy pipeline"
 description: "All the steps needed in the scispaCy pipeline"
 
 vars:
-  version_string: "0.5.2"
+  version_string: "0.5.3"
   gpu_id: 0
   freqs_loc_s3: "s3://ai2-s2-scispacy/data/gorc_subset.freqs"
   freqs_loc_local: "assets/gorc_subset.freqs"
@@ -166,9 +166,9 @@ commands:
       - "aws s3 cp ${vars.genia_loc_s3}/train.json ${vars.genia_train_loc_local} --no-sign-request"
       - "aws s3 cp ${vars.genia_loc_s3}/dev.json ${vars.genia_dev_loc_local} --no-sign-request"
       - "aws s3 cp ${vars.genia_loc_s3}/test.json ${vars.genia_test_loc_local} --no-sign-request"
-      - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz"
-      - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/"
-      - "rm ${vars.ontonotes_loc_local}.tar.gz"
+      # - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz"
+      # - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/"
+      # - "rm ${vars.ontonotes_loc_local}.tar.gz"
       - "aws s3 cp ${vars.med_mentions_loc_s3} assets/med_mentions.tar.gz --no-sign-request"
       - "tar -xzvf assets/med_mentions.tar.gz -C assets/"
       - "rm assets/med_mentions.tar.gz"

From adeaba4769b071f4e42dc1589ba0c9996adfdfc8 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Wed, 6 Sep 2023 23:23:10 -0700
Subject: [PATCH 22/26] update numbers

---
 docs/index.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index 65b94cd..7f72404 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -34,18 +34,18 @@ Our models achieve performance within 3% of published state of the art dependenc
 
 | model          | UAS | LAS   | POS   | Mentions (F1) | Web UAS | 
 |:---------------|:----|:------|:------|:---|:---|
-| en_core_sci_sm | 89.03| 87.00  |  98.13  |  67.87  |  87.42  |
-| en_core_sci_md | 89.73| 87.85 |  98.40 |  69.53 |  87.79  |
-| en_core_sci_lg | 89.75| 87.79  |  98.49  |  69.69  |  87.74  |
-| en_core_sci_scibert | 92.21| 90.65  |  98.86  |  68.01  |  92.58  |
+| en_core_sci_sm | 89.39| 87.41  |  98.32  |  68.00  |  87.65  |
+| en_core_sci_md | 90.23| 88.39 |  98.39 |  68.95 |  87.63  |
+| en_core_sci_lg | 89.98| 88.15  |  98.50  |  68.67  |  88.21  |
+| en_core_sci_scibert | 92.54| 91.02  |  98.89  |  67.90  |  92.85  |
 
 
 | model          | F1 |   Entity Types|
 |:---------------|:-----|:--------|
-| en_ner_craft_md | 76.75|GGP, SO, TAXON, CHEBI, GO, CL|
-| en_ner_jnlpba_md | 72.28| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN |
-| en_ner_bc5cdr_md | 84.53| DISEASE, CHEMICAL|
-| en_ner_bionlp13cg_md | 76.57| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE |
+| en_ner_craft_md | 77.56|GGP, SO, TAXON, CHEBI, GO, CL|
+| en_ner_jnlpba_md | 72.98| DNA, CELL_TYPE, CELL_LINE, RNA, PROTEIN |
+| en_ner_bc5cdr_md | 84.23| DISEASE, CHEMICAL|
+| en_ner_bionlp13cg_md | 77.36| AMINO_ACID, ANATOMICAL_SYSTEM, CANCER, CELL, CELLULAR_COMPONENT, DEVELOPING_ANATOMICAL_STRUCTURE, GENE_OR_GENE_PRODUCT, IMMATERIAL_ANATOMICAL_ENTITY, MULTI-TISSUE_STRUCTURE, ORGAN, ORGANISM, ORGANISM_SUBDIVISION, ORGANISM_SUBSTANCE, PATHOLOGICAL_FORMATION, SIMPLE_CHEMICAL, TISSUE |
 
 
 ### Example Usage

From 3a87799074e14f9a9839c899cbfdd9996d67e2c5 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Mon, 11 Sep 2023 00:16:01 -0700
Subject: [PATCH 23/26] undo comment

---
 project.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/project.yml b/project.yml
index ce7190b..4c75c93 100644
--- a/project.yml
+++ b/project.yml
@@ -166,9 +166,9 @@ commands:
       - "aws s3 cp ${vars.genia_loc_s3}/train.json ${vars.genia_train_loc_local} --no-sign-request"
       - "aws s3 cp ${vars.genia_loc_s3}/dev.json ${vars.genia_dev_loc_local} --no-sign-request"
       - "aws s3 cp ${vars.genia_loc_s3}/test.json ${vars.genia_test_loc_local} --no-sign-request"
-      # - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz"
-      # - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/"
-      # - "rm ${vars.ontonotes_loc_local}.tar.gz"
+      - "aws s3 cp ${vars.ontonotes_loc_s3} ${vars.ontonotes_loc_local}.tar.gz"
+      - "tar -xzvf ${vars.ontonotes_loc_local}.tar.gz -C assets/"
+      - "rm ${vars.ontonotes_loc_local}.tar.gz"
       - "aws s3 cp ${vars.med_mentions_loc_s3} assets/med_mentions.tar.gz --no-sign-request"
       - "tar -xzvf assets/med_mentions.tar.gz -C assets/"
       - "rm assets/med_mentions.tar.gz"

From 2c5c6a53a411df68d78138297e820003a1e9b0bf Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sat, 30 Sep 2023 12:40:17 -0700
Subject: [PATCH 24/26] update links

---
 docs/index.md                      | 16 ++++++++--------
 scripts/install_remote_packages.py |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/index.md b/docs/index.md
index 7f72404..5313b2f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,14 +17,14 @@ pip install <Model URL>
 
 | Model          | Description       | Install URL
 |:---------------|:------------------|:----------|
-| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz)|
-| en_core_sci_md |  A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz)|
-| en_core_sci_scibert |  A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz)|
-| en_core_sci_lg |  A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz)|
-| en_ner_craft_md|  A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz)|
-| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz)|
-| en_ner_bc5cdr_md |  A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz)|
-| en_ner_bionlp13cg_md |  A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz)|
+| en_core_sci_sm | A full spaCy pipeline for biomedical data. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz)|
+| en_core_sci_md |  A full spaCy pipeline for biomedical data with a larger vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz)|
+| en_core_sci_scibert |  A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz)|
+| en_core_sci_lg |  A full spaCy pipeline for biomedical data with a larger vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz)|
+| en_ner_craft_md|  A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_craft_md-0.5.3.tar.gz)|
+| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_jnlpba_md-0.5.3.tar.gz)|
+| en_ner_bc5cdr_md |  A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz)|
+| en_ner_bionlp13cg_md |  A spaCy NER model trained on the BIONLP13CG corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz)|
 
 
 
diff --git a/scripts/install_remote_packages.py b/scripts/install_remote_packages.py
index 60ff0f5..6232a5c 100644
--- a/scripts/install_remote_packages.py
+++ b/scripts/install_remote_packages.py
@@ -4,7 +4,7 @@
 
 
 def main():
-    s3_prefix = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/"
+    s3_prefix = "https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/"
     model_names = [
         "en_core_sci_sm",
         "en_core_sci_md",

From c6c35be951cd55dca660f60e78dbadefc2577d77 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sat, 30 Sep 2023 12:42:26 -0700
Subject: [PATCH 25/26] update readme links

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index c74af2c..bd8b1f7 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ pip install scispacy
 to install a model (see our full selection of available models below), run a command like the following:
 
 ```bash
-pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
+pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz
 ```
 
 Note: We strongly recommend that you use an isolated Python environment (such as virtualenv or conda) to install scispacy.
@@ -76,14 +76,14 @@ pip install CMD-V(to paste the copied URL)
 
 | Model          | Description       | Install URL
 |:---------------|:------------------|:----------|
-| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz)|
-| en_core_sci_md |  A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz)|
-| en_core_sci_lg |  A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz)|
-| en_core_sci_scibert |  A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_scibert-0.5.1.tar.gz)|
-| en_ner_craft_md|  A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_craft_md-0.5.1.tar.gz)|
-| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_jnlpba_md-0.5.1.tar.gz)|
-| en_ner_bc5cdr_md |  A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz)|
-| en_ner_bionlp13cg_md |  A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bionlp13cg_md-0.5.1.tar.gz)|
+| en_core_sci_sm | A full spaCy pipeline for biomedical data with a ~100k vocabulary. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz)|
+| en_core_sci_md |  A full spaCy pipeline for biomedical data with a ~360k vocabulary and 50k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_md-0.5.3.tar.gz)|
+| en_core_sci_lg |  A full spaCy pipeline for biomedical data with a ~785k vocabulary and 600k word vectors. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_lg-0.5.3.tar.gz)|
+| en_core_sci_scibert |  A full spaCy pipeline for biomedical data with a ~785k vocabulary and `allenai/scibert-base` as the transformer model. You may want to [use a GPU](https://spacy.io/usage#gpu) with this model. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_scibert-0.5.3.tar.gz)|
+| en_ner_craft_md|  A spaCy NER model trained on the CRAFT corpus.|[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_craft_md-0.5.3.tar.gz)|
+| en_ner_jnlpba_md | A spaCy NER model trained on the JNLPBA corpus.| [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_jnlpba_md-0.5.3.tar.gz)|
+| en_ner_bc5cdr_md |  A spaCy NER model trained on the BC5CDR corpus. | [Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bc5cdr_md-0.5.3.tar.gz)|
+| en_ner_bionlp13cg_md |  A spaCy NER model trained on the BIONLP13CG corpus. |[Download](https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_ner_bionlp13cg_md-0.5.3.tar.gz)|
 
 
 ## Additional Pipeline Components

From b4cef3d61a8ef0f181ca988e1977f5fb81940de7 Mon Sep 17 00:00:00 2001
From: Daniel King <daking@hey.com>
Date: Sat, 30 Sep 2023 12:43:22 -0700
Subject: [PATCH 26/26] update version in dockerfile

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 8417070..42d8f6e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -18,7 +18,7 @@ WORKDIR /work
 COPY requirements.in .
 
 RUN pip install -r requirements.in
-RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
+RUN pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.3/en_core_sci_sm-0.5.3.tar.gz
 RUN python -m spacy download en_core_web_sm
 RUN python -m spacy download en_core_web_md