add support for multiple models

MartinoMensio committed Apr 22, 2020
1 parent f072abf commit 846f2aa
Showing 13 changed files with 465 additions and 61 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
 use_model/**
+models/**
 use_package/**
+packages/**
 universal_sentence_encoder/models/**
 .DS_store

68 changes: 58 additions & 10 deletions README.md
@@ -5,27 +5,75 @@ Motivation to have different models:
https://blog.floydhub.com/when-the-best-nlp-model-is-not-the-best-choice/
The USE is trained on a variety of tasks that make it better suited to identifying sentence similarity. Source: Google AI blog, https://ai.googleblog.com/2018/05/advances-in-semantic-textual-similarity.html

## Install

You can install the following models with pip:

| model name | source | pip package |
|------------|--------|-------------|
| en_use_md | https://tfhub.dev/google/universal-sentence-encoder | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/en_use_md-0.2.0/en_use_md-0.2.0.tar.gz#en_use_md-0.2.0` |
| en_use_lg | https://tfhub.dev/google/universal-sentence-encoder-large | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/en_use_lg-0.2.0/en_use_lg-0.2.0.tar.gz#en_use_lg-0.2.0` |
| xx_use_md | https://tfhub.dev/google/universal-sentence-encoder-multilingual | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/xx_use_md-0.2.0/xx_use_md-0.2.0.tar.gz#xx_use_md-0.2.0` |
| xx_use_lg | https://tfhub.dev/google/universal-sentence-encoder-multilingual-large | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/xx_use_lg-0.2.0/xx_use_lg-0.2.0.tar.gz#xx_use_lg-0.2.0` |
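
After installing one of the packages, a quick smoke test can confirm the wrapper loads and produces embeddings (it works the same way for any of the four model names):

```python
import spacy

# load the installed wrapper model
nlp = spacy.load('en_use_md')
doc = nlp('Testing the Universal Sentence Encoder')
# all four models produce 512-dimensional embeddings
print(doc.vector.shape)  # (512,)
```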


## Build model
You can use the prebuilt models provided in the `packages` of this repo, or build the models on your own: select which model to build in the `build_use.sh` script, then run:

```bash
bash build_use.sh
```

The last step of the script installs the generated `tar.gz` from the package's `dist/` folder, so no separate install step is needed.

## Usage

You can use the models on their own:

```python
# the usual spaCy way of loading a model: this loads the wrapper
import spacy
nlp = spacy.load('en_use_md')
# get two documents
doc_1 = nlp('Hi there, how are you?')
doc_2 = nlp('Hello there, how are you doing today?')
# get the vector of the Doc, Span or Token
print(doc_1.vector.shape)
print(doc_1[3].vector.shape)
print(doc_1[2:4].vector.shape)
# or use the similarity method that is based on the vectors, on Doc, Span or Token
print(doc_1.similarity(doc_2[0:7]))
```

Or use them on top of an already available language pipeline (e.g. to keep your custom components, or to have better parsing than the base spaCy model used here):

```python
import spacy
# start from any existing pipeline
nlp = spacy.load('en_core_web_sm')
# import the wrapper package so that its pipeline factories are registered
import universal_sentence_encoder

# get the pipe component
overwrite_vectors = nlp.create_pipe('overwrite_vectors')
# add the pipeline stage to your nlp
nlp.add_pipe(overwrite_vectors)
# use the vectors with the default `en_use_md` model
doc = nlp('Hi')

# or use a different model
other_model_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

# either by setting the extension `tfhub_model_url` on the doc
doc._.tfhub_model_url = other_model_url

# or by adding a pipeline component that sets it on every document
def set_tfhub_model_url(doc):
    doc._.tfhub_model_url = other_model_url
    return doc

# add this component before `overwrite_vectors`, which reads that extension
nlp.add_pipe(set_tfhub_model_url, before='overwrite_vectors')

```
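
For context, `overwrite_vectors` conceptually replaces spaCy's built-in vectors with embeddings computed by the TFHub model named in the `tfhub_model_url` extension. The real implementation lives inside the `universal_sentence_encoder` package; the following is only a simplified sketch of the idea, assuming the TF2 `tensorflow_hub` API and ignoring caching and Span/Token vectors:

```python
import tensorflow_hub as hub

def overwrite_vectors_sketch(doc):
    # load the TFHub model pointed to by the doc extension (hub.load caches downloads)
    model = hub.load(doc._.tfhub_model_url)
    # embed the whole document text
    embedding = model([doc.text]).numpy()[0]
    # override doc.vector through spaCy's user hooks
    doc.user_hooks['vector'] = lambda d: embedding
    return doc
```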

## TODOs

Model config:
- the `enable_cache` flag is not working
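
A cache would presumably memoize embeddings per text so that repeated documents skip the TFHub call; here is a hypothetical sketch of that idea (not the package's actual implementation, and the model URL is just an example):

```python
from functools import lru_cache

import tensorflow_hub as hub

model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

@lru_cache(maxsize=10000)
def embed(text):
    # repeated texts hit the cache instead of re-running the TFHub model
    return model([text]).numpy()[0]
```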
20 changes: 14 additions & 6 deletions build_use.sh
@@ -1,16 +1,24 @@
 set -e
 
+# select here which one to build
+
+# MODEL_NAME='en_use_md'
+# MODEL_NAME='en_use_lg'
+# MODEL_NAME='xx_use_md'
+MODEL_NAME='xx_use_lg'
+
+mkdir -p models/$MODEL_NAME
 # create the nlp and save to disk
-python create.py
+python create.py $MODEL_NAME
 # overwrite meta.json
-cp meta/meta.json use_model/meta.json
+cp meta/$MODEL_NAME.json models/$MODEL_NAME/meta.json
 
 # create the package
-mkdir -p use_package
-python -m spacy package use_model use_package --force
-pushd use_package/en_use-0.1.3
+mkdir -p packages
+python -m spacy package models/$MODEL_NAME packages --force
+pushd packages/$MODEL_NAME-0.2.0
 # zip it
 python setup.py sdist
 # install the tar.gz from dist/en_use-0.1.1.tar.gz
-pip install dist/en_use-0.1.3.tar.gz
+pip install dist/$MODEL_NAME-0.2.0.tar.gz
 popd
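
After `create.py` has serialized the pipeline, the result can be sanity-checked before packaging by loading it straight from the output directory. A minimal sketch, assuming the `universal_sentence_encoder` package is importable so that the custom factories listed in `meta.json` can be resolved:

```python
import spacy
import universal_sentence_encoder  # assumed to register the custom pipeline factories

# load the pipeline that create.py saved to disk
nlp = spacy.load('models/en_use_md')
doc = nlp('A quick check of the built model')
print(nlp.pipe_names)
print(doc.vector.shape)
```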
21 changes: 15 additions & 6 deletions create.py
@@ -1,7 +1,16 @@
-from universal_sentence_encoder import language
+import typer
+from universal_sentence_encoder import language, util
 
-nlp = language.UniversalSentenceEncoder.create_nlp()
-print(nlp.pipe_names)
-doc = nlp('Hello my friend')
-print(doc.vector)
-nlp.to_disk('use_model')
+def main(model_name):
+    if model_name not in util.configs:
+        raise ValueError(f'Model "{model_name}" not available')
+    selected_config = util.configs[model_name]
+    nlp = language.UniversalSentenceEncoder.create_nlp(selected_config['spacy_base_model'], selected_config['tfhub_model_url'])
+    # nlp.vocab.reset_vectors(width=0)
+    print(nlp.pipe_names)
+    doc = nlp('Hello my friend')
+    print(doc.vector.shape)
+    nlp.to_disk(f'models/{model_name}')
+
+if __name__ == "__main__":
+    typer.run(main)
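
`create.py` looks each model up in `util.configs`, which maps a model name to the spaCy base model to wrap and the TFHub module to load. The real mapping lives in `universal_sentence_encoder/util.py`; based on the README table and the meta files, it plausibly looks like this (a hypothetical sketch; the actual URLs may pin specific module versions):

```python
# hypothetical reconstruction of util.configs
configs = {
    'en_use_md': {
        'spacy_base_model': 'en_core_web_sm',
        'tfhub_model_url': 'https://tfhub.dev/google/universal-sentence-encoder',
    },
    'en_use_lg': {
        'spacy_base_model': 'en_core_web_sm',
        'tfhub_model_url': 'https://tfhub.dev/google/universal-sentence-encoder-large',
    },
    # ... plus 'xx_use_md' and 'xx_use_lg', wrapping the multilingual models
    # listed in the README table
}
```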
164 changes: 164 additions & 0 deletions meta/en_use_lg.json
@@ -0,0 +1,164 @@
{
  "lang": "en",
  "name": "use_lg",
  "version": "0.2.0",
  "spacy_version": ">=2.2.3",
  "description": "TensorFlow Hub wrapper for Universal Sentence Encoder",
  "author": "Martino Mensio",
  "email": "martino.mensio@open.ac.uk",
  "url": "https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub",
  "license": "MIT",
  "requirements": [
    "universal_sentence_encoder @ git+ssh://git@github.com/MartinoMensio/spacy-universal-sentence-embedding-tfhub"
  ],
  "sources": [{
    "name": "Universal Sentence Encoder - Large",
    "url": "https://tfhub.dev/google/universal-sentence-encoder-large",
    "license": "Apache-2.0"
  }, {
    "name": "SpaCy English model",
    "url": "https://spacy.io/models/en#en_core_web_sm",
    "license": "MIT"
  }],
  "vectors": {
    "width": 512,
    "vectors": 0,
    "keys": 0,
    "name": null
  },
  "pipeline": [
    "tagger",
    "parser",
    "ner",
    "save_tfhub_model_url",
    "overwrite_vectors"
  ],
  "factories": {
    "tagger": "tagger",
    "parser": "parser",
    "ner": "ner",
    "save_tfhub_model_url": "save_tfhub_model_url",
    "overwrite_vectors": "overwrite_vectors"
  },
  "labels": {
    "tagger": [
      "$",
      "''",
      ",",
      "-LRB-",
      "-RRB-",
      ".",
      ":",
      "ADD",
      "AFX",
      "CC",
      "CD",
      "DT",
      "EX",
      "FW",
      "HYPH",
      "IN",
      "JJ",
      "JJR",
      "JJS",
      "LS",
      "MD",
      "NFP",
      "NN",
      "NNP",
      "NNPS",
      "NNS",
      "PDT",
      "POS",
      "PRP",
      "PRP$",
      "RB",
      "RBR",
      "RBS",
      "RP",
      "SYM",
      "TO",
      "UH",
      "VB",
      "VBD",
      "VBG",
      "VBN",
      "VBP",
      "VBZ",
      "WDT",
      "WP",
      "WP$",
      "WRB",
      "XX",
      "_SP",
      "``"
    ],
    "parser": [
      "ROOT",
      "acl",
      "acomp",
      "advcl",
      "advmod",
      "agent",
      "amod",
      "appos",
      "attr",
      "aux",
      "auxpass",
      "case",
      "cc",
      "ccomp",
      "compound",
      "conj",
      "csubj",
      "csubjpass",
      "dative",
      "dep",
      "det",
      "dobj",
      "expl",
      "intj",
      "mark",
      "meta",
      "neg",
      "nmod",
      "npadvmod",
      "nsubj",
      "nsubjpass",
      "nummod",
      "oprd",
      "parataxis",
      "pcomp",
      "pobj",
      "poss",
      "preconj",
      "predet",
      "prep",
      "prt",
      "punct",
      "quantmod",
      "relcl",
      "xcomp"
    ],
    "ner": [
      "CARDINAL",
      "DATE",
      "EVENT",
      "FAC",
      "GPE",
      "LANGUAGE",
      "LAW",
      "LOC",
      "MONEY",
      "NORP",
      "ORDINAL",
      "ORG",
      "PERCENT",
      "PERSON",
      "PRODUCT",
      "QUANTITY",
      "TIME",
      "WORK_OF_ART"
    ]
  }
}
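
Note the `vectors` block above: width 512 but zero stored vectors and keys. The package ships no static vector table; embeddings are computed on the fly by the TFHub model. A quick way to observe this, assuming `en_use_lg` is installed:

```python
import spacy

nlp = spacy.load('en_use_lg')
# no static vector table is shipped with the package...
print(len(nlp.vocab.vectors))  # 0
# ...but documents still get 512-dimensional embeddings from the TFHub model
doc = nlp('Vectors are computed on the fly')
print(doc.vector.shape)  # (512,)
```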
28 changes: 17 additions & 11 deletions meta/meta.json → meta/en_use_md.json
@@ -1,18 +1,27 @@
 {
   "lang": "en",
-  "name": "use",
-  "version": "0.1.3",
+  "name": "use_md",
+  "version": "0.2.0",
   "spacy_version": ">=2.2.3",
-  "description": "Using TFHub USE",
+  "description": "TensorFlow Hub wrapper for Universal Sentence Encoder",
   "author": "Martino Mensio",
   "email": "martino.mensio@open.ac.uk",
-  "url": "https://github.com/MartinoMensio/spacy-universal-sentence-embedding-tfhub",
-  "license": "",
+  "url": "https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub",
+  "license": "MIT",
   "requirements": [
     "universal_sentence_encoder @ git+ssh://git@github.com/MartinoMensio/spacy-universal-sentence-embedding-tfhub"
   ],
+  "sources": [{
+    "name": "Universal Sentence Encoder",
+    "url": "https://tfhub.dev/google/universal-sentence-encoder",
+    "license": "Apache-2.0"
+  }, {
+    "name": "SpaCy English model",
+    "url": "https://spacy.io/models/en#en_core_web_sm",
+    "license": "MIT"
+  }],
   "vectors": {
-    "width": 0,
+    "width": 512,
     "vectors": 0,
     "keys": 0,
     "name": null
@@ -21,17 +30,14 @@
"tagger",
"parser",
"ner",
"save_tfhub_model_url",
"overwrite_vectors"
],
"pipeline_args": {
"overwrite_vectors": {
"enable_cache": true
}
},
"factories": {
"tagger": "tagger",
"parser": "parser",
"ner": "ner",
"save_tfhub_model_url": "save_tfhub_model_url",
"overwrite_vectors": "overwrite_vectors"
},
"labels": {
Expand Down
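
Both meta files declare the custom factories `save_tfhub_model_url` and `overwrite_vectors`, which is what lets `spacy.load` rebuild the pipeline from a packaged model. In spaCy 2.x this kind of registration is typically done at import time via `Language.factories`; here is a minimal sketch of the pattern (the real registration lives inside the `universal_sentence_encoder` package):

```python
from spacy.language import Language

def overwrite_vectors(doc):
    # placeholder body; the real component swaps in TFHub embeddings
    return doc

# a factory receives the nlp object plus any config from meta.json and
# returns the pipeline component
Language.factories['overwrite_vectors'] = lambda nlp, **cfg: overwrite_vectors
```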