add support for multiple models

MartinoMensio committed Apr 22, 2020
1 parent f072abf commit 846f2aa
Showing 13 changed files with 465 additions and 61 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,5 +1,7 @@
 use_model/**
+models/**
 use_package/**
+packages/**
 universal_sentence_encoder/models/**
 .DS_store

68 changes: 58 additions & 10 deletions README.md
@@ -5,27 +5,75 @@ Motivation to have different models:
https://blog.floydhub.com/when-the-best-nlp-model-is-not-the-best-choice/
The USE is trained on a variety of tasks that make it better suited to identifying sentence similarity. Source: Google AI blog, https://ai.googleblog.com/2018/05/advances-in-semantic-textual-similarity.html

## Install

You can install the following models with pip:

| model name | source | pip package |
|------------|--------|-------------|
| en_use_md | https://tfhub.dev/google/universal-sentence-encoder | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/en_use_md-0.2.0/en_use_md-0.2.0.tar.gz#en_use_md-0.2.0` |
| en_use_lg | https://tfhub.dev/google/universal-sentence-encoder-large | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/en_use_lg-0.2.0/en_use_lg-0.2.0.tar.gz#en_use_lg-0.2.0` |
| xx_use_md | https://tfhub.dev/google/universal-sentence-encoder-multilingual | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/xx_use_md-0.2.0/xx_use_md-0.2.0.tar.gz#xx_use_md-0.2.0` |
| xx_use_lg | https://tfhub.dev/google/universal-sentence-encoder-multilingual-large | `pip install https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub/releases/download/xx_use_lg-0.2.0/xx_use_lg-0.2.0.tar.gz#xx_use_lg-0.2.0` |
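
After installing one of the packages, a quick smoke test can confirm the wrapper loads and produces embeddings (it works the same way for any of the four model names):

```python
import spacy

# load the installed wrapper model
nlp = spacy.load('en_use_md')
doc = nlp('Testing the Universal Sentence Encoder')
# all four models produce 512-dimensional embeddings
print(doc.vector.shape)  # (512,)
```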


## Build model
You can use the prebuilt models provided in the `packages` of this repo, or build the models on your own: select which model to build in the `build_use.sh` script, then run:

```bash
bash build_use.sh
```

The last step of the script installs the generated `tar.gz` from the package's `dist/` folder, so no separate install step is needed.

## Usage

You can use the models on their own:

```python
# the usual spaCy way of loading a model: this loads the wrapper
import spacy
nlp = spacy.load('en_use_md')
# get two documents
doc_1 = nlp('Hi there, how are you?')
doc_2 = nlp('Hello there, how are you doing today?')
# get the vector of the Doc, Span or Token
print(doc_1.vector.shape)
print(doc_1[3].vector.shape)
print(doc_1[2:4].vector.shape)
# or use the similarity method that is based on the vectors, on Doc, Span or Token
print(doc_1.similarity(doc_2[0:7]))
```

Or use them on top of an already available language pipeline (e.g. to keep your custom components, or to have better parsing than the base spaCy model used here):

```python
import spacy
# start from any existing pipeline
nlp = spacy.load('en_core_web_sm')
# import the wrapper package so that its pipeline factories are registered
import universal_sentence_encoder

# get the pipe component
overwrite_vectors = nlp.create_pipe('overwrite_vectors')
# add the pipeline stage to your nlp
nlp.add_pipe(overwrite_vectors)
# use the vectors with the default `en_use_md` model
doc = nlp('Hi')

# or use a different model
other_model_url = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual/3'

# either by setting the extension `tfhub_model_url` on the doc
doc._.tfhub_model_url = other_model_url

# or by adding a pipeline component that sets it on every document
def set_tfhub_model_url(doc):
    doc._.tfhub_model_url = other_model_url
    return doc

# add this component before `overwrite_vectors`, which reads that extension
nlp.add_pipe(set_tfhub_model_url, before='overwrite_vectors')

```
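
For context, `overwrite_vectors` conceptually replaces spaCy's built-in vectors with embeddings computed by the TFHub model named in the `tfhub_model_url` extension. The real implementation lives inside the `universal_sentence_encoder` package; the following is only a simplified sketch of the idea, assuming the TF2 `tensorflow_hub` API and ignoring caching and Span/Token vectors:

```python
import tensorflow_hub as hub

def overwrite_vectors_sketch(doc):
    # load the TFHub model pointed to by the doc extension (hub.load caches downloads)
    model = hub.load(doc._.tfhub_model_url)
    # embed the whole document text
    embedding = model([doc.text]).numpy()[0]
    # override doc.vector through spaCy's user hooks
    doc.user_hooks['vector'] = lambda d: embedding
    return doc
```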

## TODOs

Model config:
- the `enable_cache` flag is not working
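
A cache would presumably memoize embeddings per text so that repeated documents skip the TFHub call; here is a hypothetical sketch of that idea (not the package's actual implementation, and the model URL is just an example):

```python
from functools import lru_cache

import tensorflow_hub as hub

model = hub.load('https://tfhub.dev/google/universal-sentence-encoder/4')

@lru_cache(maxsize=10000)
def embed(text):
    # repeated texts hit the cache instead of re-running the TFHub model
    return model([text]).numpy()[0]
```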
20 changes: 14 additions & 6 deletions build_use.sh
@@ -1,16 +1,24 @@
 set -e
 
+# select here which one to build
+
+# MODEL_NAME='en_use_md'
+# MODEL_NAME='en_use_lg'
+# MODEL_NAME='xx_use_md'
+MODEL_NAME='xx_use_lg'
+
+mkdir -p models/$MODEL_NAME
 # create the nlp and save to disk
-python create.py
+python create.py $MODEL_NAME
 # overwrite meta.json
-cp meta/meta.json use_model/meta.json
+cp meta/$MODEL_NAME.json models/$MODEL_NAME/meta.json
 
 # create the package
-mkdir -p use_package
-python -m spacy package use_model use_package --force
-pushd use_package/en_use-0.1.3
+mkdir -p packages
+python -m spacy package models/$MODEL_NAME packages --force
+pushd packages/$MODEL_NAME-0.2.0
 # zip it
 python setup.py sdist
 # install the tar.gz from dist/en_use-0.1.1.tar.gz
-pip install dist/en_use-0.1.3.tar.gz
+pip install dist/$MODEL_NAME-0.2.0.tar.gz
 popd
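
After `create.py` has serialized the pipeline, the result can be sanity-checked before packaging by loading it straight from the output directory. A minimal sketch, assuming the `universal_sentence_encoder` package is importable so that the custom factories listed in `meta.json` can be resolved:

```python
import spacy
import universal_sentence_encoder  # assumed to register the custom pipeline factories

# load the pipeline that create.py saved to disk
nlp = spacy.load('models/en_use_md')
doc = nlp('A quick check of the built model')
print(nlp.pipe_names)
print(doc.vector.shape)
```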
21 changes: 15 additions & 6 deletions create.py
@@ -1,7 +1,16 @@
-from universal_sentence_encoder import language
+import typer
+from universal_sentence_encoder import language, util
 
-nlp = language.UniversalSentenceEncoder.create_nlp()
-print(nlp.pipe_names)
-doc = nlp('Hello my friend')
-print(doc.vector)
-nlp.to_disk('use_model')
+def main(model_name):
+    if model_name not in util.configs:
+        raise ValueError(f'Model "{model_name}" not available')
+    selected_config = util.configs[model_name]
+    nlp = language.UniversalSentenceEncoder.create_nlp(selected_config['spacy_base_model'], selected_config['tfhub_model_url'])
+    # nlp.vocab.reset_vectors(width=0)
+    print(nlp.pipe_names)
+    doc = nlp('Hello my friend')
+    print(doc.vector.shape)
+    nlp.to_disk(f'models/{model_name}')
+
+if __name__ == "__main__":
+    typer.run(main)
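
`create.py` looks each model up in `util.configs`, which maps a model name to the spaCy base model to wrap and the TFHub module to load. The real mapping lives in `universal_sentence_encoder/util.py`; based on the README table and the meta files, it plausibly looks like this (a hypothetical sketch; the actual URLs may pin specific module versions):

```python
# hypothetical reconstruction of util.configs
configs = {
    'en_use_md': {
        'spacy_base_model': 'en_core_web_sm',
        'tfhub_model_url': 'https://tfhub.dev/google/universal-sentence-encoder',
    },
    'en_use_lg': {
        'spacy_base_model': 'en_core_web_sm',
        'tfhub_model_url': 'https://tfhub.dev/google/universal-sentence-encoder-large',
    },
    # ... plus 'xx_use_md' and 'xx_use_lg', wrapping the multilingual models
    # listed in the README table
}
```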
164 changes: 164 additions & 0 deletions meta/en_use_lg.json
@@ -0,0 +1,164 @@
{
  "lang": "en",
  "name": "use_lg",
  "version": "0.2.0",
  "spacy_version": ">=2.2.3",
  "description": "TensorFlow Hub wrapper for Universal Sentence Encoder",
  "author": "Martino Mensio",
  "email": "martino.mensio@open.ac.uk",
  "url": "https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub",
  "license": "MIT",
  "requirements": [
    "universal_sentence_encoder @ git+ssh://git@github.com/MartinoMensio/spacy-universal-sentence-embedding-tfhub"
  ],
  "sources": [{
    "name": "Universal Sentence Encoder - Large",
    "url": "https://tfhub.dev/google/universal-sentence-encoder-large",
    "license": "Apache-2.0"
  }, {
    "name": "SpaCy English model",
    "url": "https://spacy.io/models/en#en_core_web_sm",
    "license": "MIT"
  }],
  "vectors": {
    "width": 512,
    "vectors": 0,
    "keys": 0,
    "name": null
  },
  "pipeline": [
    "tagger",
    "parser",
    "ner",
    "save_tfhub_model_url",
    "overwrite_vectors"
  ],
  "factories": {
    "tagger": "tagger",
    "parser": "parser",
    "ner": "ner",
    "save_tfhub_model_url": "save_tfhub_model_url",
    "overwrite_vectors": "overwrite_vectors"
  },
  "labels": {
    "tagger": [
      "$",
      "''",
      ",",
      "-LRB-",
      "-RRB-",
      ".",
      ":",
      "ADD",
      "AFX",
      "CC",
      "CD",
      "DT",
      "EX",
      "FW",
      "HYPH",
      "IN",
      "JJ",
      "JJR",
      "JJS",
      "LS",
      "MD",
      "NFP",
      "NN",
      "NNP",
      "NNPS",
      "NNS",
      "PDT",
      "POS",
      "PRP",
      "PRP$",
      "RB",
      "RBR",
      "RBS",
      "RP",
      "SYM",
      "TO",
      "UH",
      "VB",
      "VBD",
      "VBG",
      "VBN",
      "VBP",
      "VBZ",
      "WDT",
      "WP",
      "WP$",
      "WRB",
      "XX",
      "_SP",
      "``"
    ],
    "parser": [
      "ROOT",
      "acl",
      "acomp",
      "advcl",
      "advmod",
      "agent",
      "amod",
      "appos",
      "attr",
      "aux",
      "auxpass",
      "case",
      "cc",
      "ccomp",
      "compound",
      "conj",
      "csubj",
      "csubjpass",
      "dative",
      "dep",
      "det",
      "dobj",
      "expl",
      "intj",
      "mark",
      "meta",
      "neg",
      "nmod",
      "npadvmod",
      "nsubj",
      "nsubjpass",
      "nummod",
      "oprd",
      "parataxis",
      "pcomp",
      "pobj",
      "poss",
      "preconj",
      "predet",
      "prep",
      "prt",
      "punct",
      "quantmod",
      "relcl",
      "xcomp"
    ],
    "ner": [
      "CARDINAL",
      "DATE",
      "EVENT",
      "FAC",
      "GPE",
      "LANGUAGE",
      "LAW",
      "LOC",
      "MONEY",
      "NORP",
      "ORDINAL",
      "ORG",
      "PERCENT",
      "PERSON",
      "PRODUCT",
      "QUANTITY",
      "TIME",
      "WORK_OF_ART"
    ]
  }
}
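
Note the `vectors` block above: width 512 but zero stored vectors and keys. The package ships no static vector table; embeddings are computed on the fly by the TFHub model. A quick way to observe this, assuming `en_use_lg` is installed:

```python
import spacy

nlp = spacy.load('en_use_lg')
# no static vector table is shipped with the package...
print(len(nlp.vocab.vectors))  # 0
# ...but documents still get 512-dimensional embeddings from the TFHub model
doc = nlp('Vectors are computed on the fly')
print(doc.vector.shape)  # (512,)
```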
28 changes: 17 additions & 11 deletions meta/meta.json → meta/en_use_md.json
@@ -1,18 +1,27 @@
 {
   "lang": "en",
-  "name": "use",
-  "version": "0.1.3",
+  "name": "use_md",
+  "version": "0.2.0",
   "spacy_version": ">=2.2.3",
-  "description": "Using TFHub USE",
+  "description": "TensorFlow Hub wrapper for Universal Sentence Encoder",
   "author": "Martino Mensio",
   "email": "martino.mensio@open.ac.uk",
-  "url": "https://github.com/MartinoMensio/spacy-universal-sentence-embedding-tfhub",
-  "license": "",
+  "url": "https://github.com/MartinoMensio/spacy-universal-sentence-encoder-tfhub",
+  "license": "MIT",
   "requirements": [
     "universal_sentence_encoder @ git+ssh://git@github.com/MartinoMensio/spacy-universal-sentence-embedding-tfhub"
   ],
+  "sources": [{
+    "name": "Universal Sentence Encoder",
+    "url": "https://tfhub.dev/google/universal-sentence-encoder",
+    "license": "Apache-2.0"
+  }, {
+    "name": "SpaCy English model",
+    "url": "https://spacy.io/models/en#en_core_web_sm",
+    "license": "MIT"
+  }],
   "vectors": {
-    "width": 0,
+    "width": 512,
     "vectors": 0,
     "keys": 0,
     "name": null
@@ -21,17 +30,14 @@
"tagger",
"parser",
"ner",
"save_tfhub_model_url",
"overwrite_vectors"
],
"pipeline_args": {
"overwrite_vectors": {
"enable_cache": true
}
},
"factories": {
"tagger": "tagger",
"parser": "parser",
"ner": "ner",
"save_tfhub_model_url": "save_tfhub_model_url",
"overwrite_vectors": "overwrite_vectors"
},
"labels": {
Expand Down
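
Both meta files declare the custom factories `save_tfhub_model_url` and `overwrite_vectors`, which is what lets `spacy.load` rebuild the pipeline from a packaged model. In spaCy 2.x this kind of registration is typically done at import time via `Language.factories`; here is a minimal sketch of the pattern (the real registration lives inside the `universal_sentence_encoder` package):

```python
from spacy.language import Language

def overwrite_vectors(doc):
    # placeholder body; the real component swaps in TFHub embeddings
    return doc

# a factory receives the nlp object plus any config from meta.json and
# returns the pipeline component
Language.factories['overwrite_vectors'] = lambda nlp, **cfg: overwrite_vectors
```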