Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-563: prepare release 0.4.3 #1037

Merged
merged 26 commits into from
Aug 26, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
378853d
add embeddings by @stefan-it and documentation
Aug 23, 2019
1c6d940
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
15725c5
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
f5d1770
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
bb86795
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
14a4e4e
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
5cdb765
Update TUTORIAL_3_WORD_EMBEDDING.md
Aug 23, 2019
feaf083
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
127526f
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
a3320c3
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
65ba382
Update TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
Aug 23, 2019
e40a7ea
Update TUTORIAL_6_CORPUS.md
Aug 23, 2019
b177a69
Update TUTORIAL_6_CORPUS.md
Aug 23, 2019
7d9b0e2
Update EXPERIMENTS.md
Aug 23, 2019
604a85a
Update EXPERIMENTS.md
Aug 23, 2019
ab7399b
Update EXPERIMENTS.md
Aug 23, 2019
3f4c55c
Update EXPERIMENTS.md
Aug 23, 2019
8804529
Documentation of data loaders
Aug 23, 2019
a754922
Merge branch 'GH-563-prepare-release-0-4-3' of github.com:zalandorese…
Aug 23, 2019
81be6d8
embeddings: add support for large ELMo model (trained on 5.5B tokens)
stefan-it Aug 25, 2019
43c7b97
Merge pull request #1032 from zalandoresearch/more-elmo-models
yosipk Aug 26, 2019
a3ccf17
Added comments to datasets
Aug 26, 2019
fb5f6ed
documentation
Aug 26, 2019
cd5d58b
Update BYTE_PAIR_EMBEDDINGS.md
Aug 26, 2019
7f7823f
comments
Aug 26, 2019
6d56f9b
Merge branch 'GH-563-prepare-release-0-4-3' of github.com:zalandorese…
Aug 26, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
476 changes: 418 additions & 58 deletions flair/datasets.py

Large diffs are not rendered by default.

13 changes: 10 additions & 3 deletions flair/embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,6 @@
from .nn import LockedDropout, WordDropout
from .data import Dictionary, Token, Sentence
from .file_utils import cached_path, open_inside_zip
from .training_utils import log_line

log = logging.getLogger("flair")

Expand Down Expand Up @@ -730,6 +729,9 @@ def __init__(
if model == "medium":
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
if model in ["large", "5.5B"]:
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
if model == "pt" or model == "portuguese":
options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_options.json"
weight_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/contributed/pt/elmo_pt_weights.hdf5"
Expand Down Expand Up @@ -1653,8 +1655,10 @@ def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
"es-forward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_forward/lm-es-forward-fast.pt",
"es-backward-fast": f"{aws_path}/embeddings-v0.4/language_model_es_backward/lm-es-backward-fast.pt",
# Basque
"eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
"eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
"eu-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.2.pt",
"eu-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.2.pt",
"eu-v1-forward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-forward-v0.1.pt",
"eu-v1-backward": f"{aws_path}/embeddings-stefan-it/lm-eu-opus-large-backward-v0.1.pt",
"eu-v0-forward": f"{aws_path}/embeddings-v0.4/lm-eu-large-forward-v0.1.pt",
"eu-v0-backward": f"{aws_path}/embeddings-v0.4/lm-eu-large-backward-v0.1.pt",
# Persian
Expand Down Expand Up @@ -1713,6 +1717,9 @@ def __init__(self, model, fine_tune: bool = False, chars_per_chunk: int = 512):
"sv-backward": f"{aws_path}/embeddings-stefan-it/lm-sv-opus-large-backward-v0.1.pt",
"sv-v0-forward": f"{aws_path}/embeddings-v0.4/lm-sv-large-forward-v0.1.pt",
"sv-v0-backward": f"{aws_path}/embeddings-v0.4/lm-sv-large-backward-v0.1.pt",
# Tamil
"ta-forward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-forward-v0.1.pt",
"ta-backward": f"{aws_path}/embeddings-stefan-it/lm-ta-opus-large-backward-v0.1.pt",
}

if type(model) == str:
Expand Down
14 changes: 14 additions & 0 deletions flair/models/sequence_tagger_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,20 @@ def __init__(
train_initial_hidden_state: bool = False,
pickle_module: str = "pickle",
):
"""
Initializes a SequenceTagger
:param hidden_size: number of hidden states in RNN
:param embeddings: word embeddings used in tagger
:param tag_dictionary: dictionary of tags you want to predict
:param tag_type: string identifier for tag type
:param use_crf: if True use CRF decoder, else project directly to tag space
:param use_rnn: if True use RNN layer, otherwise use word embeddings directly
:param rnn_layers: number of RNN layers
:param dropout: dropout probability
:param word_dropout: word dropout probability
:param locked_dropout: locked dropout probability
:param train_initial_hidden_state: if True, trains initial hidden state of RNN
"""

super(SequenceTagger, self).__init__()

Expand Down
8 changes: 8 additions & 0 deletions flair/models/text_classification_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,14 @@ def __init__(
multi_label: bool = None,
multi_label_threshold: float = 0.5,
):
"""
Initializes a TextClassifier
:param document_embeddings: embeddings used to embed each data point
:param label_dictionary: dictionary of labels you want to predict
:param multi_label: auto-detected by default, but you can set this to True to force multi-label prediction
or False to force single-label prediction
:param multi_label_threshold: If multi-label you can set the threshold to make predictions
"""

super(TextClassifier, self).__init__()

Expand Down
13 changes: 10 additions & 3 deletions flair/trainers/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,16 +40,24 @@ def __init__(
corpus: Corpus,
optimizer: torch.optim.Optimizer = SGD,
epoch: int = 0,
loss: float = 10000.0,
optimizer_state: dict = None,
scheduler_state: dict = None,
use_tensorboard: bool = False,
):
"""
Initialize a model trainer
:param model: The model that you want to train. The model should inherit from flair.nn.Model
:param corpus: The dataset used to train the model, should be of type Corpus
:param optimizer: The optimizer to use (typically SGD or Adam)
:param epoch: The starting epoch (normally 0 but could be higher if you continue training model)
:param optimizer_state: Optimizer state (necessary if continue training from checkpoint)
:param scheduler_state: Scheduler state (necessary if continue training from checkpoint)
:param use_tensorboard: If True, writes out tensorboard information
"""
self.model: flair.nn.Model = model
self.corpus: Corpus = corpus
self.optimizer: torch.optim.Optimizer = optimizer
self.epoch: int = epoch
self.loss: float = loss
self.scheduler_state: dict = scheduler_state
self.optimizer_state: dict = optimizer_state
self.use_tensorboard: bool = use_tensorboard
Expand Down Expand Up @@ -539,7 +547,6 @@ def load_from_checkpoint(
corpus,
optimizer,
epoch=checkpoint["epoch"],
loss=checkpoint["loss"],
optimizer_state=checkpoint["optimizer_state_dict"],
scheduler_state=checkpoint["scheduler_state_dict"],
)
Expand Down
67 changes: 37 additions & 30 deletions resources/docs/EXPERIMENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,11 @@ resources/tasks/conll_03/eng.testb
resources/tasks/conll_03/eng.train
```

This allows the `NLPTaskDataFetcher` class to read the data into our data structures. Use the `NLPTask` enum to select
the dataset, as follows:
This allows the `CONLL_03()` corpus object to read the data into our data structures. Initialize the corpus as follows:

```python
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
from flair.datasets import CONLL_03
corpus: Corpus = CONLL_03(base_path='resources/tasks')
```

This gives you a `Corpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use.
Expand All @@ -41,12 +41,12 @@ The full code to get a state-of-the-art model for English NER is as follows:

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import CONLL_03
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
corpus: Corpus = CONLL_03(base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
Expand Down Expand Up @@ -83,6 +83,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
train_with_dev=True,
max_epochs=150)
```

Expand All @@ -109,12 +110,12 @@ FastText word embeddings and German contextual string embeddings. The full code

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import CONLL_03_GERMAN
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks')
corpus: Corpus = CONLL_03_GERMAN(base_path='resources/tasks')

# 2. what tag do we want to predict?
tag_type = 'ner'
Expand Down Expand Up @@ -145,6 +146,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
train_with_dev=True,
max_epochs=150)
```

Expand All @@ -164,12 +166,12 @@ FastText word embeddings and German contextual string embeddings. The full code

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import CONLL_03_DUTCH
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
corpus: Corpus = CONLL_03_DUTCH()

# 2. what tag do we want to predict?
tag_type = 'ner'
Expand Down Expand Up @@ -200,6 +202,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
train_with_dev=True,
max_epochs=150)
```

Expand All @@ -215,16 +218,16 @@ Data is included in Flair and will get automatically downloaded when you run the

#### Best Known Configuration
Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with
FastText word embeddings and German contextual string embeddings. The full code then is as follows:
FastText word embeddings for Twitter and crawls. The full code then is as follows:

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import WNUT_17
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
corpus: Corpus = WNUT_17()

# 2. what tag do we want to predict?
tag_type = 'ner'
Expand Down Expand Up @@ -256,6 +259,7 @@ from flair.trainers import ModelTrainer
trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
train_with_dev=True,
max_epochs=150)
```

Expand Down Expand Up @@ -283,16 +287,18 @@ resources/tasks/onto-ner/eng.train
#### Best Known Configuration

Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with
FastText embeddings (they work better on this dataset). The full code then is as follows:
FastText embeddings (they work better on this dataset). You also need to provide a `column_format` for the `ColumnCorpus` object indicating which column in the training file is the 'ner' information. The full code then is as follows:

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import ColumnCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks')
corpus: Corpus = ColumnCorpus('resources/tasks/onto-ner',
column_format={0: 'text', 1: 'pos', 2: 'upos', 3: 'ner'},
tag_to_bioes='ner')

# 2. what tag do we want to predict?
tag_type = 'ner'
Expand Down Expand Up @@ -324,8 +330,9 @@ trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
learning_rate=0.1,
# it's a big dataset so maybe set embeddings_in_memory to False
embeddings_in_memory=False)
train_with_dev=True,
# it's a big dataset so maybe set embeddings_storage_mode to 'none' (embeddings are not kept in memory)
embeddings_storage_mode='none')
```


Expand All @@ -340,12 +347,12 @@ trainer.train('resources/taggers/example-ner',

Get the [Penn treebank](https://catalog.ldc.upenn.edu/ldc99t42) and follow the guidelines
in [Collins (2002)](http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf) to produce train, dev and test splits.
Convert splits into CoNLLU-U format and place train, test and dev data in `resources/tasks/penn/` as follows:
Convert splits into CoNLL-U format and place train, test and dev data in `/path/to/penn/` as follows:

```
resources/tasks/penn/test.conll
resources/tasks/penn/train.conll
resources/tasks/penn/valid.conll
/path/to/penn/test.conll
/path/to/penn/train.conll
/path/to/penn/valid.conll
```

Then, run the experiments with extvec embeddings and contextual string embeddings. Also, select 'pos' as `tag_type`,
Expand All @@ -355,12 +362,12 @@ so the algorithm knows that POS tags and not NER are to be predicted from this d

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import UniversalDependenciesCorpus
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks')
corpus: Corpus = UniversalDependenciesCorpus(base_path='/path/to/penn')

# 2. what tag do we want to predict?
tag_type = 'pos'
Expand Down Expand Up @@ -389,10 +396,9 @@ from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
max_epochs=150,
# its a big dataset, so maybe set embeddings_in_memory=False
embeddings_in_memory=True)
trainer.train('resources/taggers/example-pos',
train_with_dev=True,
max_epochs=150)
```

## CoNLL-2000 Noun Phrase Chunking (English)
Expand All @@ -411,12 +417,12 @@ so the algorithm knows that chunking tags and not NER are to be predicted from t

```python
from flair.data import Corpus
from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
from flair.datasets import CONLL_2000
from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
from typing import List

# 1. get the corpus
corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)
corpus: Corpus = CONLL_2000()

# 2. what tag do we want to predict?
tag_type = 'np'
Expand Down Expand Up @@ -446,6 +452,7 @@ from flair.trainers import ModelTrainer

trainer: ModelTrainer = ModelTrainer(tagger, corpus)

trainer.train('resources/taggers/example-ner',
trainer.train('resources/taggers/example-chunk',
train_with_dev=True,
max_epochs=150)
```
Loading