diff --git a/resources/docs/EXPERIMENTS.md b/resources/docs/EXPERIMENTS.md
index 6d24cbeb41..bebc8d838c 100644
--- a/resources/docs/EXPERIMENTS.md
+++ b/resources/docs/EXPERIMENTS.md
@@ -26,11 +26,11 @@ resources/tasks/conll_03/eng.testb
 resources/tasks/conll_03/eng.train
 ```

-This allows the `NLPTaskDataFetcher` class to read the data into our data structures. Use the `NLPTask` enum to select
-the dataset, as follows:
+This allows the `CONLL_03()` corpus object to read the data into our data structures. Initialize the corpus as follows:

 ```python
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
+from flair.datasets import CONLL_03
+corpus: Corpus = CONLL_03(base_path='resources/tasks')
 ```

 This gives you a `Corpus` object that contains the data. Now, select `ner` as the tag you wish to predict and init the embeddings you wish to use.
@@ -41,12 +41,12 @@ The full code to get a state-of-the-art model for English NER is as follows:

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_03
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03, base_path='resources/tasks')
+corpus: Corpus = CONLL_03(base_path='resources/tasks')

 # 2. what tag do we want to predict?
 tag_type = 'ner'
@@ -83,6 +83,7 @@ from flair.trainers import ModelTrainer
 trainer: ModelTrainer = ModelTrainer(tagger, corpus)

 trainer.train('resources/taggers/example-ner',
+              train_with_dev=True,
               max_epochs=150)
 ```
@@ -109,12 +110,12 @@ FastText word embeddings and German contextual string embeddings. The full code

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_03_GERMAN
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_GERMAN, base_path='resources/tasks')
+corpus: Corpus = CONLL_03_GERMAN(base_path='resources/tasks')

 # 2. what tag do we want to predict?
 tag_type = 'ner'
@@ -145,6 +146,7 @@ from flair.trainers import ModelTrainer
 trainer: ModelTrainer = ModelTrainer(tagger, corpus)

 trainer.train('resources/taggers/example-ner',
+              train_with_dev=True,
               max_epochs=150)
 ```
@@ -164,12 +166,12 @@ FastText word embeddings and German contextual string embeddings. The full code

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_03_DUTCH
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, PooledFlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
+corpus: Corpus = CONLL_03_DUTCH()

 # 2. what tag do we want to predict?
 tag_type = 'ner'
@@ -200,6 +202,7 @@ from flair.trainers import ModelTrainer
 trainer: ModelTrainer = ModelTrainer(tagger, corpus)

 trainer.train('resources/taggers/example-ner',
+              train_with_dev=True,
               max_epochs=150)
 ```
@@ -215,16 +218,16 @@ Data is included in Flair and will get automatically downloaded when you run the
 #### Best Known Configuration

 Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with
-FastText word embeddings and German contextual string embeddings. The full code then is as follows:
+FastText word embeddings for Twitter and crawls. The full code then is as follows:

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import WNUT_17
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03_DUTCH, base_path='resources/tasks')
+corpus: Corpus = WNUT_17()

 # 2. what tag do we want to predict?
 tag_type = 'ner'
@@ -256,6 +259,7 @@ from flair.trainers import ModelTrainer
 trainer: ModelTrainer = ModelTrainer(tagger, corpus)

 trainer.train('resources/taggers/example-ner',
+              train_with_dev=True,
               max_epochs=150)
 ```
@@ -283,16 +287,18 @@ resources/tasks/onto-ner/eng.train
 #### Best Known Configuration

 Once you have the data, reproduce our experiments exactly like for CoNLL-03, just with a different dataset and with
-FastText embeddings (they work better on this dataset). The full code then is as follows:
+FastText embeddings (they work better on this dataset). You also need to provide a `column_format` for the `ColumnCorpus` object indicating which column in the training file holds the 'ner' information. The full code then is as follows:

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import ColumnCorpus
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.ONTONER, base_path='resources/tasks')
+corpus: Corpus = ColumnCorpus('resources/tasks/onto-ner',
+                              column_format={0: 'text', 1: 'pos', 2: 'upos', 3: 'ner'},
+                              tag_to_bioes='ner')

 # 2. what tag do we want to predict?
 tag_type = 'ner'
@@ -324,8 +330,9 @@ trainer: ModelTrainer = ModelTrainer(tagger, corpus)

 trainer.train('resources/taggers/example-ner',
               learning_rate=0.1,
-              # it's a big dataset so maybe set embeddings_in_memory to False
-              embeddings_in_memory=False)
+              train_with_dev=True,
+              # it's a big dataset so maybe set embeddings_storage_mode to 'none' (embeddings are not kept in memory)
+              embeddings_storage_mode='none')
 ```
@@ -340,12 +347,12 @@ trainer.train('resources/taggers/example-ner',
 Get the [Penn treebank](https://catalog.ldc.upenn.edu/ldc99t42) and follow the guidelines in
 [Collins (2002)](http://www.cs.columbia.edu/~mcollins/papers/tagperc.pdf) to produce train, dev and test splits.
-Convert splits into CoNLLU-U format and place train, test and dev data in `resources/tasks/penn/` as follows:
+Convert splits into CoNLL-U format and place train, test and dev data in `/path/to/penn/` as follows:

 ```
-resources/tasks/penn/test.conll
-resources/tasks/penn/train.conll
-resources/tasks/penn/valid.conll
+/path/to/penn/test.conll
+/path/to/penn/train.conll
+/path/to/penn/valid.conll
 ```

 Then, run the experiments with extvec embeddings and contextual string embeddings. Also, select 'pos' as `tag_type`,
@@ -355,12 +362,12 @@ so the algorithm knows that POS tags and not NER are to be predicted from this d

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import UniversalDependenciesCorpus
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.PENN, base_path='resources/tasks')
+corpus: Corpus = UniversalDependenciesCorpus(base_path='/path/to/penn')

 # 2. what tag do we want to predict?
 tag_type = 'pos'
@@ -389,10 +396,9 @@ from flair.trainers import ModelTrainer
 trainer: ModelTrainer = ModelTrainer(tagger, corpus)

-trainer.train('resources/taggers/example-ner',
-              max_epochs=150,
-              # its a big dataset, so maybe set embeddings_in_memory=False
-              embeddings_in_memory=True)
+trainer.train('resources/taggers/example-pos',
+              train_with_dev=True,
+              max_epochs=150)
 ```

 ## CoNLL-2000 Noun Phrase Chunking (English)
@@ -411,12 +417,12 @@ so the algorithm knows that chunking tags and not NER are to be predicted from t

 ```python
 from flair.data import Corpus
-from flair.data_fetcher import NLPTaskDataFetcher, NLPTask
+from flair.datasets import CONLL_2000
 from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings, FlairEmbeddings
 from typing import List

 # 1. get the corpus
-corpus: Corpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_2000)
+corpus: Corpus = CONLL_2000()

 # 2. what tag do we want to predict?
 tag_type = 'np'
@@ -446,6 +452,7 @@ from flair.trainers import ModelTrainer
 trainer: ModelTrainer = ModelTrainer(tagger, corpus)

-trainer.train('resources/taggers/example-ner',
+trainer.train('resources/taggers/example-chunk',
+              train_with_dev=True,
               max_epochs=150)
 ```
diff --git a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
index 5d037b5dc4..b4bf2018c4 100644
--- a/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
+++ b/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md
@@ -9,11 +9,11 @@ library.
 All word embedding classes inherit from the `TokenEmbeddings` class and implement the `embed()` method which you need
 to call to embed your text. This means that for most users of Flair, the complexity of different embeddings remains
-hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text.
-
-All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and
+hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text. All embeddings produced with our methods are PyTorch vectors, so they can be immediately used for training and
 fine-tuning.

+This tutorial introduces some common embeddings and shows you how to use them. For more details on these embeddings and an overview of all supported embeddings, check [here](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md).
+
 ## Classic Word Embeddings
@@ -45,7 +45,7 @@ for token in sentence:
     print(token.embedding)
 ```

-This prints out the tokens and their embeddings. GloVe embeddings are Pytorch vectors of dimensionality 100.
+This prints out the tokens and their embeddings. GloVe embeddings are PyTorch vectors of dimensionality 100.

 You choose which pre-trained embeddings you load by passing the appropriate
 id string to the constructor of the `WordEmbeddings` class. Typically, you use
@@ -63,16 +63,45 @@ We generally recommend the FastText embeddings, or GloVe if you want a smaller m

 ## Flair Embeddings

+Contextual string embeddings are [powerful embeddings](https://www.aclweb.org/anthology/C18-1139/)
+ that capture latent syntactic-semantic information that goes beyond
+standard word embeddings. Key differences are: (1) they are trained without any explicit notion of words and
+thus fundamentally model words as sequences of characters. And (2) they are **contextualized** by their
+surrounding text, meaning that the *same word will have different embeddings depending on its
+contextual use*.
+
+With Flair, you can use these embeddings simply by instantiating the appropriate embedding class, same as standard word embeddings:
+
+```python
+from flair.data import Sentence
+from flair.embeddings import FlairEmbeddings
+
+# init embedding
+flair_embedding_forward = FlairEmbeddings('news-forward')
+
+# create a sentence
+sentence = Sentence('The grass is green .')
+
+# embed words in sentence
+flair_embedding_forward.embed(sentence)
+```
+
+You choose which embeddings you load by passing the appropriate string to the constructor of the `FlairEmbeddings` class. For all supported languages, there is a forward and a backward model. You can load a model for a language by using the **two-letter language code** followed by a hyphen and either **forward** or **backward**. So, if you want to load the forward and backward Flair models for German, do it like this:
+
+```python
+# init forward embedding for German
+flair_embedding_forward = FlairEmbeddings('de-forward')
+flair_embedding_backward = FlairEmbeddings('de-backward')
+```

 ## Stacked Embeddings

 Stacked embeddings are one of the most important concepts of this library. You can use them to combine different
 embeddings together, for instance if you want to use both traditional embeddings together with contextual string
-embeddings (see next chapter).
-Stacked embeddings allow you to mix and match. We find that a combination of embeddings often gives best results.
+embeddings. Stacked embeddings allow you to mix and match. We find that a combination of embeddings often gives best results.

 All you need to do is use the `StackedEmbeddings` class and instantiate it by passing a list of embeddings that you wish
-to combine. For instance, lets combine classic GloVe embeddings with character embeddings. This is effectively the architecture proposed in (Lample et al., 2016).
+to combine. For instance, let's combine classic GloVe embeddings with forward and backward Flair embeddings. This is a combination that we generally recommend to most users, especially for sequence labeling.

 First, instantiate the two embeddings you wish to combine:

@@ -82,20 +110,25 @@ from flair.embeddings import WordEmbeddings, CharacterEmbeddings
 # init standard GloVe embedding
 glove_embedding = WordEmbeddings('glove')

-# init standard character embeddings
-character_embeddings = CharacterEmbeddings()
+# init Flair forward and backward embeddings
+flair_embedding_forward = FlairEmbeddings('news-forward')
+flair_embedding_backward = FlairEmbeddings('news-backward')
 ```

 Now instantiate the `StackedEmbeddings` class and pass it a list containing these two embeddings.

 ```python
-from flair.embeddings import StackedEmbeddings
-
-# now create the StackedEmbedding object that combines all embeddings
-stacked_embeddings = StackedEmbeddings(
-    embeddings=[glove_embedding, character_embeddings])
+from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings
+
+# create a StackedEmbedding object that combines GloVe and forward/backward Flair embeddings
+stacked_embeddings = StackedEmbeddings([
+    glove_embedding,
+    flair_embedding_forward,
+    flair_embedding_backward,
+    ])
 ```
+
 That's it! Now just use this embedding like all the other embeddings, i.e. call the `embed()` method over your sentences.

 ```python
@@ -110,11 +143,12 @@ for token in sentence:
     print(token.embedding)
 ```

-Words are now embedded using a concatenation of two different embeddings. This means that the resulting embedding
+Words are now embedded using a concatenation of three different embeddings. This means that the resulting embedding
 vector is still a single PyTorch vector.

 ## Next

-You can now either look into [BERT, ELMo, and Flair embeddings](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md),
+To get more details on these embeddings and a full overview of all embeddings that we support, you can look into this
+[tutorial](/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md),
 or go directly to the tutorial about [loading your corpus](/resources/docs/TUTORIAL_6_CORPUS.md),
 which is a pre-requirement for [training your own models](/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md).
diff --git a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
index afd9b22220..daf1c64a25 100644
--- a/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
+++ b/resources/docs/TUTORIAL_4_ELMO_BERT_FLAIR_EMBEDDING.md
@@ -1,28 +1,37 @@
-# Tutorial 4: BERT, ELMo, and Flair Embeddings
+# Tutorial 4: List of All Word Embeddings

-Next to standard WordEmbeddings and CharacterEmbeddings, we also provide classes for BERT, ELMo and Flair embeddings. These embeddings enable you to train truly state-of-the-art NLP models.
+This is not so much a tutorial, but rather a list of all embeddings that we currently support in Flair. We assume that you're familiar with the [base types](/resources/docs/TUTORIAL_1_BASICS.md) of this library as well as [standard word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md), in particular the `StackedEmbeddings` class.

-This tutorial explains how to use these embeddings. We assume that you're familiar with the [base types](/resources/docs/TUTORIAL_1_BASICS.md) of this library as well as [standard word embeddings](/resources/docs/TUTORIAL_3_WORD_EMBEDDING.md), in particular the `StackedEmbeddings` class.
-
-## Embeddings
+## Overview

 All word embedding classes inherit from the `TokenEmbeddings` class and implement the `embed()` method which you need to call to embed your text.
 This means that for most users of Flair, the complexity of different embeddings remains
 hidden behind this interface. Simply instantiate the embedding class you require and call `embed()` to embed your text.

-All embeddings produced with our methods are Pytorch vectors, so they can be immediately used for training and
-fine-tuning.
-
-
+The following word embeddings are currently supported:
+
+| Class | Type | Paper |
+| ------------- | ------------- | ------------- |
+| [`BertEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained BERT | [Devlin et al., 2018](https://www.aclweb.org/anthology/N19-1423/) |
+| [`BytePairEmbeddings`](/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md) | Subword-level word embeddings | [Heinzerling and Strube, 2018](https://www.aclweb.org/anthology/L18-1473) |
+| [`CharacterEmbeddings`](/resources/docs/embeddings/CHARACTER_EMBEDDINGS.md) | Task-trained character-level embeddings of words | [Lample et al., 2016](https://www.aclweb.org/anthology/N16-1030) |
+| [`ELMoEmbeddings`](/resources/docs/embeddings/ELMO_EMBEDDINGS.md) | Contextualized word-level embeddings | [Peters et al., 2018](https://aclweb.org/anthology/N18-1202) |
+| [`FastTextEmbeddings`](/resources/docs/embeddings/FASTTEXT_EMBEDDINGS.md) | Word embeddings with subword features | [Bojanowski et al., 2017](https://aclweb.org/anthology/Q17-1010) |
+| [`FlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Contextualized character-level embeddings | [Akbik et al., 2018](https://www.aclweb.org/anthology/C18-1139/) |
+| [`PooledFlairEmbeddings`](/resources/docs/embeddings/FLAIR_EMBEDDINGS.md) | Pooled variant of `FlairEmbeddings` | [Akbik et al., 2019](https://www.aclweb.org/anthology/N19-1078/) |
+| [`OpenAIGPTEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) and [`OpenAIGPT2Embeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained OpenAIGPT models | |
+| [`RoBERTaEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from RoBERTa | |
+| [`TransformerXLEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained transformer-XL | |
+| [`WordEmbeddings`](/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md) | Classic word embeddings | |
+| [`XLNetEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained XLNet | |
+| [`XLMEmbeddings`](/resources/docs/embeddings/TRANSFOMER_EMBEDDINGS.md) | Embeddings from pretrained XLM | |

 ## Combining BERT and Flair

 You can very easily mix and match Flair, ELMo, BERT and classic word embeddings. All you need to do is instantiate each embedding you wish to combine and use them in a StackedEmbedding.

-For instance, let's say we want to combine the multilingual Flair and BERT embeddings to train a hyper-powerful multilingual downstream task model.
-
-First, instantiate the embeddings you wish to combine:
+For instance, let's say we want to combine the multilingual Flair and BERT embeddings to train a hyper-powerful multilingual downstream task model. First, instantiate the embeddings you wish to combine:

 ```python
 from flair.embeddings import FlairEmbeddings, BertEmbeddings
diff --git a/resources/docs/TUTORIAL_6_CORPUS.md b/resources/docs/TUTORIAL_6_CORPUS.md
index 9b250fc356..172e7c65fc 100644
--- a/resources/docs/TUTORIAL_6_CORPUS.md
+++ b/resources/docs/TUTORIAL_6_CORPUS.md
@@ -227,11 +227,11 @@ data the first time you call the corresponding constructor ID. The following dat
 | 'WASSA_SADNESS' | English | The [WASSA](https://competitions.codalab.org/competitions/16380#learn_the_details) emotion-intensity detection challenge (sadness) |

-So to load the 20 newsgroups corpus for text classification, simply do:
+So to load the IMDB corpus for sentiment text classification, simply do:

 ```python
 import flair.datasets
-corpus = flair.datasets.NEWSGROUPS()
+corpus = flair.datasets.IMDB()
 ```

 This downloads and sets up everything you need to train your model.
@@ -339,10 +339,7 @@ corpus: Corpus = CSVClassificationCorpus(data_folder,

 #### FastText Format

-If using `CSVClassificationCorpus` is not practical, you may format your data to the
-FastText format, in which each line in the file represents a
-text document. A document can have one or multiple labels that are defined at the beginning of the line starting with
-the prefix `__label__`. This looks like this:
+If using `CSVClassificationCorpus` is not practical, you may format your data to the FastText format, in which each line in the file represents a text document. A document can have one or multiple labels that are defined at the beginning of the line starting with the prefix `__label__`. This looks like this:

 ```bash
 __label__