From 6042d953f78e4e8e4cad508b0257406b1f9f9b23 Mon Sep 17 00:00:00 2001
From: aakbik
Date: Mon, 13 May 2019 15:58:52 +0200
Subject: [PATCH] GH-457: classification data loader

---
 flair/datasets.py           | 152 ++++++++++++++++++++++++++++++++++--
 tests/test_data_fetchers.py |  18 ++---
 2 files changed, 156 insertions(+), 14 deletions(-)

diff --git a/flair/datasets.py b/flair/datasets.py
index 6c2672a2d9..9d53414108 100644
--- a/flair/datasets.py
+++ b/flair/datasets.py
@@ -149,6 +149,85 @@ def __init__(
         )
 
 
+class ClassificationCorpus(TaggedCorpus):
+    def __init__(
+        self,
+        data_folder: Union[str, Path],
+        train_file=None,
+        test_file=None,
+        dev_file=None,
+        use_tokenizer: bool = True,
+        max_tokens_per_doc=-1,
+    ):
+        """
+        Helper function to get a TaggedCorpus from text classification-formatted task data
+
+        :param data_folder: base folder with the task data
+        :param train_file: the name of the train file
+        :param test_file: the name of the test file
+        :param dev_file: the name of the dev file, if None, dev data is sampled from train
+        :return: a TaggedCorpus with annotated train, dev and test data
+        """
+
+        if type(data_folder) == str:
+            data_folder: Path = Path(data_folder)
+
+        if train_file is not None:
+            train_file = data_folder / train_file
+        if test_file is not None:
+            test_file = data_folder / test_file
+        if dev_file is not None:
+            dev_file = data_folder / dev_file
+
+        # automatically identify train / test / dev files
+        if train_file is None:
+            for file in data_folder.iterdir():
+                file_name = file.name
+                if "train" in file_name:
+                    train_file = file
+                if "test" in file_name:
+                    test_file = file
+                if "dev" in file_name:
+                    dev_file = file
+                if "testa" in file_name:
+                    dev_file = file
+                if "testb" in file_name:
+                    test_file = file
+
+        log.info("Reading data from {}".format(data_folder))
+        log.info("Train: {}".format(train_file))
+        log.info("Dev: {}".format(dev_file))
+        log.info("Test: {}".format(test_file))
+
+        train: Dataset = ClassificationDataset(
+            train_file,
+            use_tokenizer=use_tokenizer,
+            max_tokens_per_doc=max_tokens_per_doc,
+        )
+        test: Dataset = ClassificationDataset(
+            test_file,
+            use_tokenizer=use_tokenizer,
+            max_tokens_per_doc=max_tokens_per_doc,
+        )
+
+        if dev_file is not None:
+            dev: Dataset = ClassificationDataset(
+                dev_file,
+                use_tokenizer=use_tokenizer,
+                max_tokens_per_doc=max_tokens_per_doc,
+            )
+        else:
+            train_length = len(train)
+            dev_size: int = round(train_length / 10)
+            splits = random_split(train, [train_length - dev_size, dev_size])
+            train = splits[0]
+            dev = splits[1]
+
+        super(ClassificationCorpus, self).__init__(
+            train, dev, test, name=data_folder.name
+        )
+
+
 class ColumnDataset(Dataset):
     def __init__(
         self,
@@ -219,9 +298,6 @@ def __len__(self):
     def __getitem__(self, index: int = 0) -> Sentence:
         return self.sentences[index]
 
-    def __iter__(self):
-        return iter(self.sentences)
-
 
 class UniversalDependenciesDataset(Dataset):
     def __init__(self, path_to_conll_file: Path):
@@ -274,8 +350,60 @@ def __len__(self):
     def __getitem__(self, index: int = 0) -> Sentence:
         return self.sentences[index]
 
-    def __iter__(self):
-        return iter(self.sentences)
+
+class ClassificationDataset(Dataset):
+    def __init__(
+        self, path_to_file: Union[str, Path], max_tokens_per_doc=-1, use_tokenizer=True
+    ):
+        """
+        Reads a data file for text classification. The file should contain one document/text per line.
+        The line should have the following format:
+        __label__<class_name> <text>
+        If you have a multi class task, you can have as many labels as you want at the beginning of the line, e.g.,
+        __label__<class_name_1> __label__<class_name_2> <text>
+        :param path_to_file: the path to the data file
+        :param max_tokens_per_doc: Takes at most this amount of tokens per document. If set to -1 all documents are taken as is.
+        :return: list of sentences
+        """
+        if type(path_to_file) == str:
+            path_to_file: Path = Path(path_to_file)
+
+        assert path_to_file.exists()
+
+        label_prefix = "__label__"
+        self.sentences = []
+
+        with open(str(path_to_file), encoding="utf-8") as f:
+            for line in f:
+                words = line.split()
+
+                labels = []
+                l_len = 0
+
+                for i in range(len(words)):
+                    if words[i].startswith(label_prefix):
+                        l_len += len(words[i]) + 1
+                        label = words[i].replace(label_prefix, "")
+                        labels.append(label)
+                    else:
+                        break
+
+                text = line[l_len:].strip()
+
+                if text and labels:
+                    sentence = Sentence(
+                        text, labels=labels, use_tokenizer=use_tokenizer
+                    )
+                    if len(sentence) > max_tokens_per_doc and max_tokens_per_doc > 0:
+                        sentence.tokens = sentence.tokens[:max_tokens_per_doc]
+                    if len(sentence.tokens) > 0:
+                        self.sentences.append(sentence)
+
+    def __len__(self):
+        return len(self.sentences)
+
+    def __getitem__(self, index: int = 0) -> Sentence:
+        return self.sentences[index]
 
 
 class CONLL_03(ColumnCorpus):
@@ -292,6 +420,20 @@ def __init__(self, base_path=None, tag_to_biloes: str = "ner"):
         )
 
 
+class GERMEVAL(ColumnCorpus):
+    def __init__(self, base_path=None, tag_to_biloes: str = "ner"):
+        columns = {1: "text", 2: "ner"}
+
+        # default dataset folder is the cache root
+        if not base_path:
+            base_path = Path(flair.cache_root) / "datasets"
+        data_folder = base_path / "germeval"
+
+        super(GERMEVAL, self).__init__(
+            data_folder, columns, tag_to_biloes=tag_to_biloes
+        )
+
+
 class CONLL_2000(ColumnCorpus):
     def __init__(self, base_path=None, tag_to_biloes: str = "np"):
         columns = {0: "text", 1: "pos", 2: "np"}
diff --git a/tests/test_data_fetchers.py b/tests/test_data_fetchers.py
index eeec489aa4..ec3b65b639 100644
--- a/tests/test_data_fetchers.py
+++ b/tests/test_data_fetchers.py
@@ -9,7 +9,7 @@
 
 def test_load_imdb_data(tasks_base_path):
     # get training, test and dev data
-    corpus = NLPTaskDataFetcher.load_corpus("imdb", tasks_base_path)
+    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
 
     assert len(corpus.train) == 5
     assert len(corpus.dev) == 5
@@ -26,8 +26,11 @@ def test_load_ag_news_data(tasks_base_path):
 
 
 def test_load_sequence_labeling_data(tasks_base_path):
+
     # get training, test and dev data
-    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, tasks_base_path)
+    corpus = flair.datasets.ColumnCorpus(
+        tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
+    )
 
     assert len(corpus.train) == 6
     assert len(corpus.dev) == 1
@@ -36,7 +39,7 @@
 
 def test_load_germeval_data(tasks_base_path):
     # get training, test and dev data
-    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.GERMEVAL, tasks_base_path)
+    corpus = flair.datasets.GERMEVAL(tasks_base_path)
 
     assert len(corpus.train) == 2
     assert len(corpus.dev) == 1
@@ -57,9 +60,6 @@ def test_load_no_dev_data(tasks_base_path):
     corpus = flair.datasets.ColumnCorpus(
         tasks_base_path / "fashion_nodev", column_format={0: "text", 2: "ner"}
     )
-    # corpus = NLPTaskDataFetcher.load_column_corpus(
-    #     tasks_base_path / "fashion_nodev", {0: "text", 2: "ner"}
-    # )
 
     assert len(corpus.train) == 5
     assert len(corpus.dev) == 1
@@ -68,9 +68,9 @@ def test_load_no_dev_data(tasks_base_path):
 
 def test_load_no_dev_data_explicit(tasks_base_path):
     # get training, test and dev data
-    corpus = NLPTaskDataFetcher.load_column_corpus(
+    corpus = flair.datasets.ColumnCorpus(
         tasks_base_path / "fashion_nodev",
-        {0: "text", 2: "ner"},
+        column_format={0: "text", 2: "ner"},
         train_file="train.tsv",
         test_file="test.tsv",
     )
@@ -93,7 +93,7 @@ def test_multi_corpus(tasks_base_path):
 
 def test_download_load_data(tasks_base_path):
     # get training, test and dev data for full English UD corpus from web
-    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.UD_ENGLISH)
+    corpus = flair.datasets.UD_ENGLISH()
 
     assert len(corpus.train) == 12543
     assert len(corpus.dev) == 2002
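
A minimal usage sketch of the new ClassificationCorpus introduced by this patch. The folder path, label names and the max_tokens_per_doc value below are illustrative only; the sketch assumes a folder containing FastText-style files (one document per line, prefixed with __label__<class_name>) whose names contain "train", "dev" and "test":

    import flair.datasets

    # Example line in train.txt / test.txt:
    #   __label__POSITIVE this movie was surprisingly good
    corpus = flair.datasets.ClassificationCorpus(
        "resources/tasks/imdb",     # hypothetical data folder
        use_tokenizer=True,         # tokenize each document with flair's tokenizer
        max_tokens_per_doc=256,     # truncate long documents; -1 keeps all tokens
    )

    print(len(corpus.train), len(corpus.dev), len(corpus.test))
    print(corpus.train[0].labels)   # labels parsed from the __label__ prefixes

If the folder contains no dev file, ClassificationCorpus holds out roughly a tenth of the training data as the dev split via random_split, as implemented above.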