flairNLP · alanakbik · May 7, 2019 · Apr 26, 2019 · May 2, 2019 · May 2, 2019
diff --git a/flair/data_fetcher.py b/flair/data_fetcher.py
@@ -1,3 +1,4 @@
+import os
 from typing import List, Dict, Union
 import re
 import logging
@@ -99,6 +100,10 @@ class NLPTask(Enum):
 
     # text regression format
     REGRESSION = "regression"
+    WASSA_ANGER = "wassa-anger"
+    WASSA_FEAR = "wassa-fear"
+    WASSA_JOY = "wassa-joy"
+    WASSA_SADNESS = "wassa-sadness"
 
 
 class NLPTaskDataFetcher:
@@ -234,6 +239,11 @@ def load_corpus(
                 data_folder, columns, tag_to_biloes="ner"
             )
 
+        if task.startswith("wassa"):
+            return NLPTaskDataFetcher.load_classification_corpus(
+                data_folder, use_tokenizer=True
+            )
+
     @staticmethod
     def load_column_corpus(
         data_folder: Union[str, Path],
@@ -1292,6 +1302,41 @@ def download_dataset(task: NLPTask):
                 Path("datasets") / task.value,
             )
 
+        if task.value.startswith("wassa"):
+
+            # task values look like "wassa-anger"; the emotion is whatever follows the prefix
+            emotion = task.value[len("wassa-") :]
+
+            # one ratings file per split, all hosted below the same root
+            web_docs = "http://saifmohammad.com/WebDocs"
+            split_urls = {
+                "train": f"{web_docs}/EmoInt%20Train%20Data/{emotion}-ratings-0to1.train.txt",
+                "dev": f"{web_docs}/EmoInt%20Dev%20Data%20With%20Gold/{emotion}-ratings-0to1.dev.gold.txt",
+                "test": f"{web_docs}/EmoInt%20Test%20Gold%20Data/{emotion}-ratings-0to1.test.gold.txt",
+            }
+
+            for split, url in split_urls.items():
+
+                data_file = (
+                    Path(flair.cache_root)
+                    / "datasets"
+                    / task.value
+                    / f"{emotion}-{split}.txt"
+                )
+
+                # the converted file of an earlier run is reused as is
+                if data_file.is_file():
+                    continue
+
+                path = cached_path(url, Path("datasets") / task.value)
+
+                # Explicit UTF-8: the platform default encoding is locale dependent and
+                # can fail on, or garble, non-ASCII characters in the downloaded text.
+                with open(path, "r", encoding="utf-8") as f, open(
+                    data_file, "w", encoding="utf-8"
+                ) as out:
+                    # the first line is skipped (presumably a header row - TODO confirm);
+                    # the default keeps an empty download from raising StopIteration
+                    next(f, None)
+                    for line in f:
+                        # a blank line has no columns to index into
+                        if not line.strip():
+                            continue
+                        fields = line.split("\t")
+                        # fourth column becomes the label, second column the text
+                        out.write(f"__label__{fields[3].rstrip()} {fields[1]}\n")
+
+                # only the converted file is kept, the raw download is dropped
+                os.remove(path)
+
         if task == NLPTask.UD_GERMAN_HDT:
             cached_path(
                 f"{ud_path}UD_German-HDT/dev/de_hdt-ud-dev.conllu",

diff --git a/flair/file_utils.py b/flair/file_utils.py
@@ -99,6 +99,50 @@ def cached_path(url_or_filename: str, cache_dir: Path) -> Path:
         )
 
 
+def unzip_file(file: Path, unzip_to: Path):
+    """Extract every member of the zip archive ``file`` into the directory ``unzip_to``."""
+    import zipfile
+
+    # opened read-only; the context manager closes the archive afterwards
+    with zipfile.ZipFile(file) as archive:
+        archive.extractall(path=unzip_to)
+
+
+def download_file(url: str, cache_dir: Path):
+    """Download ``url`` into ``cache_dir``, named after the last path segment of the URL.
+
+    The payload is streamed into a temporary file first and copied to its final
+    location only after the transfer completed, so an interrupted download never
+    leaves a truncated file in ``cache_dir``.
+
+    :param url: address of the file to fetch
+    :param cache_dir: target directory; created together with its parents if missing
+    :raises requests.HTTPError: if the server answers with an error status
+    """
+    cache_dir.mkdir(parents=True, exist_ok=True)
+
+    # everything up to and including the last "/" is stripped to get the file name
+    filename = re.sub(r".+/", "", url)
+    cache_path = cache_dir / filename
+
+    fd, temp_filename = tempfile.mkstemp()
+    # the temp file is re-opened by name below, so the raw descriptor is released right away
+    os.close(fd)
+    logger.info("%s not found in cache, downloading to %s", url, temp_filename)
+
+    try:
+        req = requests.get(url, stream=True)
+        try:
+            # fail loudly instead of storing an HTTP error page as the downloaded file
+            req.raise_for_status()
+
+            content_length = req.headers.get("Content-Length")
+            total = int(content_length) if content_length is not None else None
+            progress = Tqdm.tqdm(unit="B", total=total)
+            try:
+                with open(temp_filename, "wb") as temp_file:
+                    for chunk in req.iter_content(chunk_size=1024):
+                        if chunk:  # filter out keep-alive new chunks
+                            progress.update(len(chunk))
+                            temp_file.write(chunk)
+            finally:
+                progress.close()
+        finally:
+            req.close()
+
+        logger.info("copying %s to cache at %s", temp_filename, cache_path)
+        shutil.copyfile(temp_filename, str(cache_path))
+    finally:
+        # runs on success and on failure, so no temp file is left behind
+        logger.info("removing temp file %s", temp_filename)
+        os.remove(temp_filename)
+
+
 # TODO(joelgrus): do we want to do checksums or anything like that?
 def get_from_cache(url: str, cache_dir: Path = None) -> Path:
     """

diff --git a/flair/models/text_regression_model.py b/flair/models/text_regression_model.py
@@ -1,6 +1,7 @@
 from pathlib import Path
 
 import flair
+import flair.embeddings
 import torch
 import torch.nn as nn
 from typing import List, Union
@@ -26,13 +27,13 @@ def __init__(self, document_embeddings: flair.embeddings.DocumentEmbeddings):
 
     def _labels_to_indices(self, sentences: List[Sentence]):
+        """Return the label values of all ``sentences`` as one float tensor.
+
+        Each label value is converted with ``float()``; the per-sentence tensors
+        are concatenated along dimension 0.
+        """
         indices = [
-            torch.FloatTensor([float(label.value) for label in sentence.labels])
+            torch.tensor(
+                [float(label.value) for label in sentence.labels], dtype=torch.float
+            )
             for sentence in sentences
         ]
 
-        vec = torch.cat(indices, 0)
-        if torch.cuda.is_available():
-            vec = vec.cuda()
+        # the joined vector is moved to flair.device before it is returned
+        vec = torch.cat(indices, 0).to(flair.device)
 
         return vec