From 36173642cc5100c9208b9c9bc383e98182c4b376 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 15 Oct 2019 14:21:55 +0200 Subject: [PATCH 1/3] update tutorial notebook --- tutorials/1_farm_building_blocks.ipynb | 709 +++---------------------- 1 file changed, 70 insertions(+), 639 deletions(-) diff --git a/tutorials/1_farm_building_blocks.ipynb b/tutorials/1_farm_building_blocks.ipynb index b5bf2c77a..20df8fcb1 100644 --- a/tutorials/1_farm_building_blocks.ipynb +++ b/tutorials/1_farm_building_blocks.ipynb @@ -57,22 +57,26 @@ "from farm.modeling.prediction_head import TextClassificationHead\n", "from farm.modeling.adaptive_model import AdaptiveModel\n", "from farm.modeling.optimization import initialize_optimizer\n", - "from farm.train import Trainer" + "from farm.train import Trainer\n", + "from farm.utils import MLFlowLogger" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Farm allows simple logging of many parameters & metrics. Let's use MLflow framework to track our experiment ...\n", + "ml_logger = MLFlowLogger(tracking_uri=\"https://public-mlflow.deepset.ai/\")\n", + "ml_logger.init_experiment(experiment_name=\"Public_FARM\", run_name=\"Tutorial1_Colab\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Devices available: cpu\n" - ] - } - ], + "outputs": [], "source": [ "# We need to fetch the right device to drive the growth of our model\n", "\n", @@ -105,41 +109,7 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Selbst',\n", - " 'ein',\n", - " 'bl',\n", - " '##inde',\n", - " '##s',\n", - " 'Hu',\n", - " '##hn',\n", - " 'findet',\n", - " 'mal',\n", - " 'ein',\n", - " 'Korn',\n", - " '.']" - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# We can test out how it will do on an example sentence\n", - "\n", - "EXAMPLE_SENTENCE = \"Selbst ein blindes Huhn findet mal ein Korn.\"\n", - "tokenizer.tokenize(EXAMPLE_SENTENCE)" - ] - }, - { - "cell_type": "code", - "execution_count": 20, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -159,209 +129,9 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "08/31/2019 15:34:03 - INFO - farm.data_handler.data_silo - \n", - "Loading data into the data silo ... 
\n", - " ______\n", - " |o | !\n", - " __ |:`_|---'-.\n", - " |__|______.-/ _ \\-----.| \n", - " (o)(o)------'\\ _ / ( ) \n", - " \n", - "08/31/2019 15:34:03 - INFO - farm.data_handler.data_silo - Loading train set from: data/germeval18/train.tsv \n", - "08/31/2019 15:34:03 - INFO - farm.data_handler.processor - Got ya 5 parallel workers to fill the baskets with samples (chunksize = 1000)...\n", - "100%|██████████| 5009/5009 [00:01<00:00, 2785.99it/s]\n", - "08/31/2019 15:34:05 - INFO - farm.data_handler.processor - Got ya 5 parallel workers to featurize samples in baskets (chunksize = 1000) ...\n", - "100%|██████████| 5009/5009 [00:00<00:00, 7582.03it/s]\n", - " 0%| | 0/5009 [00:00 Date: Tue, 15 Oct 2019 15:19:43 +0200 Subject: [PATCH 2/3] add second tutorial notebook --- ...ild_a_processor_for_your_own_dataset.ipynb | 465 ++++++++++++++++++ 1 file changed, 465 insertions(+) create mode 100644 tutorials/2_Build_a_processor_for_your_own_dataset.ipynb diff --git a/tutorials/2_Build_a_processor_for_your_own_dataset.ipynb b/tutorials/2_Build_a_processor_for_your_own_dataset.ipynb new file mode 100644 index 000000000..009a9a25e --- /dev/null +++ b/tutorials/2_Build_a_processor_for_your_own_dataset.ipynb @@ -0,0 +1,465 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## FARM: Use your own dataset\n", + " \n", + "In Tutorial 1 you already learned about the major building blocks.\n", + "In this tutorial, you will see how to use FARM with your own dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's start by adjust the working directory so that it is the root of the repository\n", + "# This should be run just once.\n", + "\n", + "import os\n", + "os.chdir('../')\n", + "print(\"Current working directory is {}\".format(os.getcwd()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1) How a Processor works" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Architecture\n", + "The Processor converts a raw input (e.g File) into a Pytorch dataset. \n", + "For using an own dataset we need to adjust this Processor.\n", + "\n", + "\n", + "
*(Processor pipeline: raw file -> dicts -> samples -> features -> PyTorch dataset)*
\n", + "\n", + "​\n", + "### Main Conversion Stages \n", + "1. Read from file / raw input \n", + "2. Create samples\n", + "3. Featurize samples\n", + "4. Create PyTorch Dataset\n", + "\n", + "### Functions to implement\n", + "1. file\\_to_dicts()\n", + "2. \\_dict_to_samples()\n", + "3. \\_sample_to_features() " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Example: TextClassificationProcessor" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from farm.data_handler.processor import *\n", + "from farm.data_handler.samples import Sample\n", + "from farm.modeling.tokenization import BertTokenizer\n", + "#from farm.modeling.tokenization import tokenize_with_metadata\n", + "\n", + "import os\n", + "\n", + "class TextClassificationProcessor(Processor):\n", + " \"\"\"\n", + " Used to handle the text classification datasets that come in tabular format (CSV, TSV, etc.)\n", + " \"\"\"\n", + " def __init__(\n", + " self,\n", + " tokenizer,\n", + " max_seq_len,\n", + " data_dir,\n", + " label_list=None,\n", + " metric=None,\n", + " train_filename=\"train.tsv\",\n", + " dev_filename=None,\n", + " test_filename=\"test.tsv\",\n", + " dev_split=0.1,\n", + " delimiter=\"\\t\",\n", + " quote_char=\"'\",\n", + " skiprows=None,\n", + " label_column_name=\"label\",\n", + " multilabel=False,\n", + " header=0,\n", + " **kwargs,\n", + " ):\n", + " #TODO If an arg is misspelt, e.g. metrics, it will be swallowed silently by kwargs\n", + "\n", + " # Custom processor attributes\n", + " self.delimiter = delimiter\n", + " self.quote_char = quote_char\n", + " self.skiprows = skiprows\n", + " self.header = header\n", + "\n", + " super(TextClassificationProcessor, self).__init__(\n", + " tokenizer=tokenizer,\n", + " max_seq_len=max_seq_len,\n", + " train_filename=train_filename,\n", + " dev_filename=dev_filename,\n", + " test_filename=test_filename,\n", + " dev_split=dev_split,\n", + " data_dir=data_dir,\n", + " tasks={},\n", + " )\n", + " #TODO raise info when no task is added due to missing \"metric\" or \"labels\" arg\n", + " if metric and label_list:\n", + " if multilabel:\n", + " task_type = \"multilabel_classification\"\n", + " else:\n", + " task_type = \"classification\"\n", + " self.add_task(name=\"text_classification\",\n", + " metric=metric,\n", + " label_list=label_list,\n", + " label_column_name=label_column_name,\n", + " task_type=task_type)\n", + "\n", + " def file_to_dicts(self, file: str) -> [dict]:\n", + " column_mapping = {task[\"label_column_name\"]: task[\"label_name\"] for task in self.tasks.values()}\n", + " dicts = read_tsv(\n", + " filename=file,\n", + " delimiter=self.delimiter,\n", + " skiprows=self.skiprows,\n", + " quotechar=self.quote_char,\n", + " rename_columns=column_mapping,\n", + " header=self.header\n", + " )\n", + "\n", + " return dicts\n", + "\n", + " def _dict_to_samples(self, dict: dict, **kwargs) -> [Sample]:\n", + " # this tokenization also stores offsets\n", + " tokenized = tokenize_with_metadata(dict[\"text\"], self.tokenizer, self.max_seq_len)\n", + " return [Sample(id=None, clear_text=dict, tokenized=tokenized)]\n", + "\n", + " def _sample_to_features(self, sample) -> dict:\n", + " features = sample_to_features_text(\n", + " sample=sample,\n", + " tasks=self.tasks,\n", + " max_seq_len=self.max_seq_len,\n", + " tokenizer=self.tokenizer,\n", + " )\n", + " return features\n", + " \n", + " \n", + "# Helper\n", + "def read_tsv(filename, rename_columns, quotechar='\"', delimiter=\"\\t\", 
skiprows=None, header=0):\n", + " \"\"\"Reads a tab separated value file. Tries to download the data if filename is not found\"\"\"\n", + " \n", + " # get remote dataset if needed\n", + " if not (os.path.exists(filename)):\n", + " logger.info(f\" Couldn't find {filename} locally. Trying to download ...\")\n", + " _download_extract_downstream_data(filename)\n", + " \n", + " # read file into df\n", + " df = pd.read_csv(\n", + " filename,\n", + " sep=delimiter,\n", + " encoding=\"utf-8\",\n", + " quotechar=quotechar,\n", + " dtype=str,\n", + " skiprows=skiprows,\n", + " header=header\n", + " )\n", + "\n", + " # let's rename our target columns to the default names FARM expects: \n", + " # \"text\": contains the text\n", + " # \"text_classification_label\": contains a label for text classification\n", + " columns = [\"text\"] + list(rename_columns.keys())\n", + " df = df[columns]\n", + " for source_column, label_name in rename_columns.items():\n", + " df[label_name] = df[source_column]\n", + " df.drop(columns=[source_column], inplace=True)\n", + " \n", + " if \"unused\" in df.columns:\n", + " df.drop(columns=[\"unused\"], inplace=True)\n", + " raw_dict = df.to_dict(orient=\"records\")\n", + " return raw_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The default format is: \n", + "# - tab separated\n", + "# - column \"text\"\n", + "# - column \"label\" \n", + "\n", + "import pandas as pd\n", + "\n", + "df = pd.DataFrame({\"text\": [\"The concerts supercaliphractisch was great!\", \"I hate people ignoring climate change.\"],\n", + " \"label\": [\"positive\",\"negative\"]\n", + " })\n", + "print(df)\n", + "df.to_csv(\"train.tsv\", sep=\"\\t\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = BertTokenizer.from_pretrained(\n", + " pretrained_model_name_or_path=\"bert-base-uncased\")\n", + "\n", + "processor = TextClassificationProcessor(data_dir = \"\", \n", + " tokenizer=tokenizer,\n", + " max_seq_len=64,\n", + " label_list=[\"positive\",\"negative\"],\n", + " label_column_name=\"label\",\n", + " metric=\"acc\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. One File -> Dictionarie(s) with \"raw data\"\n", + "dicts = processor.file_to_dicts(file=\"train.tsv\")\n", + "print(dicts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 2. One Dictionary -> Sample(s) \n", + "# (Sample = \"clear text\" model input + meta information) \n", + "samples = processor._dict_to_samples(dict=dicts[0])\n", + "# print each attribute of sample\n", + "print(samples[0].clear_text)\n", + "print(samples[0].tokenized)\n", + "print(samples[0].features)\n", + "print(\"----------------------------------\\n\\n\\n\")\n", + "# or in a nicer, formatted style\n", + "print(samples[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 3. 
One Sample -> Features\n", + "# (Features = \"vectorized\" model input)\n", + "features = processor._sample_to_features(samples[0])\n", + "print(features[0])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2) Hands-On: Adjust it to your dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Task 1: Use an existing Processor\n", + "\n", + "This works if you have:\n", + "- standard tasks\n", + "- common file formats \n", + "\n", + "**Example: Text classification on CSV with multiple columns**\n", + "\n", + "Dataset: GermEval18 (Hatespeech detection) \n", + "Format: TSV \n", + "Columns: `text coarse_label fine_label`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download dataset\n", + "from farm.data_handler import utils\n", + "utils._download_extract_downstream_data(\"germeval18/train.tsv\")\n", + "!head -n 10 germeval18/train.tsv\n", + "\n", + "# TODO: Initialize a processor for the above file by passing the right arguments\n", + "\n", + "processor = TextClassificationProcessor(tokenizer=tokenizer,\n", + " max_seq_len=128,\n", + " data_dir=\"germeval18\",\n", + " train_filename=\"train.tsv\",\n", + " label_list=[\"OTHER\",\"OFFENSE\"],\n", + " metric=\"acc\",\n", + " label_column_name=\"coarse_label\"\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test it\n", + "dicts = processor.file_to_dicts(file=\"germeval18/train.tsv\")\n", + "print(dicts[0])\n", + "assert dicts[0] == {'text': '@corinnamilborn Liebe Corinna, wir würden dich gerne als Moderatorin für uns gewinnen! Wärst du begeisterbar?', 'text_classification_label': 'OTHER'}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Task 2: Build your own Processor\n", + "This works best for:\n", + "- custom input files\n", + "- special preprocessing steps\n", + "- advanced multitask learning \n", + "\n", + "**Example: Text classification with JSON as input file** \n", + "\n", + "Dataset: [100k Yelp reviews](https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json) ( [full dataset](https://https://www.yelp.com/dataset/download), [documentation](https://https://www.yelp.com/dataset/documentation/main))\n", + "\n", + "Format: \n", + "\n", + "``` \n", + "{\n", + "...\n", + " // integer, star rating\n", + " \"stars\": 4,\n", + "\n", + " // string, the review itself\n", + " \"text\": \"Great place to hang out after work: the prices are decent, and the ambience is fun. It's a bit loud, but very lively. The staff is friendly, and the food is good. They have a good selection of drinks.\",\n", + "...\n", + "}\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download dataset\n", + "!wget https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-downstream/yelp_reviews_100k.json\n", + "!head -5 yelp_reviews_100k.json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "# TODO: Create a new Processor class and overwrite the function that reads from the file\n", + "# The dicts created should look like this to comply with the default TextClassificationProcessor.\n", + "#{'text': 'Total bill for this horrible service? 
...',\n", + "# 'text_classification_label': '4'}\n", + "\n", + "\n", + "class CustomTextClassificationProcessor(TextClassificationProcessor):\n", + " \n", + " # we need to overwrite this function from the parent class\n", + " def file_to_dicts(self, file: str) -> [dict]:\n", + " # read into df\n", + " df = pd.read_json(file, lines=True)\n", + " # rename\n", + " df[\"text_classification_label\"] = df[\"stars\"].astype(str)\n", + " # drop unused\n", + " columns = [\"text_classification_label\",\"text\"]\n", + " df = df[columns]\n", + " # convert to dicts\n", + " dicts = df.to_dict(orient=\"records\")\n", + " return dicts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "processor = CustomTextClassificationProcessor(tokenizer=tokenizer,\n", + " max_seq_len=128,\n", + " data_dir=\"\",\n", + " label_list=[\"1\",\"2\",\"3\",\"4\",\"5\"],\n", + " metric=\"acc\",\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test it\n", + "\n", + "dicts = processor.file_to_dicts(file=\"yelp_reviews_100k.json\")\n", + "print(dicts[0])\n", + "\n", + "assert dicts[0] == {'text_classification_label': '1', 'text': 'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 73066d9ad653ee8d8ff041bc04adae3428026ab7 Mon Sep 17 00:00:00 2001 From: Branden Chan Date: Tue, 15 Oct 2019 15:24:55 +0200 Subject: [PATCH 3/3] update links in readme --- readme.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/readme.rst b/readme.rst index 6403bf7d7..fc81cd1ad 100644 --- a/readme.rst +++ b/readme.rst @@ -53,8 +53,8 @@ Resources - `Full Documentation `_ - `Intro to Transfer Learning (Blog) `_ - `Intro to Transfer Learning & FARM (Video) `_ -- Tutorial 1 (Overview of building blocks): `Jupyter notebook `_ or `Colab `_ -- Tutorial 2 (How to use custom datasets): `Colab notebook `_ +- Tutorial 1 (Overview of building blocks): `Jupyter notebook 1 `_ or `Colab 1 `_ +- Tutorial 2 (How to use custom datasets): `Jupyter notebook 2 `_ or `Colab 2 `_ Installation
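Taken together, the two notebooks reduce to a single pattern: subclass a `Processor` so FARM can read your file format, then feed it into the standard data silo / model / trainer stack. Below is a minimal end-to-end sketch of that pattern, assuming the FARM 0.x API these patches were written against; the helper names and exact signatures used here (`initialize_device_settings`, `DataSilo`, `LanguageModel.load`, `initialize_optimizer`, `Trainer`) follow that era's examples and changed in later releases, so treat this as an illustration rather than a drop-in script. The Yelp file name and five-star labels are taken from Tutorial 2 above.

```python
# Minimal end-to-end sketch, assuming the FARM 0.x API targeted by these
# patches; signatures below varied across early FARM releases.
import pandas as pd

from farm.data_handler.data_silo import DataSilo
from farm.data_handler.processor import TextClassificationProcessor
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.language_model import LanguageModel
from farm.modeling.optimization import initialize_optimizer
from farm.modeling.prediction_head import TextClassificationHead
from farm.modeling.tokenization import BertTokenizer
from farm.train import Trainer
from farm.utils import initialize_device_settings

device, n_gpu = initialize_device_settings(use_cuda=True)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class YelpReviewProcessor(TextClassificationProcessor):
    """Custom Processor as in Tutorial 2: only the file-reading step changes."""

    def file_to_dicts(self, file: str) -> [dict]:
        df = pd.read_json(file, lines=True)
        # Map the Yelp fields onto the column names FARM expects.
        df["text_classification_label"] = df["stars"].astype(str)
        return df[["text", "text_classification_label"]].to_dict(orient="records")

processor = YelpReviewProcessor(tokenizer=tokenizer,
                                max_seq_len=128,
                                data_dir="",
                                train_filename="yelp_reviews_100k.json",
                                test_filename=None,  # assumption: no test file; dev_split carves a dev set from train
                                dev_split=0.1,
                                label_list=["1", "2", "3", "4", "5"],
                                metric="acc")
data_silo = DataSilo(processor=processor, batch_size=32)

# Standard stack from Tutorial 1: language model plus a classification head
# (768 is the hidden size of BERT-base, 5 the number of star labels).
model = AdaptiveModel(language_model=LanguageModel.load("bert-base-uncased"),
                      prediction_heads=[TextClassificationHead(layer_dims=[768, 5])],
                      embeds_dropout_prob=0.1,
                      lm_output_types=["per_sequence"],
                      device=device)

optimizer, warmup_linear = initialize_optimizer(model=model,
                                                learning_rate=2e-5,
                                                warmup_proportion=0.1,
                                                n_batches=len(data_silo.loaders["train"]),
                                                n_epochs=1)

trainer = Trainer(optimizer=optimizer,
                  data_silo=data_silo,
                  epochs=1,
                  n_gpu=n_gpu,
                  warmup_linear=warmup_linear,
                  evaluate_every=100,
                  device=device)
model = trainer.train(model)
```

Because only `file_to_dicts()` is overridden, sample creation and featurization are inherited unchanged from `TextClassificationProcessor`; switching to a new input format costs exactly one method.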