{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2024-11-06 12:46:54--  https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/shared_task.jsonl\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 890185 (869K) [text/plain]\n",
      "Saving to: ‘shared_task.jsonl.4’\n",
      "\n",
      "shared_task.jsonl.4 100%[===================>] 869.32K  --.-KB/s    in 0.04s   \n",
      "\n",
      "2024-11-06 12:46:55 (23.8 MB/s) - ‘shared_task.jsonl.4’ saved [890185/890185]\n",
      "\n",
      "--2024-11-06 12:46:55--  https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/games.csv\n",
      "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n",
      "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 197686 (193K) [text/plain]\n",
      "Saving to: ‘games.csv.4’\n",
      "\n",
      "games.csv.4         100%[===================>] 193.05K  --.-KB/s    in 0.02s   \n",
      "\n",
      "2024-11-06 12:46:55 (12.4 MB/s) - ‘games.csv.4’ saved [197686/197686]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget \"https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/shared_task.jsonl\"\n",
    "!wget \"https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/games.csv\"\n",
    "\n",
    "import pandas as pd\n",
    "import json\n",
    "\n",
    "inputs = []\n",
    "outputs_models = []\n",
    "outputs_gold = []\n",
    "\n",
    "# load the model outputs\n",
    "games_csv = pd.read_csv('games.csv')\n",
    "\n",
    "# collect the inputs, model outputs and gold outputs\n",
    "with open('shared_task.jsonl', 'r') as f:\n",
    "    for line, game in zip(f, games_csv.iterrows()):\n",
    "        j = json.loads(line)\n",
    "\n",
    "        outputs_models.append(game[1]['GENERATED_TEXT'])\n",
    "        outputs_gold.append(j['cleaned_text'])\n",
    "\n",
    "        # remove any outputs from the input data\n",
    "        for key in ['cleaned_text', 'detokenized_text', 'summary']:\n",
    "            j.pop(key)\n",
    "\n",
    "        inputs.append(j)\n",
    "\n",
    "# save the inputs into a JSONL file\n",
    "with open('inputs.jsonl', 'w') as f:\n",
    "    for i in inputs:\n",
    "        f.write(json.dumps(i) + '\\n')\n",
    "\n",
    "# save the outputs into text files\n",
    "with open('outputs_gold.txt', 'w') as f:\n",
    "    for o in outputs_gold:\n",
    "        f.write(o + '\\n')\n",
    "\n",
    "with open('outputs_model.txt', 'w') as f:\n",
    "    for o in outputs_models:\n",
    "        f.write(o + '\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datetime\n",
    "import random\n",
    "import json\n",
    "import os\n",
    "\n",
    "random.seed(42)\n",
    "\n",
    "gsml = pd.read_csv('gsml.csv')\n",
    "\n",
    "os.makedirs('shared-task-annotations/files', exist_ok=True)\n",
    "\n",
    "# -----------------------------\n",
    "# create a simple metadata file\n",
    "# -----------------------------\n",
    "def random_highlight_color():\n",
    "    rgb = [random.randint(200, 255) for i in range(3)]\n",
    "    hex_code = f\"#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}\"\n",
    "    return hex_code\n",
    "\n",
    "span_categories_list = list(gsml['TYPE'].unique())\n",
    "span_categories = [{\"name\": cat, \"color\": random_highlight_color(), \"description\": cat} for i, cat in enumerate(span_categories_list)] # only dummy descriptions\n",
    "\n",
    "metadata = {\n",
    "    \"id\": \"shared-task-annotations\",\n",
    "    \"mode\": \"external\",\n",
    "    \"config\": {\n",
    "        \"annotation_span_categories\": span_categories\n",
    "    },\n",
    "    \"created\": datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n",
    "}\n",
    "\n",
    "with open('shared-task-annotations/metadata.json', 'w') as f:\n",
    "    json.dump(metadata, f, indent=4)\n",
    "\n",
    "\n",
    "# -----------------------------\n",
    "# collect the annotations\n",
    "# -----------------------------\n",
    "def get_char_pos(sent_token_start, lines, i):\n",
    "    tokens = lines[i].split()\n",
    "    if sent_token_start == 0:\n",
    "        return 0\n",
    "    \n",
    "    char_pos = 0\n",
    "    for token in tokens[:sent_token_start-1]:\n",
    "        char_pos += len(token) + 1\n",
    "\n",
    "    return char_pos\n",
    "\n",
    "\n",
    "ann_records = []\n",
    "db_records = []\n",
    "\n",
    "# iterate over the annotations\n",
    "# each annotation has `TEXT_ID` which is in the format `SNNN.txt`, where NNN is the index of the game (padded with 0)\n",
    "for i in range(len(inputs)):\n",
    "    # get the text id (padded)\n",
    "    text_id = f\"S{i+1:03d}.txt\"\n",
    "    \n",
    "    # get all the annotations for this text id\n",
    "    annotation_records = gsml[gsml['TEXT_ID'] == text_id]\n",
    "\n",
    "    annotations = sorted([\n",
    "        {\n",
    "            \"reason\" : record['CORRECTION'] if not pd.isna(record['CORRECTION']) else None,\n",
    "            \"text\": record['TOKENS'],\n",
    "            \"type\": span_categories_list.index(record['TYPE']),\n",
    "            \"start\": get_char_pos(record['DOC_TOKEN_START'], outputs_models, i),\n",
    "        } for _, record in annotation_records.iterrows()\n",
    "    ], key=lambda x: x['start'])\n",
    "    ann_record = {\n",
    "        \"metadata\" : {\n",
    "            \"annotator_group\": 0,\n",
    "            \"annotator_id\": \"anonymous\",\n",
    "            \"annotation_span_categories\": span_categories,\n",
    "            \"campaign_id\": \"shared-task-annotations\",\n",
    "        },\n",
    "        \"dataset\": \"rotowire-shared-task\",\n",
    "        \"setup_id\":  \"shared-task-model\",\n",
    "        \"split\": \"test\",\n",
    "        \"example_idx\": i,\n",
    "        \"annotations\": annotations\n",
    "    }\n",
    "    ann_records.append(ann_record)\n",
    "    # dataset,split,example_idx,setup_id,annotator_id,status,start,end\n",
    "    \n",
    "    db_record = ann_record.copy()\n",
    "    db_record.pop('annotations')\n",
    "    db_records.append(db_record)\n",
    "\n",
    "# save the annotations to a JSONL file in the `shared-task-annotations/files` directory\n",
    "with open('shared-task-annotations/files/annotations.jsonl', 'w') as f:\n",
    "    for record in ann_records:\n",
    "        f.write(json.dumps(record) + \"\\n\")\n",
    "\n",
    "# save the database records to a CSV file in the `shared-task-annotations/db.csv`file\n",
    "db_df = pd.DataFrame(db_records)\n",
    "\n",
    "db_df.to_csv('shared-task-annotations/db.csv', index=False)\n",
    "\n",
    "# now just copy over the `shared-task-annotations` directory to `factgenie/campaigns`\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "factgenie",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}