{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2024-11-06 12:46:54-- https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/shared_task.jsonl\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 890185 (869K) [text/plain]\n", "Saving to: ‘shared_task.jsonl.4’\n", "\n", "shared_task.jsonl.4 100%[===================>] 869.32K --.-KB/s in 0.04s \n", "\n", "2024-11-06 12:46:55 (23.8 MB/s) - ‘shared_task.jsonl.4’ saved [890185/890185]\n", "\n", "--2024-11-06 12:46:55-- https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/games.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 197686 (193K) [text/plain]\n", "Saving to: ‘games.csv.4’\n", "\n", "games.csv.4 100%[===================>] 193.05K --.-KB/s in 0.02s \n", "\n", "2024-11-06 12:46:55 (12.4 MB/s) - ‘games.csv.4’ saved [197686/197686]\n", "\n" ] } ], "source": [ "!wget \"https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/shared_task.jsonl\"\n", "!wget \"https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/games.csv\"\n", "\n", "import pandas as pd\n", "import json\n", "\n", "inputs = []\n", "outputs_models = []\n", "outputs_gold = []\n", "\n", "# load the model outputs\n", "games_csv = pd.read_csv('games.csv')\n", "\n", "# collect the inputs, model outputs and gold outputs\n", "with open('shared_task.jsonl', 'r') as f:\n", " for line, game in zip(f, games_csv.iterrows()):\n", " j = json.loads(line)\n", "\n", " outputs_models.append(game[1]['GENERATED_TEXT'])\n", " outputs_gold.append(j['cleaned_text'])\n", "\n", " # remove any outputs from the input data\n", " for key in ['cleaned_text', 'detokenized_text', 'summary']:\n", " j.pop(key)\n", "\n", " inputs.append(j)\n", "\n", "# save the inputs into a JSONL file\n", "with open('inputs.jsonl', 'w') as f:\n", " for i in inputs:\n", " f.write(json.dumps(i) + '\\n')\n", "\n", "# save the outputs into text files\n", "with open('outputs_gold.txt', 'w') as f:\n", " for o in outputs_gold:\n", " f.write(o + '\\n')\n", "\n", "with open('outputs_model.txt', 'w') as f:\n", " for o in outputs_models:\n", " f.write(o + '\\n')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import datetime\n", "import random\n", "import json\n", "import os\n", "\n", "random.seed(42)\n", "\n", "gsml = pd.read_csv('gsml.csv')\n", "\n", "os.makedirs('shared-task-annotations/files', exist_ok=True)\n", "\n", "# -----------------------------\n", "# create a simple metadata file\n", "# -----------------------------\n", "def random_highlight_color():\n", " rgb = [random.randint(200, 255) for i in range(3)]\n", " hex_code = f\"#{rgb[0]:02x}{rgb[1]:02x}{rgb[2]:02x}\"\n", " return hex_code\n", "\n", "span_categories_list = list(gsml['TYPE'].unique())\n", "span_categories = [{\"name\": cat, \"color\": random_highlight_color(), \"description\": cat} for i, cat in enumerate(span_categories_list)] # only dummy descriptions\n", "\n", "metadata = {\n", " \"id\": \"shared-task-annotations\",\n", " \"mode\": \"external\",\n", " \"config\": {\n", " \"annotation_span_categories\": span_categories\n", " },\n", " \"created\": datetime.datetime.now().strftime(\"%Y-%m-%d %H:%M:%S\")\n", "}\n", "\n", "with open('shared-task-annotations/metadata.json', 'w') as f:\n", " json.dump(metadata, f, indent=4)\n", "\n", "\n", "# -----------------------------\n", "# collect the annotations\n", "# -----------------------------\n", "def get_char_pos(sent_token_start, lines, i):\n", " tokens = lines[i].split()\n", " if sent_token_start == 0:\n", " return 0\n", " \n", " char_pos = 0\n", " for token in tokens[:sent_token_start-1]:\n", " char_pos += len(token) + 1\n", "\n", " return char_pos\n", "\n", "\n", "ann_records = []\n", "db_records = []\n", "\n", "# iterate over the annotations\n", "# each annotation has `TEXT_ID` which is in the format `SNNN.txt`, where NNN is the index of the game (padded with 0)\n", "for i in range(len(inputs)):\n", " # get the text id (padded)\n", " text_id = f\"S{i+1:03d}.txt\"\n", " \n", " # get all the annotations for this text id\n", " annotation_records = gsml[gsml['TEXT_ID'] == text_id]\n", "\n", " annotations = sorted([\n", " {\n", " \"reason\" : record['CORRECTION'] if not pd.isna(record['CORRECTION']) else None,\n", " \"text\": record['TOKENS'],\n", " \"type\": span_categories_list.index(record['TYPE']),\n", " \"start\": get_char_pos(record['DOC_TOKEN_START'], outputs_models, i),\n", " } for _, record in annotation_records.iterrows()\n", " ], key=lambda x: x['start'])\n", " ann_record = {\n", " \"metadata\" : {\n", " \"annotator_group\": 0,\n", " \"annotator_id\": \"anonymous\",\n", " \"annotation_span_categories\": span_categories,\n", " \"campaign_id\": \"shared-task-annotations\",\n", " },\n", " \"dataset\": \"rotowire-shared-task\",\n", " \"setup_id\": \"shared-task-model\",\n", " \"split\": \"test\",\n", " \"example_idx\": i,\n", " \"annotations\": annotations\n", " }\n", " ann_records.append(ann_record)\n", " # dataset,split,example_idx,setup_id,annotator_id,status,start,end\n", " \n", " db_record = ann_record.copy()\n", " db_record.pop('annotations')\n", " db_records.append(db_record)\n", "\n", "# save the annotations to a JSONL file in the `shared-task-annotations/files` directory\n", "with open('shared-task-annotations/files/annotations.jsonl', 'w') as f:\n", " for record in ann_records:\n", " f.write(json.dumps(record) + \"\\n\")\n", "\n", "# save the database records to a CSV file in the `shared-task-annotations/db.csv`file\n", "db_df = pd.DataFrame(db_records)\n", "\n", "db_df.to_csv('shared-task-annotations/db.csv', index=False)\n", "\n", "# now just copy over the `shared-task-annotations` directory to `factgenie/campaigns`\n" ] } ], "metadata": { "kernelspec": { "display_name": "factgenie", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }