In [1]:
!wget "https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/shared_task.jsonl"
!wget "https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/games.csv"

import pandas as pd
import json

import warnings

inputs = []
outputs_models = []
outputs_gold = []

# load the model outputs (the GENERATED_TEXT column is read below)
games_csv = pd.read_csv('games.csv')

# Parse every non-blank JSONL record up front so that a length mismatch with
# games.csv can be reported; zip() alone would silently drop the surplus entries.
with open('shared_task.jsonl', 'r', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

if len(records) != len(games_csv):
    warnings.warn(
        f"shared_task.jsonl has {len(records)} records but games.csv has "
        f"{len(games_csv)} rows; only the first "
        f"{min(len(records), len(games_csv))} pairs will be used"
    )

# collect the inputs, model outputs and gold outputs
# (records and CSV rows are paired purely by position)
for j, game in zip(records, games_csv.iterrows()):
    # iterrows() yields (index, row) pairs, so game[1] is the row itself
    outputs_models.append(game[1]['GENERATED_TEXT'])
    outputs_gold.append(j['cleaned_text'])

    # remove any outputs from the input data; a key that is already absent is fine
    for key in ['cleaned_text', 'detokenized_text', 'summary']:
        j.pop(key, None)

    inputs.append(j)

# save the inputs into a JSONL file (one JSON object per line)
with open('inputs.jsonl', 'w', encoding='utf-8') as f:
    for i in inputs:
        f.write(json.dumps(i) + '\n')

# save the outputs into text files (one text per line);
# explicit UTF-8 keeps the files identical across platforms
with open('outputs_gold.txt', 'w', encoding='utf-8') as f:
    for o in outputs_gold:
        f.write(o + '\n')

with open('outputs_model.txt', 'w', encoding='utf-8') as f:
    for o in outputs_models:
        f.write(o + '\n')

--2024-11-06 12:46:54-- https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/shared_task.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 890185 (869K) [text/plain]
Saving to: ‘shared_task.jsonl.4’


2024-11-06 12:46:55 (23.8 MB/s) - ‘shared_task.jsonl.4’ saved [890185/890185]

--2024-11-06 12:46:55-- https://raw.githubusercontent.com/ehudreiter/accuracySharedTask/refs/heads/main/games.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 197686 (193K) [text/plain]
Saving to: ‘games.csv.4’


2024-11-0

In [3]:
import datetime
import random
import json
import os

# Seed the RNG so the random highlight colours generated below are reproducible.
random.seed(42)

# Annotation table; the code below reads its TEXT_ID, TYPE, TOKENS, CORRECTION
# and DOC_TOKEN_START columns.
# NOTE(review): unlike games.csv and shared_task.jsonl, gsml.csv is not downloaded
# anywhere in the visible notebook - presumably it must already exist in the
# working directory; confirm.
gsml = pd.read_csv('gsml.csv')

# Create the campaign directory together with its `files` subdirectory
# (no error if they already exist).
os.makedirs('shared-task-annotations/files', exist_ok=True)

# -----------------------------
# create a simple metadata file
# -----------------------------
def random_highlight_color():
    """Return a random light colour as a ``#rrggbb`` hex string.

    Each of the three channels is drawn independently from 200..255
    (inclusive) using the module-level ``random`` generator.
    """
    channels = [random.randint(200, 255) for _ in range(3)]
    return "#" + "".join(f"{value:02x}" for value in channels)

# Distinct values of the TYPE column; an annotation's "type" is later stored as
# an index into this list.
span_categories_list = list(gsml['TYPE'].unique())

# One category entry per type, each with its own random highlight colour.
span_categories = [
    {
        "name": cat,
        "color": random_highlight_color(),
        "description": cat,  # only dummy descriptions
    }
    for cat in span_categories_list
]

metadata = {
    "id": "shared-task-annotations",
    "mode": "external",
    "config": {
        "annotation_span_categories": span_categories
    },
    "created": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
}

# Serialise the campaign metadata with 4-space indentation.
with open('shared-task-annotations/metadata.json', 'w') as f:
    f.write(json.dumps(metadata, indent=4))


# -----------------------------
# collect the annotations
# -----------------------------
def get_char_pos(sent_token_start, lines, i):
    """Return the character offset at which a token starts in ``lines[i]``.

    Tokens are maximal runs of non-whitespace characters, matching what
    ``str.split()`` produces. Offsets are taken from the real text, so
    leading whitespace and runs of several whitespace characters between
    tokens are accounted for (adding ``len(token) + 1`` per token is only
    correct for text that is normalised to single spaces).

    Args:
        sent_token_start: 1-based position of the token (token 1 is the first
            token of the text). Despite the parameter name, the caller in this
            notebook passes the document-level ``DOC_TOKEN_START`` value.
            Values <= 0 yield offset 0.
        lines: sequence of texts.
        i: index of the text in ``lines`` to measure.

    Returns:
        The character offset of the requested token within ``lines[i]``, or
        ``len(lines[i])`` if the text has fewer tokens than requested.
    """
    import re  # local import keeps the function usable on its own

    if sent_token_start <= 0:
        return 0

    text = lines[i]
    # int() also accepts integral floats, which pandas produces for columns with NaNs
    wanted = int(sent_token_start) - 1

    for idx, match in enumerate(re.finditer(r"\S+", text)):
        if idx == wanted:
            return match.start()

    # fewer tokens than requested: clamp to the end of the text
    return len(text)


ann_records = []
db_records = []

# Walk over the games by position. Annotations refer to a game through `TEXT_ID`,
# formatted as `SNNN.txt`, where NNN is the 1-based, zero-padded game index.
for i in range(len(inputs)):
    text_id = f"S{i+1:03d}.txt"

    # rows of the annotation table that belong to this game
    annotation_records = gsml[gsml['TEXT_ID'] == text_id]

    # one span per annotation row; the character offset is derived from the
    # token position within the i-th model output
    annotations = [
        {
            "reason": None if pd.isna(record['CORRECTION']) else record['CORRECTION'],
            "text": record['TOKENS'],
            "type": span_categories_list.index(record['TYPE']),
            "start": get_char_pos(record['DOC_TOKEN_START'], outputs_models, i),
        }
        for _, record in annotation_records.iterrows()
    ]
    # order the spans by where they start in the text (stable sort)
    annotations.sort(key=lambda span: span['start'])

    ann_record = {
        "metadata": {
            "annotator_group": 0,
            "annotator_id": "anonymous",
            "annotation_span_categories": span_categories,
            "campaign_id": "shared-task-annotations",
        },
        "dataset": "rotowire-shared-task",
        "setup_id": "shared-task-model",
        "split": "test",
        "example_idx": i,
        "annotations": annotations,
    }
    ann_records.append(ann_record)

    # The database record is the annotation record minus the spans themselves.
    # NOTE(review): the original comment listed the columns
    # `dataset,split,example_idx,setup_id,annotator_id,status,start,end`, but the
    # record built here carries `metadata` (a nested dict), `dataset`, `setup_id`,
    # `split` and `example_idx` - confirm which layout the consumer expects.
    db_record = {key: value for key, value in ann_record.items() if key != 'annotations'}
    db_records.append(db_record)

# write the annotations as JSONL into the `shared-task-annotations/files` directory
with open('shared-task-annotations/files/annotations.jsonl', 'w') as f:
    for record in ann_records:
        f.write(json.dumps(record) + "\n")

# write the database records as CSV into `shared-task-annotations/db.csv`
db_df = pd.DataFrame(db_records)
db_df.to_csv('shared-task-annotations/db.csv', index=False)

# now just copy over the `shared-task-annotations` directory to `factgenie/campaigns`
