-
Notifications
You must be signed in to change notification settings - Fork 2
/
3.run_canarex.py
107 lines (82 loc) · 3.38 KB
/
3.run_canarex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import pandas as pd
import json
from pathlib import Path
from narratives import Narratives
def create_sentences_json(config):
    """
    Flatten the co-referenced documents into one record per sentence.

    Reads the JSON-lines file at config['data'] (it must provide the columns
    'sentences', 'sentences_idx' and 'id'), explodes the per-document lists
    and builds
    sentences_json, [{'sentence': <sentence>, 'id': '<id>_<sentences_idx>'}, ...]
    Saves sentences_json in json format at config['sentences_path'] and
    stores the same list in config['sentences'].
    @param:
        config: dictionary with the keys 'data' (input path) and
                'sentences_path' (output path); the key 'sentences' is
                (over)written by this function.
    @return:
        None
    """
    df = pd.read_json(config['data'], lines=True)
    exploded = df[['sentences', 'sentences_idx', 'id']].explode(['sentences', 'sentences_idx'])

    # read_json infers dtypes, so numeric-looking document ids are loaded as
    # integers; cast to str so the '_' concatenation cannot raise a TypeError.
    sentence_ids = exploded['id'].astype(str) + '_' + exploded['sentences_idx'].astype(str)

    sentences_json = [
        {"sentence": sentence, 'id': sentence_id}
        for sentence, sentence_id in zip(exploded['sentences'].tolist(), sentence_ids.tolist())
    ]

    with open(config['sentences_path'], 'w', encoding='utf-8') as f:
        json.dump(sentences_json, f)

    # Later pipeline steps read the sentences back from the shared config.
    config['sentences'] = sentences_json
def extract_narratives(config):
    """
    Runs the narrative extraction on the co-referenced sentences.

    Builds a Narratives object for config['output_folder'] and calls its
    create_narratives method with config['sentences'] and save=True.
    The result is presumably saved as final_narratives.jsonl in the output
    folder (that is the file merge_with_input reads) - confirm in Narratives.
    @param:
        config: dictionary with the keys 'output_folder' and 'sentences'
                (sentences_json as stored by create_sentences_json)
    @return:
        None
    """
    extractor = Narratives(output_folder=config['output_folder'])
    sentences_json = config['sentences']
    extractor.create_narratives(sentences_json=sentences_json, save=True)
def merge_with_input(config):
    """
    Joins the extracted narratives with the metadata of the original data.

    Loads the input JSON-lines file, drops the text columns ('content',
    'clusters', 'sentences', 'sentences_idx') when present, and merges the
    remaining columns into <output_folder>/final_narratives.jsonl. No join key
    is given, so pandas joins on the columns the two frames have in common.
    The result is written as JSON lines to
    <output_folder>/<input file stem>_final_narratives.jsonl
    @param:
        config: dictionary with the keys 'data' and 'output_folder'
    @return:
        None
    """
    data_path = config['data']
    output_folder = config['output_folder']

    # Keep only the document-level columns of the input.
    text_columns = ['content', 'clusters', 'sentences', 'sentences_idx']
    metadata = pd.read_json(data_path, lines=True).drop(columns=text_columns, errors='ignore')

    narratives_path = '{}/final_narratives.jsonl'.format(output_folder)
    narratives = pd.read_json(narratives_path, lines=True)

    # Merge keys from the original data
    merged = narratives.merge(metadata)

    output_narrative_path = '{}/{}_final_narratives.jsonl'.format(output_folder, Path(data_path).stem)
    merged.to_json(output_narrative_path, orient='records', lines=True)
def run_factiva():
    """
    Runs the full pipeline (sentence extraction, narrative extraction,
    merge with the input) on the factiva data under data/factiva.
    @return:
        None
    """
    config = {
        'data': 'data/factiva/_first_nations.jsonl',
        'sentences_path': 'data/factiva/coref_sentences.json',
        'sentences': None,  # filled in by create_sentences_json
        'output_folder': 'data/factiva',
    }
    # The stages share the config dictionary and must run in this order.
    for stage in (create_sentences_json, extract_narratives, merge_with_input):
        stage(config)
def run_hansard():
    """
    Runs the full pipeline (sentence extraction, narrative extraction,
    merge with the input) on the hansard data under data/hansard.
    @return:
        None
    """
    config = {
        'data': 'data/hansard/_first_nations.jsonl',
        'sentences_path': 'data/hansard/coref_sentences.json',
        'sentences': None,  # filled in by create_sentences_json
        'output_folder': 'data/hansard',
    }
    # The stages share the config dictionary and must run in this order.
    for stage in (create_sentences_json, extract_narratives, merge_with_input):
        stage(config)
if __name__ == '__main__':
    # run_factiva()  # data not shared
    # run_hansard()

    # sample data
    config = {
        'data': 'data/hansard_sample/_first_nations_sample.jsonl',
        'sentences_path': 'data/hansard_sample/coref_sentences.json',
        'sentences': None,  # filled in by create_sentences_json
        'output_folder': 'data/hansard_sample',
    }
    # Same three stages as run_factiva / run_hansard, applied to the sample.
    for stage in (create_sentences_json, extract_narratives, merge_with_input):
        stage(config)