-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathraw_to_amconll.py
64 lines (51 loc) · 2.8 KB
/
raw_to_amconll.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#
# Copyright (c) 2020 Saarland University.
#
# This file is part of AM Parser
# (see https://github.com/coli-saar/am-parser/).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from graph_dependency_parser.components.dataset_readers.amconll_tools import from_raw_text
from graph_dependency_parser.components.spacy_interface import spacy_tokenize
import argparse
parser = argparse.ArgumentParser(description="""Take raw text and generate an empty amconll file that can then be parsed.
NOTE: this is not the method we used to prepare the test set, so results may differ.
If you want to parse the test set (or see what kind of pre-processing is done there),
have a look at scripts/predict.sh
""")
parser.add_argument('text_file', type=str, help='Path to raw text file. We assume one sentence per line.')
parser.add_argument("output_file",type=str, help="Path to amconll file that will be written.")
parser.add_argument("formalism",type=str,help="Which formalism to prepare the amconll for. For instance, DM, PAS, PSD need an additional 'artificial token' at the end of each sentence. Available are: DM, PAS, PSD, EDS, AMR.")
parser.add_argument('--no-tok',action="store_true",
help="Don't use spacy tokenizer and split on spaces.")
args = parser.parse_args()
requires_art_root = {"DM": True, "PAS": True, "PSD": True, "EDS": False, "AMR": False}
requires_ne_merging = {"DM": False, "PAS": False, "PSD": False, "EDS": False, "AMR": True}
if args.formalism not in requires_art_root:
raise ValueError(f"Do not recognize the formalism {args.formalism}; known are: {sorted(requires_art_root.keys())}")
sents = []
with open(args.text_file) as f:
for sentence in f:
if args.no_tok:
sentence = sentence.strip()
words = sentence.split(" ")
else:
words = spacy_tokenize(sentence)
am_sentence = from_raw_text(sentence.rstrip("\n"), words, requires_art_root[args.formalism], dict(),
requires_ne_merging[args.formalism])
sents.append(am_sentence)
with open(args.output_file, "w") as f:
for s in sents:
f.write(str(s))
f.write("\n\n")