-
Notifications
You must be signed in to change notification settings - Fork 21
/
preprocess_raw.sh
49 lines (35 loc) · 1.52 KB
/
preprocess_raw.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/env bash
#
# Preprocess a raw input file for AMR parsing (AMR v2.0 utilities).
#
# Usage:
#   preprocess_raw.sh RAW_FILE
#
# Prerequisites:
#   * Start a Stanford CoreNLP server before running this script:
#     https://stanfordnlp.github.io/CoreNLP/corenlp-server.html
#   * The compound file (joints.txt) is downloaded from
#     https://github.com/ChunchuanLv/AMR_AS_GRAPH_PREDICTION/blob/master/data/joints.txt
#     and must be located at ${COMPOUND_FILE}.
#
# Result:
#   RAW_FILE.raw.features.preproc
#   Intermediate RAW_FILE.raw.features*.input_clean* files are removed at the
#   end of the run.
#
# NOTE(review): the script relies on each Python step writing its output next
# to its input with a fixed suffix (.raw, .features, .input_clean,
# .recategorize, .nosense) -- confirm against the Python modules if they change.
set -euo pipefail

# Directory where intermediate utils will be saved to speed up processing.
readonly UTIL_DIR=data/AMR/amr_2.0_utils
# Compound (joints) file consumed by the feature annotator.
readonly COMPOUND_FILE="${UTIL_DIR}/joints.txt"
# Package that hosts all preprocessing modules invoked below.
readonly PREPROCESS_PKG=stog.data.dataset_readers.amr_parsing.preprocess

# Print a usage message to stderr and abort.
usage() {
  printf 'Usage: %s RAW_FILE\n' "${0##*/}" >&2
  exit 2
}

# Announce the start of a step: "<message>...<current date>".
# The date is passed as an argument, never as part of the format string.
begin_step() {
  printf '%s...%s\n' "$1" "$(date)"
}

# Announce the end of a step: "Done.<current date>" followed by a blank line.
end_step() {
  printf 'Done.%s\n\n' "$(date)"
}

# Run the whole preprocessing pipeline on the file given as $1.
main() {
  # Refuse to run without an input path: an empty prefix would make the
  # final cleanup glob match unrelated files in the current directory.
  if (( $# < 1 )) || [[ -z "$1" ]]; then
    usage
  fi

  local -r raw_file=$1
  # AMR data with **features**
  local -r test_data="${raw_file}.raw.features"

  python prepare_raw.py "${raw_file}"

  python -u -m "${PREPROCESS_PKG}.feature_annotator" \
    "${raw_file}.raw" \
    --compound_file "${COMPOUND_FILE}"

  begin_step "Cleaning inputs"
  python -u -m "${PREPROCESS_PKG}.input_cleaner" \
    --amr_files "${test_data}"
  end_step

  begin_step "Recategorizing subgraphs"
  python -u -m "${PREPROCESS_PKG}.text_anonymizor" \
    --amr_file "${test_data}.input_clean" \
    --util_dir "${UTIL_DIR}"
  end_step

  begin_step "Removing senses"
  python -u -m "${PREPROCESS_PKG}.sense_remover" \
    --util_dir "${UTIL_DIR}" \
    --amr_files "${test_data}.input_clean.recategorize"
  end_step

  begin_step "Renaming preprocessed files"
  mv -- "${test_data}.input_clean.recategorize.nosense" "${test_data}.preproc"
  # Only the prefix is quoted so the trailing glob still expands; ':?' aborts
  # if the prefix is ever empty.
  rm -- "${test_data:?}"*.input_clean*
}

main "$@"