-
Notifications
You must be signed in to change notification settings - Fork 1
/
create_corpus_vocab.sh
executable file
·83 lines (72 loc) · 2.71 KB
/
create_corpus_vocab.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#!/usr/bin/env bash
#
# Builds KenLM-style vocabulary files from a normalized text corpus:
# one file with every word ordered by frequency, plus one file each for the
# 40k, 80k and 160k most frequent words.
# External tools used by this script include pv and bc.

# Help text shown for -h/--help. It mirrors what the argument parser accepts:
# one positional corpus path and the -t/--target_dir option.
usage="$(basename "$0") <corpus_file> [-h|--help] [-t|--target_dir <string>]
Create vocabulary with 40k/80k/160k most frequent words from corpus file.

Positional parameters:
    corpus_file         absolute path to corpus file containing normalized text

Named parameters:
    -h|--help           show this help text
    -t|--target_dir     absolute path to target directory to save vocabulary
"
#######################################
# Parse the command line.
# Globals:
#   usage       (read)    help text printed for -h/--help
#   target_dir  (written) value given with -t/--target_dir
#   POSITIONAL  (written) array of all arguments that are not options
# Arguments:
#   the script's command-line arguments
# Outputs:
#   help text on stdout for -h/--help; an error on stderr if -t has no value
# Returns:
#   exits 0 after printing help, exits 2 when -t/--target_dir lacks a value
#######################################
parse_args() {
  POSITIONAL=()
  while [[ $# -gt 0 ]]; do
    case "$1" in
      -h | --help)
        # Quoted so the help keeps its line breaks and its "[...]" parts are
        # never treated as glob patterns.
        printf '%s\n' "${usage}"
        exit 0
        ;;
      -t | --target_dir)
        if [[ $# -lt 2 ]]; then
          printf 'error: %s requires a value\n' "$1" >&2
          exit 2
        fi
        target_dir="$2"
        shift 2
        ;;
      *)
        # Not an option: keep it for later as a positional parameter.
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
}

parse_args "$@"
set -- "${POSITIONAL[@]}" # restore positional parameters
#######################################
# Derive the names the rest of the script works with from the corpus path.
# Globals:
#   corpus_file      (written) the corpus path as given
#   corpus_filename  (written) file name of the corpus without directory and
#                              without its last extension
#   vocab_counts     (written) path of the word-count file: same directory as
#                              the corpus, same stem, extension ".counts"
# Arguments:
#   $1 - path to the corpus file
#######################################
derive_corpus_names() {
  corpus_file=$1
  local base=${corpus_file##*/}
  # Everything in front of the file name (empty for a bare file name). The
  # extension is stripped from the file name only, so a dot in a directory
  # name can never be mistaken for the start of an extension.
  local dir_prefix=${corpus_file%"${base}"}
  corpus_filename=${base%.*}
  vocab_counts="${dir_prefix}${corpus_filename}.counts"
}

derive_corpus_names "${1:-}"
# Build the word-frequency table ("<count> <word>" per line, most frequent
# first) unless it already exists. Delete the .counts file to force a recount.
if [[ ! -f "${vocab_counts}" ]]; then
  if [[ ! -f "${corpus_file}" || ! -r "${corpus_file}" ]]; then
    printf 'error: corpus file not found or not readable: %s\n' "${corpus_file}" >&2
    exit 1
  fi
  echo "counting word occurrences and saving them in $vocab_counts..."

  # pv only adds the progress bar; fall back to cat when it is not installed.
  if command -v pv > /dev/null 2>&1; then
    corpus_reader=(pv -- "${corpus_file}")
  else
    corpus_reader=(cat -- "${corpus_file}")
  fi

  # Write to a scratch file first so an interrupted or failed run never leaves
  # a truncated .counts file that later runs would silently reuse.
  partial_counts="${vocab_counts}.partial"

  # NOTE(review): GNU tr works on single bytes, so non-ASCII upper-case letters
  # may not be lowercased here - verify for the corpus language.
  if "${corpus_reader[@]}" |
    tr '[:upper:]' '[:lower:]' |  # make everything lowercase
    tr -s '[:space:]' '\n' |      # one word per line
    grep -v '^\s*$' |             # remove empty lines
    grep -Ev '[0-9]' |            # remove words containing numbers
    # awk 'length($0)>1' |        # (disabled) remove words with length 1
    sort | uniq -c | sort -bnr > "${partial_counts}"; then
    mv -- "${partial_counts}" "${vocab_counts}"
  else
    rm -f -- "${partial_counts}"
    printf 'error: could not write word counts to %s\n' "${vocab_counts}" >&2
    exit 1
  fi
  echo '...done!'
fi
# ---------------------------------------------------------------------------
# Vocabulary files and coverage statistics.
# Reads the count file (lines of "<count> <word>", most frequent first) and
# writes <target_dir>/<corpus_filename>.vocab with every word plus one
# <corpus_filename>_<N>k.vocab per vocabulary size, reporting how much of the
# corpus each size covers.
# ---------------------------------------------------------------------------

# Sum the first field (the count) of every line read from stdin.
# Summing in awk avoids building a "+"-joined expression for bc: that
# expression started with "+" whenever the first count was padded with
# spaces, and the POSIX bc grammar has no unary plus. "%.0f" keeps large
# totals in plain integer notation.
sum_counts() {
  awk '{ total += $1 } END { printf "%.0f\n", total }'
}

# Print the second field (the word) of every line read from stdin on a single
# line, each word followed by one space (expected input format for KenLM).
# Taking the field instead of deleting digits leaves the words untouched.
words_on_one_line() {
  awk '{ printf "%s ", $2 }'
}

# Without -t/--target_dir the output paths would start with "/" and land in
# the filesystem root; fall back to the directory that holds the corpus.
if [[ -z "${target_dir:-}" ]]; then
  target_dir=$(dirname -- "${corpus_file}")
fi
mkdir -p -- "${target_dir}" || exit 1

# Total number of word occurrences in the corpus.
total_sum=$(sum_counts < "${vocab_counts}")
echo "total number of words in vocabulary: $total_sum"

# write all words sorted by frequency to vocab file
words_on_one_line < "${vocab_counts}" > "${target_dir}/${corpus_filename}.vocab"

for top_words in 40 80 160; do
  vocab_file="${target_dir}/${corpus_filename}_${top_words}k.vocab"
  n=$((top_words * 1000))
  echo "writing $n top words to $vocab_file"
  head -n "${n}" -- "${vocab_counts}" | words_on_one_line > "${vocab_file}"

  # Occurrences covered by the n most frequent words.
  top_sum=$(head -n "${n}" -- "${vocab_counts}" | sum_counts)
  echo "number of words in vocabulary: $top_sum"

  if [[ -n "${total_sum}" && "${total_sum}" != "0" ]]; then
    fraction=$(echo "scale=2 ; 100 * $top_sum / $total_sum" | bc)
    echo "Top $n words make up $fraction% of words"
  else
    # An empty or missing count file would make bc divide by zero.
    echo "cannot compute coverage: total word count is zero" >&2
  fi
done