import glob
import pickle
# import socket
# host_name = socket.gethostname()
# print(host_name)

from numpy import load as np_load
from pynlpl.lm import lm
from clotho_dataloader.data_handling.my_clotho_data_loader import create_dictionaries

__author__ = "Thomas Pellegrini - 2020"

data_dir = '../clotho-dataset/data/'

# Clotho vocabulary: word list (4367 word types), word frequencies, and the
# index <-> word dictionaries used to decode the caption index arrays
WORD_LIST = pickle.load(open(data_dir + "/words_list.p", "rb"))
WORD_FREQ = pickle.load(open(data_dir + "/words_frequencies.p", "rb"))
word2index, index2word = create_dictionaries(WORD_LIST)
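
# Minimal sanity check, assuming WORD_LIST is an indexable list of word strings:
# the word2index and index2word dictionaries returned by create_dictionaries
# should invert each other, since index2word is used below to decode captions.
_example_word = WORD_LIST[0]
assert index2word[word2index[_example_word]] == _example_word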

def gather_captions_to_text(caption_dir, out_fpath):
    """Dump all captions of a Clotho subset (one .npy record array per caption)
    to a plain-text file, one caption per line, for LM training."""
    fh = open(out_fpath, 'wt')
    i = 0
    for npy_fpath in glob.glob(caption_dir + '/*.npy'):
        recarray = np_load(str(npy_fpath), allow_pickle=True)
        word_indices_list = recarray['words_ind'][0]
        # print(word_indices_list)
        # map word indices back to word strings
        word_str_list = [index2word[w] for w in word_indices_list]
        # word_str = ' '.join(word_str_list).replace('<sos> ', '')
        word_str = ' '.join(word_str_list)
        # print(npy_fpath, word_str)
        fh.write(word_str + '\n')
        i += 1
        # if i == 2: break
    fh.close()
    print("wrote %d lines to file" % i)

# choose the subset whose captions are dumped to text ('dev' or 'eva')
# subset = 'clotho_dataset_dev'
subset = 'clotho_dataset_eva'
caption_dir = data_dir + subset

# out_fpath = '../clotho-dataset/lm/dev.txt'
out_fpath = '../clotho-dataset/lm/eva.txt'

# uncomment to (re)generate the text file:
# gather_captions_to_text(caption_dir, out_fpath)
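
# A minimal sketch of generating both text files in one go (same subset names
# and output paths as above); left commented out, like the call above, since
# the files only need to be written once.
# for _subset, _txt_fpath in [('clotho_dataset_dev', '../clotho-dataset/lm/dev.txt'),
#                             ('clotho_dataset_eva', '../clotho-dataset/lm/eva.txt')]:
#     gather_captions_to_text(data_dir + _subset, _txt_fpath)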

# Building a 3-gram LM with SRILM (Kneser-Ney discounting, interpolated):
# https://cmusphinx.github.io/wiki/tutoriallm/#training-an-arpa-model-with-srilm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram-count -kndiscount -interpolate -text ../clotho-dataset/lm/dev.txt -lm ../clotho-dataset/lm/dev.lm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev.lm -ppl ../clotho-dataset/lm/eva.txt
# file ../clotho-dataset/lm/eva.txt: 5225 sentences, 64350 words, 0 OOVs
# 0 zeroprobs, logprob= -114470.2 ppl= 44.18532 ppl1= 60.09924
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev.lm -prune 1e-8 -write-lm ../clotho-dataset/lm/dev_pruned.lm
# $ ~/tools/kaldi/tools/srilm/lm/bin/i686-m64/ngram -lm ../clotho-dataset/lm/dev_pruned.lm -ppl ../clotho-dataset/lm/eva.txt
# file ../clotho-dataset/lm/eva.txt: 5225 sentences, 64350 words, 0 OOVs
# 0 zeroprobs, logprob= -114658.9 ppl= 44.46208 ppl1= 60.50634
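
# A minimal sketch of wrapping the two SRILM commands above from Python;
# build_srilm_lm and srilm_perplexity are hypothetical helpers, and srilm_bin
# is an assumed install path that should be adapted to the local setup.
import os
import subprocess

def build_srilm_lm(train_txt, lm_out,
                   srilm_bin='~/tools/kaldi/tools/srilm/lm/bin/i686-m64'):
    # train an interpolated Kneser-Ney n-gram LM, as in the ngram-count call above
    binpath = os.path.expanduser(srilm_bin)
    subprocess.run([os.path.join(binpath, 'ngram-count'), '-kndiscount', '-interpolate',
                    '-text', train_txt, '-lm', lm_out], check=True)

def srilm_perplexity(lm_path, test_txt,
                     srilm_bin='~/tools/kaldi/tools/srilm/lm/bin/i686-m64'):
    # print SRILM's perplexity report for test_txt, as in the ngram -ppl call above
    binpath = os.path.expanduser(srilm_bin)
    subprocess.run([os.path.join(binpath, 'ngram'), '-lm', lm_path,
                    '-ppl', test_txt], check=True)

# build_srilm_lm('../clotho-dataset/lm/dev.txt', '../clotho-dataset/lm/dev.lm')
# srilm_perplexity('../clotho-dataset/lm/dev.lm', '../clotho-dataset/lm/eva.txt')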
# lm_path=b'../clotho-dataset/lm/dev.lm'
#
# lm = LM(lm_path, lower=False)
#
#
# print(len([b"man", b"a"]))
# print(lm.logprob_strings(lm, b"is", [b"man", b"a"]))
# load the trained ARPA LM with pynlpl and score a few words in context
lm = lm.ARPALanguageModel('../clotho-dataset/lm/dev.lm')
print("man", lm.scoreword("man"))
print("a man", lm.scoreword("man", history=("a",)))
print("a hyman", lm.scoreword("hyman", history=("a",)))