-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbuild_dict.py
40 lines (36 loc) · 1.98 KB
/
build_dict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import morfeusz2, os, glob, re, emoji, json,sys, copy
# Build a lemma -> emoji dictionary from the text files in `dict_data_folder`.
#
# Every "word followed by a run of emoji" found in the input files is
# recorded under each lemma Morfeusz reports for that word.  The result is
# written to `output_file` as JSON of the form:
#     {lemma: {"emoji_after": [unique emoji names], "combo": [all emoji names]}}

# Morphological analyser used to map an inflected word to its lemma(s).
morf = morfeusz2.Morfeusz(aggl="permissive", praet="composite")

# Regex source matching any single emoji.
# NOTE(review): emoji.get_emoji_regexp() appears to have been removed in
# emoji 2.0 -- presumably this script requires emoji<2.0; confirm the pin.
e = emoji.get_emoji_regexp().pattern

# A word followed by one or more emoji, optionally separated by whitespace.
# Raw f-string: \w and \s must reach the regex engine untouched, and a
# non-raw literal triggers invalid-escape warnings on current Python.
emoji_pattern = re.compile(rf"(\w+?)\s?((?:(?:\s*)?(?:{e})(?:\s*)?)+)")

# Accumulated result (named to avoid shadowing the builtin `dict`).
emoji_dict = {}
dict_data_folder = "dict_data"
output_file = "dict.json"

# Punctuation removed from the input text before matching.
_PUNCTUATION_TABLE = str.maketrans("", "", ".,:-!?")
# Words that are never recorded, whatever emoji follow them.
_STOP_WORDS = frozenset(["i", "lub", "też", "się", "sie", "ale", "nad"])
# Lemmas that are never recorded (any lemma containing "być" is skipped too).
_SKIPPED_LEMMAS = frozenset(["i:i", "i:q", "i:j"])


def _emoji_names(emoji_run):
    """Return the emoji names in *emoji_run*, in order, duplicates kept.

    The run is demojized to ``:name:`` form and split on ``:``.  Fragments
    are stripped first so that whitespace between or after emoji (double
    spaces, blank lines) is never mistaken for a name.  Fragments of one
    character or less and keycap emoji are dropped.
    """
    fragments = (part.strip() for part in emoji.demojize(emoji_run).split(":"))
    return [name for name in fragments if len(name) > 1 and "keycap" not in name]


for filename in glob.glob(f"{dict_data_folder}/*"):
    print(filename)
    if not os.path.isfile(filename):
        continue

    with open(filename, encoding="utf-8") as data_file:
        content = data_file.read().translate(_PUNCTUATION_TABLE)

    for group in emoji_pattern.findall(content):
        # group[0] is the word, group[1] the whole emoji run that follows it.
        word = group[0]
        if word in _STOP_WORDS:
            continue

        emoji_combo = _emoji_names(group[1])
        # Same names, first occurrence only, original order preserved.
        emojis_list = list(dict.fromkeys(emoji_combo))

        # NOTE(review): analyse() can yield several interpretations that share
        # one lemma; each of them appends `emoji_combo` again, so a single
        # occurrence may be counted more than once in "combo" -- confirm
        # whether that weighting is intended.
        for meaning in morf.analyse(word):
            base_word = meaning[2][1]
            if "być" in base_word or base_word in _SKIPPED_LEMMAS:
                continue

            entry = emoji_dict.get(base_word)
            if entry is None:
                # Fresh lists per entry so later in-place updates never alias.
                emoji_dict[base_word] = {
                    "emoji_after": list(emojis_list),
                    "combo": list(emoji_combo),
                }
            else:
                entry["emoji_after"] = list(
                    dict.fromkeys(entry["emoji_after"] + emojis_list)
                )
                entry["combo"] += emoji_combo

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(emoji_dict, f, ensure_ascii=False)