-
Notifications
You must be signed in to change notification settings - Fork 1
/
generate_kosh_files.py
executable file
·68 lines (54 loc) · 2.57 KB
/
generate_kosh_files.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import os
import json
def generate_dot_kosh(path_to_freedict, item):
filename = "{}/{}/.kosh".format(path_to_freedict, item)
with open(filename, mode="w") as writer:
# [acc]
writer.write('[{}]'.format(item.replace('-', '_')))
writer.write('\n')
# the only exception is eng-pol where all the entries are located in different XML files ([a..z].xml)
if item == 'eng-pol':
# list all the files in curr_dir
list_of_xml_files = sorted(os.listdir(os.path.join(path_to_freedict, item, 'letters')))
list_of_xml_files = ['letters/' + x for x in list_of_xml_files]
writer.write('files: {}'.format(json.dumps(list_of_xml_files)))
writer.write('\n')
else:
writer.write('files: ["{}.tei"]'.format(item))
writer.write('\n')
# schema: kosh_dict_mapping.json
# abs_path_to_gen_dict_kosh_files = '{}/{}/'.format(abs_path_to_gen_kosh_files, dict)
writer.write('schema: kosh_{}_mapping.json'.format(item))
writer.write("\n")
def generate_mapping(path_to_freedicts, item):
mappings = {}
mappings['mappings'] = {}
xpaths = {}
xpaths['_xpaths'] = {}
## add objects to xpaths
xpaths['_xpaths']['root'] = '//tei:entry'
xpaths['_xpaths']['id'] = 'None'
# fields to be indexed.
# Kosh per default indexes the whole xml entry. NOTE: xml tags and attrs are not indexed.
fields = {}
fields['headword'] = './tei:form/tei:orth'
fields['[translation]'] = ".//tei:sense/tei:cit[@type='trans']/tei:quote"
xpaths['_xpaths']['fields'] = fields
mappings['mappings']['_meta'] = xpaths
properties = {}
properties['headword'] = {}
properties['headword']['type'] = 'keyword'
properties['translation'] = {}
properties['translation']['type'] = 'text'
mappings['mappings']['properties'] = properties
## The 'properties' object provides elastic search with info for manipulating the index
# strings classified as 'keyword' will not be analyzed i.e. they will not be tokenized or lowercased.
# strings classified as 'text' will be analyzed
with open('{}/{}/kosh_{}_mapping.json'.format(path_to_freedicts, item, item), 'w') as outfile:
json.dump(mappings, outfile, indent=4)
def generate_all_kosh_files(path_to_freedict):
for item in os.listdir(path_to_freedict):
if os.path.isdir(os.path.join(path_to_freedict, item)):
# we create a .kosh file for each dict
generate_dot_kosh(path_to_freedict, item)
generate_mapping(path_to_freedict, item)