-
Notifications
You must be signed in to change notification settings - Fork 0
/
init.py
118 lines (93 loc) · 2.99 KB
/
init.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re
import codecs
import sqlite3
# One symbol of an analysis line.  findall() yields (token1, token2, single)
# triples: either a "token1:token2" pair, where each token is a single letter
# or something in <>, or a lone letter (third slot, the other two are empty).
# Compiled once here instead of once per analysis line.
RE_TRANS = re.compile(u'([a-zA-Zäöüß]|<.*?>):([a-zA-Zäöüß]|<.*?>)|([a-zA-Zäöüß])')

# After every COMMIT_INTERVAL-th accepted word a progress line is printed and
# the pending inserts are committed.
COMMIT_INTERVAL = 10000


def parse_analysis(line, word):
    """Turn one analysis line into ``(morph, tag, feats)``.

    line  -- analysis text, already stripped.
    word  -- the surface word the analysis belongs to; only its length is
             used, to decide whether a '#' boundary is still "inside" the word.

    Returns the segmented form (letters with '#' boundaries), the first
    '<+...>' token with '<', '+' and '>' removed (None if there was none),
    and the list of '<...>' tokens seen after that tag, without the brackets.
    """
    morph = ''
    tag = None
    feats = []
    length = 0
    for token1, token2, single in RE_TRANS.findall(line):
        if single:
            # Single letter matches: simply append it.
            morph += single
            length += 1
        elif token1 == '<>':
            # Token one is empty: append token2 (e.g. a suffix).
            morph += token2
            length += 1
        elif token1.startswith('<'):
            # Collect morph info: the first <+...> token is the tag, every
            # bracketed token after it is a feature.
            if tag:
                feats.append(token1.replace('<', '').replace('>', ''))
            elif token1.startswith('<+'):
                tag = token1
            # NOTE(review): the source this was restored from had lost its
            # indentation; the boundary check is assumed to apply to every
            # bracketed token, not only to those seen before the tag -- confirm.
            if length < len(word) and not morph.endswith('#'):
                morph += '#'
            # Mapping of features to letters (length is deliberately left
            # untouched here, as in the original).
            if not token2.startswith('<'):
                morph += token2
        elif token1 and token2 != '<>':
            # Mapping of letters to other letters; a letter mapped to '<>'
            # is removed.
            morph += token2
            length += 1
    if tag:
        tag = tag.replace('<', '').replace('+', '').replace('>', '')
    return morph, tag, feats


def is_wanted(tag, feats):
    """Return True for nouns, adjectives and 3rd-person indicative verbs."""
    if tag in ('NN', 'ADJ'):
        return True
    return tag == 'V' and '3' in feats and 'Ind' in feats


def build_database(lemma_path='lemma.txt', db_path='parsextoto.sqlite'):
    """Rebuild the Wort and Morph tables of *db_path* from *lemma_path*.

    The input is read line by line: a line starting with '>' introduces a new
    word (the text after the first two characters); the following lines are
    its analyses, except lines starting with 'no', which are skipped.  For
    every accepted word one Wort row is written together with one Morph row
    per accepted analysis.

    Both tables are dropped and recreated.  All inserts use bound parameters,
    the final batch is committed, and the file and the connection are closed
    even when an error occurs.
    """
    # Open the input first so that a missing file does not wipe an existing
    # database.
    with codecs.open(lemma_path, encoding='utf-8') as lemma_file:
        connection = sqlite3.connect(db_path)
        try:
            # Enable access to attributes by dictionary keys.
            connection.row_factory = sqlite3.Row
            cursor = connection.cursor()
            cursor.execute("DROP TABLE IF EXISTS Wort")
            cursor.execute("DROP TABLE IF EXISTS Morph")
            cursor.execute('CREATE TABLE Wort (WortID INTEGER PRIMARY KEY, Wort TEXT, POS TEXT, Morpheme TEXT)')
            cursor.execute('CREATE TABLE Morph (WortID INTEGER, Features TEXT)')

            word = None
            word_id = 0
            done = True            # True while the current word is being skipped
            first_analysis = False  # True until the word's Wort row is written
            forms = set()           # every word seen so far

            for line in lemma_file:
                line = line.strip()
                if line.startswith('>'):
                    word = line[2:]
                    # Exclude inflection of extremely long words: a word of 10+
                    # characters is skipped when the word minus its last one
                    # or two characters has already been seen.
                    if len(word) < 10 or (word[0:-1] not in forms and word[0:-2] not in forms):
                        word_id += 1
                        first_analysis = True
                        done = False
                        if word_id % COMMIT_INTERVAL == 0:
                            # Single-argument form prints the same text under
                            # Python 2 and Python 3.
                            print('%s %s' % (word_id, word))
                            connection.commit()
                    else:
                        done = True
                    forms.add(word)
                elif not done and not line.startswith('no'):
                    morph, tag, feats = parse_analysis(line, word)
                    if not tag or not is_wanted(tag, feats):
                        continue
                    if first_analysis:
                        cursor.execute(
                            u'INSERT INTO Wort (WortID, Wort, POS, Morpheme) VALUES (?, ?, ?, ?)',
                            (word_id, word, tag, morph))
                        first_analysis = False
                    cursor.execute(
                        u'INSERT INTO Morph (WortID, Features) VALUES (?, ?)',
                        (word_id, '_'.join(feats)))

            # Without this, everything inserted since the last periodic commit
            # would be discarded when the connection is closed.
            connection.commit()
        finally:
            connection.close()


if __name__ == '__main__':
    build_database()