-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkorpreader.py
134 lines (100 loc) · 3.94 KB
/
korpreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#!/usr/bin/python
# -*- coding: utf-8 -*-
import ssl
import random
from itertools import islice
from threading import Thread
import re
import json
import html
import ijson.backends.yajl2 as ijson
import urllib.request
import certifi
import time
# Window size used by compose_url_string for the "end" index when no integer
# sample size is requested.
BATCH_SIZE = 5000

# Ten corpus identifiers, KLK_FI_1890 through KLK_FI_1899 (the name suggests
# they serve as placeholder input).
dummy_cp = ["KLK_FI_%d" % year for year in range(1890, 1900)]

# Certificate bundle locations; nothing in the visible part of this file reads them.
# NOTE(review): the usual Debian bundle is named "ca-certificates.crt" (hyphen) — confirm.
ca_certs = "/etc/ssl/certs/ca_certificates.crt"
ca_path = "/home/kanner/anaconda3/ssl/cacert.pem"
def compose_url_string(regex, start, corpus_list, cqp_query, output_cats, context, sort="random", sample_size="all"):
    """Build the Korp ``query`` URL for one request.

    Args:
        regex: Pattern placed inside the CQP expression ``[<cqp_query>="<regex>"]``.
        start: Index of the first hit to request.
        corpus_list: Iterable of corpus identifiers, joined with ``%2C``.
        cqp_query: Token attribute the pattern is matched against.
        output_cats: Attribute names requested through the ``show`` parameter.
        context: ``"sentence"`` requests one sentence of context; any other
            value requests no context.
        sort: When ``"random"``, random ordering is requested together with
            a freshly drawn ``random_seed``.
        sample_size: An integer makes the request end at ``sample_size - 1``;
            any other value ends it at ``start + BATCH_SIZE - 1``.

    Returns:
        The complete URL as a string.
    """
    # regex_decode percent-encodes the CQP expression so it can be embedded
    # in the query string.
    cqp = regex_decode('cqp=[' + cqp_query + '="' + regex + '"]')

    if context == "sentence":
        context_part = "&defaultcontext=1+sentence&defaultwithin=sentence"
    else:
        context_part = "&defaultcontext=0"

    if isinstance(sample_size, int):
        end = sample_size - 1
    else:
        end = int(start + BATCH_SIZE - 1)

    parts = [
        "https://korp.csc.fi/cgi-bin/korp.cgi?command=query",
        "&" + cqp,
        context_part,
        # Metadata categories.
        "&show=" + "%2C".join(output_cats),
        "&start=" + str(start),
        # Sample size.
        "&end=" + str(end),
    ]

    # Randomize the sample; a new seed is drawn on every call.
    if sort == "random":
        parts.append("&sort=random")
        parts.append("&random_seed=" + str(random.randint(1000000, 9999999)))

    # Corpora, comma-separated (already percent-encoded as %2C).
    parts.append("&corpus=" + "%2C".join(corpus_list))
    parts.append("&incremental=true")
    return "".join(parts)
# ASCII characters that get percent-encoded; every other ASCII character
# (including "%", "=", '"' and "+") is passed through untouched.
_RESERVED_ASCII = {
    ",": "%2C",
    "[": "%5B",
    "]": "%5D",
    "*": "%2A",
    ".": "%2E",
    "|": "%7C",
    "(": "%28",
    ")": "%29",
    " ": "%20",
}


def regex_decode(string):
    """Percent-encode a CQP/regex fragment for use in a URL.

    Despite the name, this function encodes. A fixed set of ASCII characters
    (``, [ ] * . | ( )`` and space) is replaced by its ``%XX`` escape, and
    every non-ASCII character is replaced by the percent-encoded bytes of its
    UTF-8 representation. All remaining ASCII characters, ``%`` included, are
    copied unchanged, so sequences that are already escaped survive.

    Args:
        string: Text to encode.

    Returns:
        The encoded string; an empty input yields an empty string.
    """
    encoded = []
    for char in string:
        if char in _RESERVED_ASCII:
            encoded.append(_RESERVED_ASCII[char])
        elif ord(char) < 128:
            encoded.append(char)
        else:
            # UTF-8 for every non-ASCII character keeps the escapes
            # consistent (e.g. "§" becomes %C2%A7, not the Latin-1 %A7).
            encoded.append("".join("%%%02X" % byte for byte in char.encode("utf-8")))
    return "".join(encoded)
def query(regex, corpus_list, cqp_query, output_cats, sample_size="all", context="sentence" ):
    """Run one Korp query and return its first batch of results.

    A single request starting at hit 0 is sent; no further batches are
    fetched. The URL and a progress line are printed to stdout.

    Args:
        regex: Pattern to search for.
        corpus_list: Corpus identifiers to search in.
        cqp_query: Token attribute the pattern is matched against.
        output_cats: Attribute names to request for every token.
        sample_size: Integer cap on the number of hits, or ``"all"``.
        context: Context mode forwarded to ``compose_url_string``.

    Returns:
        A dict with three keys: ``"sample_size"`` is the total hit count,
        capped at ``sample_size`` when that is an integer; ``"full_count"``
        is the total hit count reported by the server; ``"data"`` holds the
        rows produced by ``get_query_round``.
    """
    start = 0
    url_string = compose_url_string(regex, start, corpus_list, cqp_query, output_cats, context, sort="random", sample_size=sample_size)
    print(url_string)
    full_count, rows = get_query_round(url_string, regex, 0, output_cats)

    # Report the requested sample size only when the server has at least
    # that many hits; otherwise report what is actually available.
    if isinstance(sample_size, int) and full_count >= sample_size:
        hits = sample_size
    else:
        hits = full_count

    print("batching", start, "of", hits, "from", regex)
    return {"sample_size" : hits, "full_count" : full_count, "data" : rows }
def get_query_round(url_string, regex, index_starter, output_cats):
    """Fetch one Korp response and convert its KWIC rows.

    Args:
        url_string: Complete request URL.
        regex: Unused; kept so existing callers keep working.
        index_starter: ``id`` given to the first returned row; later rows
            count upwards from it.
        output_cats: Attribute names to extract from every token. A token
            lacking an attribute contributes ``"?"``.

    Returns:
        A two-element list ``[hits, rows]``. ``hits`` is the integer hit
        count from the response. When ``"sentence"`` is in ``output_cats``,
        every row is a dict with the keys ``"id"``, ``"keyword"`` (attribute
        values of the token at the match start), ``"words"`` (attribute
        values of all other tokens, in order) and ``"key_loc"`` (the match
        start index). Otherwise every row is the raw token list.

    Raises:
        KeyError: If the response has no ``"hits"`` or no ``"kwic"`` field.
    """
    # The context manager closes the response even when reading fails.
    with urllib.request.urlopen(url_string) as response:
        items = json.loads(response.read().decode("utf-8"))

    if "hits" not in items:
        print("error in query", url_string)
        raise KeyError("Korp response has no 'hits' field for " + url_string)
    hits = int(items["hits"])

    if "sentence" not in output_cats:
        return [hits, [item["tokens"] for item in items["kwic"]]]

    res = []
    for row_id, item in enumerate(items["kwic"], index_starter):
        tokens = item["tokens"]
        key_loc = item["match"]["start"]
        keyword = tokens[key_loc]
        # Exclude the keyword by position, not by value: a sentence may
        # contain several tokens equal to the keyword token.
        line = [
            [token.get(cat, "?") for cat in output_cats]
            for position, token in enumerate(tokens)
            if position != key_loc
        ]
        res.append({
            "id" : row_id,
            "keyword" : [keyword.get(cat, "?") for cat in output_cats],
            "words" : line,
            "key_loc" : key_loc,
        })
    return [hits, res]