utils.py
import os
import re

import pandas as pd
import spacy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_spans
def make_docs(folder, doc_list):
    """
    Take a list of texts and annotations and transform them into spaCy documents.
    folder: folder containing .txt and .out files (for this function to work,
        a folder with the same name must exist in the ../annotated directory)
    doc_list: list of documents to append to
    """
    nlp = spacy.load('ru_core_news_lg')
    raw_dir = 'data/bsnlp2021_train_r1/raw/{folder}/ru'.format(folder=folder)
    for filename in os.listdir(raw_dir):
        # The annotation file shares the raw file's stem, with .out in place of .txt.
        # on_bad_lines='skip' replaces the deprecated error_bad_lines=False.
        df = pd.read_csv(
            'data/bsnlp2021_train_r1/annotated/{folder}/ru/{stem}{out}'.format(
                folder=folder, stem=filename[:-3], out='out'),
            skiprows=1, header=None, sep='\t', encoding='utf8',
            on_bad_lines='skip', engine='python')
        with open('{raw_dir}/{filename}'.format(raw_dir=raw_dir, filename=filename),
                  "r", encoding='utf8') as fh:
            text = fh.read()
        list_words = df.iloc[:, 0].tolist()
        labels = df.iloc[:, 2].tolist()
        # Collect every occurrence of every annotated phrase as [start, end, label];
        # re.escape keeps phrases containing regex metacharacters from breaking the search.
        entities = []
        for n in range(len(list_words)):
            for m in re.finditer(re.escape(list_words[n].strip()), text):
                entities.append([m.start(), m.end(), labels[n]])
        # Resolve overlaps: when two spans share a start or end offset, or one
        # starts inside the other, keep the longer span and blank out the shorter.
        for i in range(len(entities)):
            if len(entities[i]) == 3:
                for j in range(i + 1, len(entities)):
                    if len(entities[j]) == 3 and len(entities[i]) == 3:
                        if entities[i][0] == entities[j][0] or entities[i][1] == entities[j][1]:
                            if (entities[i][1] - entities[i][0]) >= (entities[j][1] - entities[j][0]):
                                entities[j] = ''
                            else:
                                entities[i] = ''
                    if len(entities[j]) == 3 and len(entities[i]) == 3:
                        if entities[i][0] in range(entities[j][0] + 1, entities[j][1]):
                            entities[i] = ''
                        elif entities[j][0] in range(entities[i][0] + 1, entities[i][1]):
                            entities[j] = ''
        entities_cleared = [e for e in entities if len(e) == 3]
        # Convert the character offsets to BILUO tags, then to entity spans on the doc.
        doc = nlp(text)
        tags = offsets_to_biluo_tags(doc, entities_cleared)
        entities_x = biluo_tags_to_spans(doc, tags)
        doc.ents = entities_x
        doc_list.append(doc)
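
# A minimal usage sketch for make_docs, kept out of the import path. It assumes
# the bsnlp2021 folder layout hard-coded above and an installed 'ru_core_news_lg'
# model; the folder name 'ryanair' and the output path 'train.spacy' are
# assumptions, not part of the original code.
def _demo_make_docs():
    from spacy.tokens import DocBin  # serialization format consumed by `spacy train`
    docs = []
    make_docs('ryanair', docs)  # 'ryanair' is a hypothetical subfolder name
    doc_bin = DocBin(docs=docs)
    doc_bin.to_disk('train.spacy')  # hypothetical output path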
def collecting_jsonl(folder, text_list):
    """
    Take raw texts from the folder and append json-formatted elements
    to text_list for creating a pretraining corpus.
    folder: folder with .txt files
    text_list: list of texts to append to
    """
    for filename in os.listdir('data/bsnlp2021_train_r1/raw/{folder}/ru'.format(folder=folder)):
        with open('data/bsnlp2021_train_r1/raw/{folder}/ru/{filename}'.format(
                folder=folder, filename=filename), "r", encoding='utf8') as fh:
            text = fh.read()
        text_list.append({"text": text})
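
# A minimal sketch of writing the collected records to a JSON Lines file, the
# format expected by `spacy pretrain`; the output filename
# 'pretrain_corpus.jsonl' and the folder name are assumptions.
def _demo_collecting_jsonl():
    import json
    texts = []
    collecting_jsonl('ryanair', texts)  # hypothetical subfolder name
    with open('pretrain_corpus.jsonl', 'w', encoding='utf8') as out:
        for record in texts:
            out.write(json.dumps(record, ensure_ascii=False) + '\n')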
def split_texts(bigList, number):
    """
    Take a list of spaCy docs and split every document longer than
    'number' tokens into two parts at a sentence boundary near the middle.
    bigList: list of documents to split
    number: token-count threshold above which a document is split
    return bigList: a copy of the input list in which every split document
        is replaced by None
    newList: list with the resulting halves (as spaCy Span objects)
    """
    newList = []
    bigList = bigList.copy()
    for i in range(len(bigList)):
        if len(bigList[i]) > number:
            el = bigList[i]
            bigList[i] = None
            # Look for a sentence-ending token within 20 tokens of the midpoint;
            # if none is found, the document is dropped without being split.
            for token in el:
                if token.text in ('.', '?', '!') and token.i in range((len(el) // 2) - 20, (len(el) // 2) + 20):
                    x = el[0:token.i]
                    y = el[token.i + 1: len(el)]
                    newList.append(x)
                    newList.append(y)
                    break
    return bigList, newList
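
# A minimal sketch of applying split_texts with a 511-token threshold (a typical
# transformer window size; the exact value is an assumption). The returned
# halves are Span objects, so as_doc() turns each back into a standalone Doc
# before merging the two lists.
def _demo_split_texts(docs):
    kept, halves = split_texts(docs, 511)
    kept = [d for d in kept if d is not None]  # drop the None placeholders
    return kept + [span.as_doc() for span in halves]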