-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsolution5.py
84 lines (63 loc) · 2.56 KB
/
solution5.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import operator
from yargy import rule, not_, Parser, and_
from yargy.predicates import normalized
from yargy.predicates import type as yargy_type
from task4.solution4 import get_inclusion_number, find_most_popular, \
find_all_words, read_all_words_from_file, Normalizer
from models.article import get_article_list_from_file, \
get_all_lines_from_articles
def get_all_collocation(lines, word):
    """
    Function for finding all collocations of word and any word after it.

    A collocation here is a two-token match: a token accepted by yargy's
    ``normalized(word)`` predicate, immediately followed by one token whose
    type is neither 'PUNCT' nor 'OTHER'. Every token of a match is passed
    through ``Normalizer.normalise`` and the results are joined with a
    single space.

    :param lines: list of string
        Lines for processing.
    :param word: str
        Word for searching.
    :return:
        List of all valid collocations, in the order they were found
        (duplicates are kept).
    :raises TypeError:
        If ``lines`` is not a list, ``word`` is not a str, or any element
        of ``lines`` is not a str.
    """
    if not isinstance(lines, list) or not isinstance(word, str):
        raise TypeError('lines must be a list and word must be a str')
    # Validate every element up front so that bad input fails before any
    # parsing work is done.
    if not all(isinstance(line, str) for line in lines):
        raise TypeError('every element of lines must be a str')

    gr = rule(normalized(word), and_(not_(yargy_type('PUNCT')),
                                     not_(yargy_type('OTHER'))))
    # The grammar does not depend on the line being processed, so the parser
    # is built once and reused instead of being recreated for every line.
    parser = Parser(gr)

    result_list = []
    for line in lines:
        result_list.extend(
            ' '.join(Normalizer.normalise(token.value)
                     for token in match.tokens)
            for match in parser.findall(line))
    return result_list
def solution5(source_file, result_file, ignore_list_file, n):
    """
    Find the collocations of the most popular words and save their counts.

    The lines of all articles from ``source_file`` are collected, the ``n``
    most popular words (excluding those listed in ``ignore_list_file``) are
    selected, and for each of them every collocation is gathered. The
    collocations are counted and written to ``result_file`` as
    ``<collocation>: <count>`` lines, highest count first.

    If a FileNotFoundError is raised at any step, a message is printed and
    the function returns without re-raising.

    :param source_file: str
        File with articles.
    :param result_file: str
        File for saving answers.
    :param ignore_list_file: str
        File with list of forbidden words.
    :param n: int
        Number of most popular words for searching.
    :raises TypeError:
        If any argument has a wrong type.
    """
    arguments_ok = (isinstance(source_file, str)
                    and isinstance(result_file, str)
                    and isinstance(n, int)
                    and isinstance(ignore_list_file, str))
    if not arguments_ok:
        raise TypeError

    try:
        articles = get_article_list_from_file(source_file)
        lines = get_all_lines_from_articles(articles)

        all_words = find_all_words(lines)
        ignored_words = read_all_words_from_file(ignore_list_file)
        popular = find_most_popular(all_words, ignored_words, n)

        collocation_list = []
        for word in popular:
            collocation_list.extend(get_all_collocation(lines, word))

        # Sort ascending by count, then reverse in place; this keeps the
        # same ordering among equal counts as slicing with [::-1] would.
        ranked = sorted(get_inclusion_number(collocation_list).items(),
                        key=operator.itemgetter(1))
        ranked.reverse()

        with open(result_file, 'wb') as f_out:
            report_rows = [collocation + ': ' + str(number)
                           for collocation, number in ranked]
            f_out.write('\n'.join(report_rows).encode())
    except FileNotFoundError:
        print("can't open file in solution 5")