# xmlp.py
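# Query Yandex.XML for a search phrase, scrape the result pages with
# newspaper3k, lemmatize the texts with pymorphy2, count 2-4-gram
# frequencies, and export the top n-grams to "<query>.csv".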
import ssl

from newspaper import Article, Config
from newspaper.article import ArticleException
from pymorphy2 import MorphAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

from mxml import YaSearch
with open("ini.txt", "r", encoding="utf-8") as f:
infa = f.readlines()
# infa= infa.strip("/n")
f.close()
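# y.lem is a lemma list exposed by the project's mxml module (presumably the
# lemmatized query terms); it is appended to the CSV rows at the end.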
y = YaSearch(infa[0], infa[1])
results = y.search(infa[2], page=1)
if "\n" in y.lem:
y.lem.remove("\n")
if " " in y.lem:
y.lem.remove(" ")
#print(y.lem)
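# Disable HTTPS certificate verification globally so downloads from hosts
# with broken certificate chains don't fail (a pragmatic scraping hack).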
ssl._create_default_https_context = ssl._create_unverified_context
urls = []
if results.error is None:
    for result in results.items:
        print(result.url)
        urls.append(result.url)
else:
    print("Yandex.XML error:", results.error)
# Domains to skip before scraping.
stop_urls = ['wikipedia.org', 'vk.com', 'facebook.com', 'yandex.ru']
clear_urls = [u for u in urls if all(s not in u for s in stop_urls)]
print("=================================")
print(clear_urls)
stroki = []  # collected article texts
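# Some hosts reject the default python user agent, so present a desktop
# Chrome UA; the 10 s timeout keeps one slow site from stalling the run.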
user_agent = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent
config.request_timeout = 10
for url in clear_urls:
    article = Article(url, config=config)
    try:
        article.download()
        article.parse()
    except ArticleException:
        continue  # skip pages that fail to download or parse
    textm = article.text.replace('\n', ' ')  # flatten to a single line per article
    stroki.append(textm)
with open("text.txt", "w", encoding='utf-8') as file:
for kk in stroki:
file.write(kk)
file.close()
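# Re-read the scraped texts and reduce every word to its dictionary form
# (normal_form) so that inflected variants collapse into the same n-gram.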
m = MorphAnalyzer()
texts = [z.rstrip() for z in open('text.txt', encoding='utf-8')]
stop_words = [z.rstrip() for z in open('water.txt', encoding='utf-8')]  # filler ("water") words to ignore
cvn = CountVectorizer(ngram_range=(2, 4), stop_words=stop_words)
words_nf = [' '.join(m.parse(word)[0].normal_form for word in x.split()) for x in texts]
ngram_matrix = cvn.fit_transform(words_nf)
vb = cvn.vocabulary_
count_values = ngram_matrix.toarray().sum(axis=0)  # total count of each n-gram over all texts
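# Keep the ten most frequent n-grams, highest count first.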
ngrams = {}
for ng_count, ng_text in sorted(((count_values[i], k) for k, i in vb.items()), reverse=True)[:10]:
    ngrams[ng_text] = str(ng_count)
    print(ng_text, ng_count, sep='\t')
a = []
for k, v in ngrams.items():
    a.append(k + ";" + v)
# y.lem and the top-n-gram list may differ in length, so pair the columns
# only up to the shorter of the two to avoid an IndexError.
for i in range(min(len(a), len(y.lem))):
    a[i] = a[i] + ";" + y.lem[i]
print(a)
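# One "ngram;count[;query-lemma]" row per line; the CSV is named after the query.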
with open(infa[2] + '.csv', 'w', encoding="utf-8") as file:
    for row in a:
        file.write(row + "\n")
# Alternative export via pandas:
# import pandas as pd
# pd.DataFrame.from_dict(ngrams, orient='index').to_csv(infa[2] + '.csv')