-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparser.py
68 lines (56 loc) · 2.23 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import csv
import urllib.request
from bs4 import BeautifulSoup
from random import choice, uniform
import openpyxl
def get_html(url, useragent=None, proxy=None):
    """Download *url* and return the raw response body.

    Args:
        url: Address to fetch.
        useragent: Optional mapping of HTTP request headers (the caller in
            this file passes ``{'User-Agent': ...}``). ``None`` means no
            extra headers are sent.
        proxy: Optional mapping of URL scheme to proxy URL, handed to
            ``urllib.request.ProxyHandler``. With ``None`` urllib falls back
            to the proxy settings of the environment.

    Returns:
        The response body as ``bytes``.

    Raises:
        Any error urllib raises while opening the URL (for example
        ``urllib.error.URLError``) propagates to the caller.
    """
    proxy_support = urllib.request.ProxyHandler(proxy)
    opener = urllib.request.build_opener(proxy_support)
    # Kept from the original behaviour: the opener is also installed as the
    # process-wide default used by urllib.request.urlopen().
    urllib.request.install_opener(opener)

    # Request() iterates over the headers mapping, so the None default has
    # to be turned into an empty mapping instead of being passed through.
    headers = useragent if useragent is not None else {}
    req = urllib.request.Request(url, headers=headers)

    # The context manager closes the connection once the body is read.
    with opener.open(req) as response:
        return response.read()
def parse(html):
    """Parse one listing page and return its content container.

    Side effect: the module-level global ``pages`` is overwritten with the
    text of the second-to-last element carrying the ``page-numbers`` class
    (presumably the highest page number, with the last element being a
    "next" link -- confirm against the site markup).

    Args:
        html: Markup of the page, as accepted by ``BeautifulSoup``.

    Returns:
        The ``<div id="content">`` element, or ``None`` if the page has no
        such element.

    Raises:
        IndexError: if the page has fewer than two ``page-numbers`` elements.
    """
    global pages
    document = BeautifulSoup(html, features="lxml")
    pagination = document.find_all(attrs={'class': "page-numbers"})
    pages = pagination[-2].string
    return document.find("div", id="content")
def _read_nonempty_lines(path):
    """Return the stripped, non-blank lines of the text file at *path*."""
    with open(path) as source:
        return [line.strip() for line in source if line.strip()]


def _fetch_page(url, useragents, proxies):
    """Download *url* with a random user agent and proxy, retrying until it works."""
    while True:
        headers = {'User-Agent': choice(useragents)}
        proxy = {'https': 'https://' + choice(proxies)}
        try:
            return get_html(url, useragent=headers, proxy=proxy)
        except Exception:
            # Deliberate best-effort: a failed attempt is retried with a new
            # random agent/proxy pair. Catching Exception instead of using a
            # bare except lets Ctrl-C (KeyboardInterrupt) and SystemExit
            # stop the otherwise endless loop.
            continue


def main():
    """Scrape every listing page of fobook.ru into ``output.xlsx``.

    User agents are read from ``useragents.txt`` and proxies from
    ``proxies.txt`` (one entry per line, blank lines ignored). For every
    ``<article>`` on every page, the text of its first link, the text of its
    second link and the text of its ``<h3 class="author">`` are printed and
    written to columns A-C ("Books", "Level", "Author") of the active sheet
    of the existing workbook ``output.xlsx``, which is saved at the end.

    Raises:
        ValueError: if either list file contains no usable entries (the
            retry loop could never succeed in that case).
    """
    useragents = _read_nonempty_lines('useragents.txt')
    proxies = _read_nonempty_lines('proxies.txt')
    if not useragents or not proxies:
        raise ValueError(
            "useragents.txt and proxies.txt must each contain at least one entry"
        )

    # Parsing code. The first page is fetched up front because parse()
    # publishes the total page count through the module-level global `pages`.
    first_table = parse(
        _fetch_page("https://fobook.ru/page/1/", useragents, proxies)
    )
    total_pages = int(pages)

    wb = openpyxl.load_workbook(filename='output.xlsx')
    sheet = wb.active
    sheet['A1'] = 'Books'
    sheet['B1'] = 'Level'
    sheet['C1'] = 'Author'

    row = 2  # row 1 holds the column headers
    for page_number in range(1, total_pages + 1):
        print("Страница №: {page}".format(page=page_number))
        if page_number == 1:
            # Already downloaded and parsed above; no second request needed.
            table = first_table
        else:
            url = "https://fobook.ru/page/{page}/".format(page=page_number)
            table = parse(_fetch_page(url, useragents, proxies))

        for item in table.find_all("article"):
            links = item.find_all('a')
            title = links[0].string
            level = links[1].string
            author = item.find("h3", class_="author").string
            print(title, "-", level, '( Author:', author, ')')
            sheet.cell(row=row, column=1).value = title
            sheet.cell(row=row, column=2).value = level
            sheet.cell(row=row, column=3).value = author
            row += 1
        # NOTE(review): the source lost its indentation; the blank separator
        # line is assumed to follow each page and the save to follow the loop.
        print()

    wb.save('output.xlsx')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()