-
Notifications
You must be signed in to change notification settings - Fork 0
/
header_surfer.py
49 lines (36 loc) · 1.26 KB
/
header_surfer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
class HeadersSurfer(object):
    """Fetch web pages and extract header elements, text and links.

    HTML is parsed with BeautifulSoup using the 'lxml' parser.  Extracted
    strings can be written out through the handler given to the constructor.
    """

    def __init__(self, file_handler):
        """Remember the output handler.

        :param file_handler: object whose ``write`` method is called by
            :meth:`write_to_txt`.
        """
        self.handler = file_handler

    def get_page(self, url, limits=None, timeout=30):
        """Download ``url`` and return the response ``content``.

        :param url: address passed to ``requests.get``.
        :param limits: optional ``(start, stop)`` pair; when truthy, only
            ``content[start:stop]`` is returned.
        :param timeout: value forwarded as ``timeout`` to ``requests.get``
            so a stalled server cannot block the caller indefinitely.
            Pass ``None`` to wait without a limit.
        :returns: the response ``content``, sliced when ``limits`` is given.
        """
        body = requests.get(url, timeout=timeout).content
        if limits:
            return body[limits[0]:limits[1]]
        return body

    @staticmethod
    def _find_all(markup, tag, filters):
        """Parse ``markup`` and return every element matching ``tag``.

        ``filters`` is the dict of keyword filters forwarded to the search.
        It is taken as a plain dict so that no filter name can clash with
        this helper's own parameter names.
        """
        # Imported here because only this helper needs the Tag type.
        from bs4 import Tag

        if isinstance(markup, Tag):
            # Already-parsed elements (e.g. the value returned by get_area)
            # are serialised back to markup so they can be parsed again.
            markup = str(markup)
        return BeautifulSoup(markup, 'lxml')(tag, **filters)

    def get_area(self, content, tag, **kwargs):
        """Return the first ``tag`` element found in ``content``.

        :param content: HTML markup to parse.
        :param tag: tag name to search for.
        :param kwargs: extra filters forwarded to the BeautifulSoup search.
        :raises IndexError: if nothing matches.
        """
        return self._find_all(content, tag, kwargs)[0]

    def get_header_blocks(self, area, tag, **kwargs):
        """Return all ``tag`` elements found in ``area``.

        :param area: HTML markup, or an element returned by
            :meth:`get_area`.
        :param tag: tag name to search for.
        :param kwargs: extra filters forwarded to the BeautifulSoup search.
        """
        return self._find_all(area, tag, kwargs)

    def get_text(self, area, tag, index=0, **kwargs):
        """Return the ``text`` of the ``index``-th ``tag`` match in ``area``.

        :param area: HTML markup, or an element returned by
            :meth:`get_area`.
        :param tag: tag name to search for.
        :param index: position of the wanted match (default: the first).
        :param kwargs: extra filters forwarded to the BeautifulSoup search.
        :raises IndexError: if there is no match at ``index``.
        """
        return self._find_all(area, tag, kwargs)[index].text

    def get_href(self, area, tag, **kwargs):
        """Return the ``href`` attribute of the first ``tag`` match in ``area``.

        :param area: HTML markup, or an element returned by
            :meth:`get_area`.
        :param tag: tag name to search for.
        :param kwargs: extra filters forwarded to the BeautifulSoup search.
        :returns: the result of ``.get('href')`` on the first match.
        :raises IndexError: if nothing matches.
        """
        return self._find_all(area, tag, kwargs)[0].get('href')

    def write_to_txt(self, string):
        """Write ``string`` to the handler supplied at construction."""
        self.handler.write(string)
# Scratch check left over from development, kept disabled.  The print
# statement below uses Python 2 syntax.
# r = requests.get('https://www.wsj.com').text
# print type(r)
if __name__ == '__main__':
    # Manual smoke test, currently disabled: running this file as a script
    # does nothing.
    # NOTE(review): 'fh' is a plain string, not an object with a write()
    # method, so write_to_txt would fail on this instance -- pass an open
    # file (or similar) before re-enabling.
    # surfer = HeadersSurfer('fh')
    # surfer.get_page('https://www.economist.com', limits=(0, 100000))
    pass