hindime.py
# -*- coding: utf-8 -*-
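
# Crawler for hindime.net: walks the archive pages of several tech category
# sections, follows each article link, and saves the article title and body in a
# minimal <article> XML wrapper under ./Data/, plus the raw page HTML under ./Raw/.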
import os
import json
import time
from datetime import date, timedelta
from bs4 import BeautifulSoup
import requests
newspaper_base_url = 'https://hindime.net'

# Slugs that correspond to category index pages or utility pages rather than
# individual articles; links whose slug is in this list are skipped.
ignore_list = ["keyword", "micr-code", "swift-code", "pin-code", "tech", "internet", "blogging", "seo",
               "make-money-online", "cryptocurrency", "share-market", "wiki", "privacy-policy", "about",
               "contact"]
categories = ('internet', 'blogging', 'seo', 'cryptocurrency',
              'computer', 'share-market', 'tech', 'make-money-online')

for index in range(1, 18):
    with open("log.txt", "a") as logFile:
        logFile.write(str(index) + "\n")
    for category in categories:
        url = newspaper_base_url + '/' + category + '/page/' + str(index)
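        # Fetch the category archive page; on a network error, move on to the
        # next category rather than aborting the crawl.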
        try:
            print(url)
            archive_response = requests.get(url)
        except requests.exceptions.RequestException:
            print("No response for links in archive, skipping")
            continue
        soup = BeautifulSoup(archive_response.content, "html.parser")
        all_links = soup.find_all("a")
        page_links_length = len(all_links)
        if page_links_length == 0:
            # No links at all on this page; stop scanning the remaining categories for this index.
            break
        else:
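            # Scan every link on the archive page, keeping only article permalinks
            # of the form https://hindime.net/<slug>/ (five "/"-separated tokens)
            # whose slug is not in ignore_list.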
            for link in all_links:
                link_separator = link.get('href')
                try:
                    link_tokens = link_separator.split("/")
                except AttributeError:
                    # Anchor tag without an href attribute.
                    continue
                if len(link_tokens) == 5 and "hindime.net" in link_tokens[2]:
                    if link_tokens[3] in ignore_list:
                        continue
                    article_url = link_separator
                else:
                    continue
                try:
                    print(article_url)
                    article_data = requests.get(article_url).text
                except requests.exceptions.RequestException:
                    print("No response for content in link, skipping")
                    time.sleep(2)
                    continue
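                # Parse the article page: the headline lives in the theme's
                # h1.entry-title and the body in div.td-post-content, with empty
                # strings as fallbacks when either element is missing.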
                article_soup = BeautifulSoup(article_data, "html.parser")
                try:
                    title = article_soup.find("h1", {"class": "entry-title"}).get_text().strip()
                except AttributeError:
                    title = ""
                try:
                    article_content = article_soup.find(
                        "div", {"class": "td-post-content tagdiv-type"}).get_text().strip()
                except AttributeError:
                    article_content = ""
                data = "<article>\n"
                data += "<title>" + title + "</title>\n"
                data += "<text>\n" + article_content + "\n</text>\n"
                data += "</article>"
                output_file_name = link_tokens[3]
                output_dir = './Data/'
                raw_output_dir = './Raw/'
                os.makedirs(output_dir, exist_ok=True)
                os.makedirs(raw_output_dir, exist_ok=True)
                try:
                    with open(os.path.join(raw_output_dir, output_file_name), 'w', encoding='utf-8') as file:
                        file.write(str(article_soup))
                except OSError:
                    pass
                try:
                    with open(os.path.join(output_dir, output_file_name), 'w', encoding='utf-8') as file:
                        file.write(data)
                except OSError:
                    pass