scrap.py
import os
import re
import subprocess
import sys
import time
import urllib.request

import pandas as pd
# Log functionality: lets scraping resume from where it was interrupted.
restart = "n"  # default is "n" (start fresh)
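# For reference, the log written by this script is a two-column CSV; a run
# that was interrupted after one page would leave something roughly like
# (illustrative values):
#
#   urls,times
#   https://old.reddit.com/r/<subreddit>/,17-03-2019 14:02:11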
if os.path.isfile('../url_log.csv'):
    # Retrieve the existing log and show the last saved URL.
    log = pd.read_csv('../url_log.csv')
    urls = log.iloc[-1].values[0]
    times = log.iloc[-1].values[1]
    # Offer the option to resume.
    print("Log file detected!")
    print(urls + " was last saved at " + times)
    restart = input("Would you like to continue from the last url in the log? (y/n) ")
    if restart.lower() == "n":
        os.remove("../url_log.csv")
        log = log[0:0]  # empty the log
    elif restart.lower() != "y":
        sys.exit("Please enter a valid string")
else:
    log = pd.DataFrame(columns=['urls', 'times'])
    print("No log file detected!")
# Now we can scrape some memes!
if restart.lower() == "n":
    subreddit = input("Please enter the name of the subreddit to scrape: ")
    if not subreddit:
        sys.exit('Please enter a valid subreddit name')
    # Starting URL; old.reddit.com serves the legacy markup that the
    # regexes below expect.
    urls = "https://old.reddit.com/r/" + subreddit + "/"
    times = time.strftime("%d-%m-%Y %H:%M:%S", time.localtime())
    log = pd.concat([log, pd.DataFrame([[urls, times]], columns=log.columns)],
                    ignore_index=True)
# Input the number of pages to scrape memes from (25 posts per page).
i = 0
num = int(input("How many pages would you like to download (25 memes per page)? "))
if num < 1:
    sys.exit("Number of pages should be > 0")
else:
    try:
        while i < num:
            htmlfile = urllib.request.urlopen(urls)
            htmltext = htmlfile.read().decode('utf-8', errors='replace')
            # Regex to find the media URLs (old.reddit puts each post's
            # link target in its data-url attribute).
            these_regex = "data-url=\"(.+?)\""
            pattern = re.compile(these_regex)
            all_urls = re.findall(pattern, htmltext)
            # Regex to find the post titles, used as file names below.
            names_regex = "data-event-action=\"title\".+?>(.+?)<"
            names_pattern = re.compile(names_regex)
            names = re.findall(names_pattern, htmltext)
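            # NOTE: parsing HTML with regexes is brittle. A sketch of the
            # same extraction with an HTML parser, assuming bs4 is installed
            # (not part of the original script):
            #
            #   from bs4 import BeautifulSoup
            #   soup = BeautifulSoup(htmltext, "html.parser")
            #   all_urls = [t["data-url"] for t in soup.select("[data-url]")]
            #   names = [a.get_text() for a in
            #            soup.select('a[data-event-action="title"]')]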
            for j, s in enumerate(all_urls):
                try:
                    # Save under the post title; wget is assumed to be on PATH.
                    com = "wget --no-check-certificate " + s + " -O \"" + names[j] + "\".jpg"
                    subprocess.call(com, shell=True)
                except IndexError:
                    # No matching title: let wget pick the file name itself.
                    com = "wget --no-check-certificate " + s
                    subprocess.call(com, shell=True)
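            # NOTE: interpolating titles into a shell command breaks on quotes
            # or slashes in the title. A shell-free sketch of the same download
            # step (hypothetical filename sanitizing, not the original
            # behaviour):
            #
            #   safe = re.sub(r'[^\w\- ]', '_', names[j])[:100]
            #   urllib.request.urlretrieve(s, safe + ".jpg")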
            # Regex to find the "next" button, whose href is the absolute URL
            # of the following page.
            regex1 = "next-button.+?\"(.+?)\""
            pattern1 = re.compile(regex1)
            link1 = re.findall(pattern1, htmltext)
            if len(link1) == 0:
                print("Something went wrong for i = %d. trying again..." % i)
                time.sleep(3)
            else:
                urls = link1[0]
                times = time.strftime("%d-%m-%Y %H:%M:%S", time.localtime())
                log = pd.concat([log, pd.DataFrame([[urls, times]], columns=log.columns)],
                                ignore_index=True)
                i += 1
                # Rewrite the log after every page so an interrupted run can resume.
                log.to_csv('../url_log.csv', mode='w', index=False)
    except KeyboardInterrupt as e:
        print(e)
        print("saving log")
        log.to_csv('../url_log.csv', mode='w', index=False)
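# Illustrative session (prompts are the ones printed above; values are
# examples only):
#
#   $ python scrap.py
#   No log file detected!
#   Please enter the name of the subreddit to scrape: memes
#   How many pages would you like to download (25 memes per page)? 2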