-
Notifications
You must be signed in to change notification settings - Fork 3
/
gsrchDwn.py
executable file
·148 lines (113 loc) · 4.07 KB
/
gsrchDwn.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python
#
# neo1981 (neo1981@gmail.com)
# http://www.infosec-neo.blogspot.com --
#
# Program to download the files of a given type (pdf by default) found by a search query.
# The file type (default "pdf", override with --ftype) is passed to the search as its filetype filter.
# The continue option "--cnt ResultNo" skips results numbered below ResultNo and resumes downloading from there.
#
import os.path, sys
# Add current xgoogle dir to search path
sys.path.insert(0, "xgoogle")
# Add directory of target file (e.g. of symbolic link)
sys.path.insert(0, os.path.dirname(os.path.abspath(os.path.realpath(__file__))) + "/xgoogle")
# Load xgoogle package
from xgoogle.search import GoogleSearch, SearchError
import getopt
import urllib2
import urllib
import os.path as ospath
# Activate Python Debugger if required
# import pdb
# Label printed in front of the progress figure by dlProgress.  The download
# loop further down rebinds it to the title of the result being fetched; the
# empty default keeps dlProgress usable before that first assignment (the
# original module-level "global rem_file" statement defined nothing).
rem_file = ''


def dlProgress(count, blockSize, totalSize):
    """Write a one-line download progress indicator to stdout.

    The (count, blockSize, totalSize) signature presumably targets the
    ``reporthook`` callback of ``urllib.urlretrieve`` -- the visible code
    never registers it, so confirm before relying on that.

    count     -- number of blocks transferred so far
    blockSize -- size of one block in bytes
    totalSize -- total size in bytes; zero or negative means "unknown"
    """
    if totalSize > 0:
        # The last block is usually only partly filled, so cap at 100%.
        percent = min(100, count * blockSize * 100 // totalSize)
        status = "...%d%%" % percent
    else:
        # Without a usable total a percentage is meaningless (and dividing
        # by zero would raise), so report the transferred byte count instead.
        status = "...%d bytes" % (count * blockSize)
    # "\r" returns to the start of the line so each update overwrites the last.
    sys.stdout.write("\r" + rem_file + status)
    sys.stdout.flush()
# Parse command line options.
#
# Results (module-level names read by the rest of the script):
#   query     -- search text, mandatory
#   mfiletype -- file extension to search for, default 'pdf'
#   n_cnt     -- result number to continue from, 0 means start at the beginning
#   dwn_dir   -- directory the files are stored in, default current directory
#
# Single-argument print(...) calls are used so the lines read the same as
# Python 2 print statements while staying forward compatible.

# Usage text shown for --help and when the mandatory --query is missing.
usage = 'python gsrchDwn.py --query [query_text] [--ftype file_extension] [--cnt contine_result_number] [--dir download_dir]'

try:
    # getopt expects long option names WITHOUT the leading dashes; the former
    # "--help" entry registered an option spelled "----help", so a real
    # "--help" was rejected as unknown.
    opts, args = getopt.getopt(sys.argv[1:], "", ["query=", "ftype=", "cnt=", "dir=", "help"])
except getopt.error as msg:
    print('python gsrchDwn.py --query [\"query_text\"] [--ftype file_extension] [--cnt contine_result_number] [--dir download_dir] %s' % msg)
    sys.exit(2)

query = ''
mfiletype = 'pdf'
n_cnt = 0
dwn_dir = "."

# Process options
for o, a in opts:
    if o == "--help":
        print(usage)
        sys.exit(0)
    elif o == "--query":
        query = a
    elif o == "--ftype":
        mfiletype = a
    elif o == "--cnt":
        try:
            n_cnt = int(a)
        except ValueError:
            # A non-numeric value used to abort with an unhandled traceback.
            print('%s --cnt expects an integer, got %r' % (usage, a))
            sys.exit(2)
    elif o == "--dir":
        # Use the path exactly as given.  Backslash doubling is only needed
        # inside source-code string literals; applied to a runtime value it
        # alters the path (e.g. it breaks UNC paths and POSIX names that
        # contain a backslash).
        dwn_dir = a

# --query is the only mandatory option.
if query == '':
    print(usage)
    sys.exit(2)
def _save_url(url, dest):
    """Download *url* and store the response body in the file *dest*.

    The body is read completely before *dest* is opened, so a failed transfer
    does not leave an empty file behind that a later run would mistake for a
    finished download.  Errors from urllib2 or from writing the file are
    propagated to the caller.
    """
    req = urllib2.Request(url, None, {
        'Accept-Encoding': 'deflate',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:32.0) Gecko/20100101 Firefox/32.0',
        'Referer': url,
    })
    response = urllib2.urlopen(req)
    try:
        data = response.read()
    finally:
        response.close()
    # Binary mode: the payload is raw file content (pdf by default); text
    # mode would translate newline bytes on Windows and corrupt it.
    with open(dest, 'wb') as out:
        out.write(data)


# Walk through the search result pages and download every result that is not
# already present in dwn_dir.  Single-argument print(...) calls produce the
# same output as the Python 2 print statements they replace.
try:
    cnt = 0  # running number of the current search result
    gs = GoogleSearch(query)
    gs.filetype = mfiletype
    gs.results_per_page = 50
    # NOTE(review): paging starts at 1 -- confirm against xgoogle whether its
    # page index is 0-based; if so the first page of results is skipped.
    pgCnt = 1
    if n_cnt != 0:
        print("Download Continuing from result number:  %s" % n_cnt)
    lastUrl = None  # URL of the previously handled result, for loop detection
    while True:
        gs.page = pgCnt
        results = gs.get_results()
        pgCnt += 1  # next iteration asks for the following page
        if not results:  # no more results were found
            break
        for res in results:
            cnt += 1
            print("Search No. :  %s" % cnt)
            print(res.title.encode("utf8"))
            temp_url = res.url.encode("utf8")
            # When continuing, skip everything before the requested result.
            if n_cnt != 0 and cnt < n_cnt:
                continue
            if lastUrl == temp_url:
                # Same URL twice in a row: the search is repeating itself.
                print("Download loop detected, probably there are no more pages. Stopping")
                results = None  # tells the outer loop to stop as well
                break
            lastUrl = temp_url
            rem_file = res.title.encode("utf8")  # rem_file used in download progress
            # The local file is named after the last path segment of the URL.
            loc_file = ospath.join(dwn_dir, temp_url.split("/")[-1])
            if ospath.isfile(loc_file):
                print("File already exist:  %s" % loc_file)
                print("Not downloading file")
                continue
            print("Now downloading...  %s" % temp_url)
            print("")
            try:
                _save_url(temp_url, loc_file)
                print("Download Complete:  %s" % loc_file)
            except IOError:
                print("***Unable to Download file:IOError  %s" % rem_file)
                print("Continuing with Next result.")
            except Exception:
                # Best effort: any other failure skips this result only.
                # Exception (not a bare except) lets Ctrl-C and sys.exit through.
                print("***Unable to Download file:Unknown Error  %s" % rem_file)
                print("Continuing with Next result.")
        if not results:  # forcibly stopping download
            print("Done.")
            break
except SearchError as e:
    print("Search failed: %s" % e)