scribd.py
#!/usr/bin/env python2
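# Download a Scribd document given its URL. With only the URL, selectable text
# is extracted into <title>.txt; with a second argument (e.g. -p), each page
# is saved as a JPG inside a <title>/ directory instead.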
from bs4 import BeautifulSoup
import requests
import sys
import shutil
import os
#print len(sys.argv)
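# Work from the script's own directory so output files land next to it.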
os.chdir(sys.path[0])
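# No document URL supplied: print usage information and exit.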
if len(sys.argv) == 1:
    print 'Usage: sudo python scribd.py <link of scribd document>'
    print
    print 'For selectable PDFs:'
    print '- example: sudo python scribd.py https://www.scribd.com/document/55949937/33-Strategies-of-War'
    print
    print 'For PDFs containing Images; use the -p option:'
    print '- example: sudo python scribd.py http://scribd.com/doc/17142797/Case-in-Point -p'
    exit()
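# Fetch the Scribd document page; its <title> doubles as the output name.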
response = requests.request(method='GET', url=sys.argv[1])
soup = BeautifulSoup(response.text, 'html.parser')
extraction = ''
train = 1
title = soup.find('title').get_text().replace(' ', '_')
print soup.find('title').get_text()
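# Text mode starts from a fresh <title>.txt; image mode needs a <title>/ directory.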
if len(sys.argv) <= 2:
    if os.path.exists(title + '.txt'):
        os.remove(title + '.txt')
else:
    if not os.path.exists(title):
        os.makedirs(title)
print
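# Scan every <script type="text/javascript"> tag for the https://....jsonp URLs
# that point at the document's individual pages.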
js_text = soup.find_all('script', type='text/javascript')
for opening in js_text:
    for inner_opening in opening:
        portion1 = inner_opening.find('https://')
        if portion1 != -1:
            portion2 = inner_opening.find('.jsonp')
            jsonp = inner_opening[portion1:portion2 + 6]
            if jsonp != '':
                if len(sys.argv) <= 2:
                    # Text mode: fetch the page's JSONP payload and strip the
                    # window.pageN_callback(["..."]); wrapper to leave plain HTML.
                    response = requests.request(method='GET', url=jsonp)
                    # The page number sits between 'window.page' and '_callback';
                    # slicing up to find('_callback') also handles pages >= 10.
                    page_no = response.text[11:response.text.find('_callback')]
                    response_head = (response.text).replace('window.page' + page_no + '_callback(["', '').replace('\\n', '').replace('\\', '').replace('"]);', '')
                    soup_content = BeautifulSoup(response_head, 'html.parser')
                    for x in soup_content.find_all('span', {'class': 'a'}):
                        xtext = x.get_text().encode('utf-8')
                        print xtext
                        extraction = extraction + xtext + '\n'
                else:
                    # Image mode: the page image lives at the matching /images/ URL
                    # with a .jpg extension instead of .jsonp.
                    replacement = jsonp.replace('/pages/', '/images/').replace('jsonp', 'jpg')
                    print 'Downloading page ' + str(train)
                    response = requests.get(replacement, stream=True)
                    with open(title + '/pic' + str(train) + '.jpg', 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)
                    del response
                    train += 1
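# Text mode: write everything that was extracted to <title>.txt.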
if len(sys.argv) <= 2:
    with open(title + '.txt', 'w') as feed:
        feed.write(extraction)