import csv
import os

import requests
from bs4 import BeautifulSoup

count = 0
# Ensure the working directory for cached and generated files exists
os.makedirs('./temp', exist_ok=True)
########################################################################################################################
########################################################################################################################
# First phase: grab the top-level table of contents of all volumes, the hyperlink to each file, and the name of each file
########################################################################################################################
########################################################################################################################
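# Overview: this stage writes two artifacts - a cleaned local copy of the volume
# index (./temp/index.html) and a url -> filename manifest (stage1.csv) that the
# "second phase" referred to below presumably uses to download each volume.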
# 1 If the 'final' index already exists, remove it so the script can be re-run after changes
if os.path.isfile('./temp/index.html'):
    os.remove('./temp/index.html')
# 2 If the working cache of the index doesn't exist yet, fetch it and write it out
if not os.path.isfile('./temp/cache'):
    print('main index cache file does not exist, so fetching it')
    page = requests.get('http://edwards.yale.edu/research/browse')
    soup = BeautifulSoup(page.text, "html.parser")
    with open('./temp/cache', 'w') as myfile:
        myfile.write(str(soup))
# 3 Otherwise parse the cached copy through BeautifulSoup
else:
    print('main index cache file was found, so using it')
    with open('./temp/cache', 'r') as f:
        contents = f.read()
    soup = BeautifulSoup(contents, "html.parser")
# 4 Walk every <a href> link in the index and split each into filename/dirname
for links in soup.find_all('a'):
    href = links.get('href')
    if href is None:
        continue  # skip anchors that have no href attribute
    filename = os.path.basename(href)
    pathname = os.path.dirname(href)
    # 5 Remember the base url, rewrite each index link to append .html and strip the base url.
    # The "archive" test also filters out the other hyperlinks found on the page.
    if "archive" in pathname:
        basepath = pathname
        count += 1
        print('Renaming link ' + str(count))
        links['href'] = links['href'].replace(filename, filename + ".html")
        links['href'] = links['href'].replace(pathname + '/', '')
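# Illustrative example (hypothetical path, not taken from the live site): an href
# of '/archive/documents/vol1' has basename 'vol1' and dirname '/archive/documents',
# so the two replacements above turn it into the relative local link 'vol1.html'.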
# 6 Remove the 'final' index file if it exists before writing the selected soup out to it
if os.path.isfile('./temp/index.html'):
    os.remove('./temp/index.html')
# Keep only the page's <style> block and the volume list under #center
result = soup.select('style, #center ul')
with open('./temp/index.html', 'a') as myfile:
    for items in result:
        myfile.write(str(items))
print('Finished creating index.html file')
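# At this point ./temp/index.html holds the page's <style> block plus the rewritten
# volume list, so the index can be browsed locally with links to the .html copies.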
# 7 Gather the url and filename of every volume for the next downloading stage
# into a dictionary called secondphaselinksandfilenames.
# The select statement homes in on the elements we want from the raw html.
secondphaselinksandfilenames = {}
selectquery1 = soup.select('#center ul')
for link in selectquery1:
    for item in link.select('a'):
        fullfilename = item.get('href')
        nameoffile = fullfilename.replace('.html', '')
        fullurl = 'http://edwards.yale.edu' + basepath + '/' + nameoffile
        secondphaselinksandfilenames[fullurl] = fullfilename
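# Illustrative shape of the mapping (values are hypothetical):
# {'http://edwards.yale.edu/archive/documents/vol1': 'vol1.html', ...}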
for key, val in secondphaselinksandfilenames.items():
    print(key)
    print(val)
# Persist the url -> filename mapping as csv for the next stage
with open('stage1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for key, val in secondphaselinksandfilenames.items():
        writer.writerow([key, val])
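# Each row of stage1.csv is "<full url>,<local filename>", e.g. (hypothetical):
# http://edwards.yale.edu/archive/documents/vol1,vol1.html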