-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrapeFrost.py
59 lines (52 loc) · 1.41 KB
/
scrapeFrost.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#Scrapes Robert Frost Poems from poetry.com
#url ranges for frost poems: https://www.poetry.com/poem/30819 - 30952
import requests
from bs4 import BeautifulSoup, NavigableString
def getPoem(url):
    """
    Download one poem page and append its title and text to the output file.

    input: poem url
    Fetches the page, strips wrapper tags with removeDefinition, looks up the
    elements with ids 'disp-poem-title' and 'disp-quote-body', and writes
    'Title:<title>', the reformatted poem and a '---' separator to the
    module-level file object `f` (opened by the script entry point).

    A page that cannot be fetched, or that lacks either element, is reported
    on stdout and skipped; nothing is written for it.
    """
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(url, timeout=30)
        page.raise_for_status()
    except requests.RequestException as err:
        print("Skipping "+url+": "+str(err))
        return

    soup = removeDefinition(BeautifulSoup(page.content, 'html.parser'))
    title = soup.find(id='disp-poem-title')
    poem = soup.find(id='disp-quote-body')
    if title is None or poem is None:
        print("Skipping "+url+": poem title or body not found")
        return

    # get_text() also copes with markup left inside the title, where joining
    # .contents would fail on anything that is not a plain string.
    titleText = title.get_text()
    print("Scraping poem:"+titleText)

    # Build the full record before touching the file so that a failure while
    # formatting cannot leave a title without its poem.
    poemString = reformatPoem(poem)
    f.write('Title:'+titleText+'\n\n\n')
    f.write(poemString)
    f.write('---')
def removeDefinition(poem):
    """
    remove a tags with links to definition (and h2 / em wrappers)

    Every <a>, <h2> and <em> tag below `poem` is replaced by its own
    children, so the text it wrapped stays in place and child nodes such as
    <br> remain real nodes. Wrappers nested inside other wrappers (for
    example a definition link inside an <em>) are unwrapped as well instead
    of being left behind as literal markup.

    input: parsed tree (BeautifulSoup object or Tag); modified in place
    output: the same object, returned for convenience
    """
    # find_all collects the matches up front, so unwrapping a parent before
    # its nested child is safe: the child is re-parented and unwrapped on a
    # later iteration.
    for tag in poem.find_all(['a', 'h2', 'em']):
        tag.unwrap()
    return poem
def reformatPoem(poem):
    """
    reformat poem by replacing brs with \\ns

    input: iterable of child nodes (e.g. the poem body tag); each child is
           either a string or an object with a `name` attribute
    output: plain string with the text in document order and one '\\n' per
            <br>

    Children that are neither strings nor <br> are treated as containers and
    flattened recursively, so markup nested inside the poem body contributes
    its text instead of raising a TypeError on string concatenation.
    """
    parts = []
    for child in poem:
        if isinstance(child, str):
            # Text nodes are str subclasses; keep their text as-is.
            parts.append(child)
        elif getattr(child, 'name', None) == 'br':
            parts.append('\n')
        else:
            # Any other tag: descend into its children.
            parts.append(reformatPoem(child))
    return ''.join(parts)
if __name__=="__main__":
    # Poem pages live at baseURL + <numeric id>; the Frost ids run 30819-30952.
    baseURL='https://www.poetry.com/poem/'
    urlRange=range(30819,30953)
    # getPoem writes through the module-level name `f`, so the handle must be
    # bound to exactly that name. The with-block closes it even if a poem
    # fails, and utf-8 keeps non-ASCII punctuation writable on every platform.
    with open("botbertFrost.txt", "a", encoding="utf-8") as f:
        for url in urlRange:
            #for each poem
            getPoem(baseURL+str(url))
    print('Done Scraping')