-
Notifications
You must be signed in to change notification settings - Fork 2
/
futurama_scrape.py
74 lines (63 loc) · 2.24 KB
/
futurama_scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python
# Adam Calabrigo 2017
# This script is used to scrape Futurama scripts from online and write to the
# futurama_scripts.txt file.
from bs4 import BeautifulSoup
import sys
from urllib.request import Request, urlopen
import re
# open corpus file
# Module-level output handle: both scraper functions write to (and, on
# failure, close) this file, and the script closes it at the end.
f = open('futurama_scripts.txt', 'w')
# Index page listing a link to every episode transcript.
script_list_url = 'https://theinfosphere.org/Episode_Transcript_Listing'
# Running totals reported in the final summary line.
num_scripts = 0
num_lines = 0
def add_script(script_url):
    '''Pull the dialogue lines from one online transcript and append them
    to the corpus file.

    script_url -- path portion of the transcript page URL; it is joined
                  to the theinfosphere.org host.
    Returns the number of dialogue lines written.
    On any scrape failure, closes the corpus file and exits the program.
    '''
    try:
        # A browser-like User-Agent is required; the site rejects the
        # default urllib agent.
        req = Request('https://theinfosphere.org' + script_url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
        i = 0
        soup = BeautifulSoup(html, 'lxml')
        paragraphs = soup.find_all('p')
        for p in paragraphs:
            # Dialogue paragraphs start with a bold speaker name, which is
            # sometimes wrapped in a link to the character's page.
            if p.find('b') is not None:
                if p.find('b').find('a') is not None:
                    name = str(p.find('b').find('a').contents[0])
                else:
                    name = str(p.find('b').contents[0])
                # Strip bracketed stage directions like "[laughs] " from
                # the spoken line before writing "<name><line>".
                f.write(name + re.sub(r'\[.+?\]\s*', '', str(p.contents[len(p.contents) - 1])))
                i += 1
        print(script_url, str(i))
        return i
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit
        # are not swallowed. Message fixed: the original printed the
        # URL-list error copy-pasted from get_script_urls().
        print('error: failed to scrape script', script_url)
        f.close()
        sys.exit(1)
def get_script_urls():
    '''Collect the href of every transcript link on the listing page.

    Returns a list of URL paths for links whose text starts with
    'Transcript:'.
    On failure to fetch or parse the page, closes the corpus file and
    exits the program.
    '''
    try:
        # Browser-like User-Agent, as in add_script(), to avoid rejection.
        req = Request(script_list_url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urlopen(req).read()
        script_urls = []
        soup = BeautifulSoup(html, 'lxml')
        for link in soup.find_all('a'):
            # Transcript links are labelled 'Transcript:<episode title>';
            # startswith replaces the original manual len/slice check.
            if len(link.contents) > 0 and str(link.contents[0]).startswith('Transcript:'):
                script_urls.append(link.get('href'))
        return script_urls
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # propagate normally.
        print('error: failed to read from URL list')
        f.close()
        sys.exit(1)
# create the corpus
script_urls = get_script_urls()
# NOTE(review): the last 4 links are skipped — presumably non-episode
# transcripts (e.g. the films) at the end of the listing; confirm against
# the live page.
for script_url in script_urls[:-4]:
    lines = add_script(script_url)
    num_lines += lines
    if lines > 0:
        num_scripts += 1
print('Scraping complete: {} scripts {} lines scraped'.format(num_scripts, num_lines))
# Close the corpus file opened at module level.
f.close()