-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathshakespear.py
31 lines (27 loc) · 1.06 KB
/
shakespear.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import nltk
from nltk import *
import urllib2
import requests
import re
# Download the HTML page named by `page`, delete markup fragments and a few
# words from it with regex substitutions, print the cleaned text, and build
# an NLTK frequency distribution over its tokens.


def _strip_patterns(text, patterns):
    """Return *text* with every regex in *patterns* deleted, in order.

    Each pattern is removed from the result of the previous removal, so the
    order of *patterns* matters.
    """
    for pattern in patterns:
        text = re.sub(pattern, "", text)
    return text


page = "http://shakespeare.mit.edu/Poetry/LoversComplaint.html"

req = urllib2.Request(page)
response = urllib2.urlopen(req)
try:
    the_page = response.read()
finally:
    # Always release the connection, even if the read fails.
    response.close()

# The stages below apply the same substitutions, in the same order, as the
# original chain of nested re.sub() calls (innermost call first).  Patterns
# are raw strings so the regex escapes are not also interpreted as string
# escapes.
raw = _strip_patterns(the_page, [r"\n\n<BLOCKQUOTE>\n"])
raw1 = _strip_patterns(raw, [r"<\/BLOCKQUOTE>\n"])
raw2 = _strip_patterns(raw1, [r"<BLOCKQUOTE>"])
# NOTE(review): in r"<\BODY" the "\B" is a non-word-boundary assertion, and
# the position between "<" and "O" is always a word boundary, so this
# pattern cannot match.  It was presumably meant to be "<BODY"; kept as-is
# so the output does not change -- confirm before fixing.
raw3 = _strip_patterns(raw2, [r"<\BODY", r"</TITLE>"])
raw4 = _strip_patterns(raw3, [r"<BODY>", r"</BLOCKQUOTE", r"</BODY>"])
raw5 = _strip_patterns(raw4, [r"</HTML>", r"<BR>", r">", r"<HTML"])
# NOTE(review): the word patterns below ("TITLE", "when", "HEAD", "and",
# "THE") are plain substrings, so they are also deleted from inside longer
# words (e.g. "hand" becomes "h"), which skews the token counts.  Left
# unchanged to keep the printed text and fdist1 identical.
raw6 = _strip_patterns(raw5, [r"TITLE", r"/H1FROM", r"when", r"HEAD"])
raw7 = _strip_patterns(raw6, [r"\,!\,\;\?and the ", r"and", r"\?", r"THE"])

# Single-argument parenthesised form prints the same text on Python 2 and
# is also valid syntax on Python 3.
print(raw7)

tokens = word_tokenize(raw7)

# Frequency of each token in the cleaned text.  For a quick look at the
# result, call fdist1.tabulate() or fdist1.plot(100, cumulative=True).
fdist1 = FreqDist(tokens)