forked from jbrew/stereotype
-
Notifications
You must be signed in to change notification settings - Fork 0
/
transcript_parser.py
96 lines (71 loc) · 3.22 KB
/
transcript_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
__author__ = 'jamiebrew'
import os
import string
import operator
"""
Takes a transcript formatted as follows, with newlines separating lines, and without paragraph breaks in the middle of
lines. (this is how transcripts are formatted on genius.com)
Pulls the transcript from 'raw_transcripts/[name]' and saves the per-speaker files to 'texts/transcripts/[name]/'
[[
CHARACTER: I am saying a line and I am using a VARIety of CapITALIzations and spacing patterns
Some more text in character's line here.
OTHER CHARACTER: Okay...
Here are some stage directions in a separate paragraph that does not contain a colon. The characters are kissing
each other.
]]
Feeds these into separate text files, each containing the collection of lines spoken by a particular character,
each named after the character. Also creates a file called "stage directions" that has all of the lines that did
not seem to be attributed to a character.
NOTES:
Assumes that lines are attributed using a colon.
Assumes that colons do not appear in stage direction paragraphs.
Does not know about alternate names for the same character. WILLY LOMAN and WILLY will be fed into different files.
Likewise, WILLY (to himself) will be fed into a different file from WILLY.
"""
# Speaker labels whose lines are extracted. The transcript is lower-cased
# before matching, so every entry here must be lower-case as well.
to_find = [
    'trump',
    'clinton',
    'cooper',
    'raddatz',
]
class transcript_parser(object):
    """Split a colon-attributed transcript into one text file per speaker.

    Input is read from 'raw_transcripts/<tname>'; output goes to
    'texts/transcripts/<tname>/<speaker>'. The code is written to run
    unchanged on both Python 2 and Python 3.
    """

    def parseTranscript(self, tname, names=None):
        """Extract the lines of selected speakers and write one file each.

        The transcript is lower-cased and cut at every ':'. The last word
        before a colon is treated as the speaker label; when that label is
        one of `names`, the text up to the next speaker label is attributed
        to that speaker. All utterances of a speaker are joined with single
        spaces and written to 'texts/transcripts/<tname>/<speaker>'.

        Args:
            tname: file name of the transcript inside 'raw_transcripts/'.
            names: iterable of lower-case speaker labels to extract.
                Defaults to the module-level `to_find` list.

        Returns:
            None. The result is the set of files written to disk.

        Raises:
            IOError / OSError: if the transcript cannot be read or the
                output files cannot be written.
        """
        if names is None:
            names = to_find
        wanted = set(names)

        with open('raw_transcripts/%s' % tname, "r") as infile:
            chunks = infile.read().lower().split(':')

        spoken = self._collect_by_speaker(chunks, wanted)
        if not spoken:
            # Nothing matched: do not create an empty output directory.
            return

        dirpath = 'texts/transcripts/%s' % tname
        if not os.path.isdir(dirpath):
            # makedirs (not mkdir) so a missing 'texts/transcripts' parent
            # does not abort the run.
            os.makedirs(dirpath)
        for name, passages in spoken.items():
            with open('%s/%s' % (dirpath, name), 'w') as outfile:
                # Join with a space so the last word of one utterance is
                # not glued to the first word of the next.
                outfile.write(" ".join(passages))

    def _collect_by_speaker(self, chunks, wanted):
        """Map each wanted speaker to the list of their utterances.

        `chunks` is the transcript split at ':'; chunk i ends with the
        label of whoever speaks chunk i + 1.
        """
        spoken = {}
        last = len(chunks) - 1
        # Start at chunk 0 so the very first speaker label is honoured, and
        # stop before the final chunk because nothing follows it.
        for i in range(last):
            words = chunks[i].split()
            if not words or words[-1] not in wanted:
                # Empty chunk (e.g. "::") or a label we are not tracking.
                continue
            # '/' would be read as a path separator in the output file name.
            name = words[-1].replace('/', '')
            utterance = chunks[i + 1].split()
            if i + 1 < last:
                # The trailing word is the next speaker's label, not speech.
                # The final chunk has no following label, so it keeps all
                # of its words.
                utterance = utterance[:-1]
            passages = spoken.setdefault(name, [])
            if utterance:
                passages.append(" ".join(utterance))
        return spoken

    def biggest_characters(self, tname, number):
        """Rank the speakers of a parsed transcript by word count.

        Reads every file in 'texts/transcripts/<tname>', counts its
        whitespace-separated words, prints the `number` largest
        (name, count) pairs and returns the full ranking.

        Args:
            tname: transcript name previously passed to parseTranscript.
            number: how many of the top entries to print.

        Returns:
            List of (name, word_count) tuples, largest first.
        """
        tpath = 'texts/transcripts/%s' % tname
        size_by_name = {}
        for cname in os.listdir(tpath):
            with open('%s/%s' % (tpath, cname)) as infile:
                size_by_name[cname] = len(infile.read().split())

        # Ascending sort followed by reverse() keeps the original ordering
        # of ties (sorted(..., reverse=True) would order ties differently).
        ranked = sorted(size_by_name.items(), key=operator.itemgetter(1))
        ranked.reverse()
        for pair in ranked[:number]:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(pair)
        return ranked
# Script entry: build a parser and split 'raw_transcripts/pres_debates.txt'
# into per-speaker files.
# NOTE(review): these statements sit at module level, so they also run when
# this file is imported — consider an `if __name__ == '__main__':` guard.
p = transcript_parser()
p.parseTranscript('pres_debates.txt')