forked from rajnathani/bacon
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbacon_functions.py
287 lines (253 loc) · 11.8 KB
/
bacon_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
def strip_roman_numeral(s):
'''Return a new string containing all the substrings of string 's' from
the beginning till the index of the occurence of '(*roman numeral*)'.
If no roman numerals are present then return the original string.
'''
# Check for the first occurence of "("
# If its present then return a new string
# containing all the substrings till "("
if s.find("(") != - 1:
numeral_start = s.find("(")
new_string = s[:numeral_start]
return new_string
# If no "(" is present then return the
# original string.
return s
def extract_actor_name(line):
'''Return a string containing the actor's name from the string 'line'.
'''
comma = line.find(",")
# If a comma is present then rearrange the first and last name.
if comma == -1:
# Distinguish between the end of the actor's name and
# beginning of the movie's name by finding the tab
actor_end = line.find("\t")
full_name = line[:actor_end]
full_name = strip_roman_numeral(full_name).strip()
# If a comma is not present then don't reorder the name.
else:
last_name = line[:comma].strip()
# Distinguish between the end of the actor's name and
# beginning of the movie's name by finding the tab
actor_end = line.find("\t")
first_name = line[comma + 2:actor_end]
first_name = strip_roman_numeral(first_name).strip()
full_name = first_name + " " + last_name
# Capitalize the full name of the actor and then return it.
actor_full_name = capitalize_name(full_name)
return actor_full_name
def capitalize_name(s):
'''Return a new string with all the substrings from string 's' followed
by whitespace uppercased, and the rest of the substrings lowercased
and with all leading and trailing whitespace removed.
Note: First substring to be uppercased.
'''
s = s.strip()
space_location = 0
new_string = ""
count = 0
# Loop through each substring.
for substring in s:
if count < len(s):
# Check if the previous substring is whitespace, or that
# the we are dealing with the very first subsring. If it is
# true then uppercase the substring that follows it and add
# it to the new string.
previous = s[count - 1]
if previous == "\t" or previous == " " or count == 0:
new_string += substring.upper()
# If its not whitespace then lowercase the substring
# and add it to the new string.
else:
new_string += substring.lower()
count += 1
return new_string.strip()
def find_movie_name(line):
'''Return the contents of the string 'line' which contains the movie name.
'''
# Find the occurence of a tab and mark that as
# the beginning of the movie name.
# Find the occurence of a closing parentheses
# and mark that as the end of the movie name.
tab = line.find("\t")
end = line.find(")", tab)
movie = line[tab + 1: end + 1].strip()
return movie
def parse_actor_data(actor_data):
'''Return the actor & movie information in the open reader 'actor_data'.
('actor_data' contains movie and actor information in IMDB's format.)
The returned dictionary contains the names of actors (string) as keys and
lists of movies (string) the actor has been in as values.
'''
actor_dict = {}
count = 0
header_skipped = False
passed_heading = False
line_break = True
line = actor_data.readline()
# Iterate till the footer of the file.
while not header_skipped or not (line_break and line.startswith("-------")):
if not header_skipped:
if line.startswith('THE ACTORS LIST') or line.startswith('THE ACTRESSES LIST'):
passed_heading = True
elif line.startswith('Name'):
line_name = count
# Check if 'Name' And the line with hyphens are
# consecutive.
elif line.count("-") == 10 and count == (line_name + 1):
if passed_heading:
header_skipped = True
elif line_break and not (line.isspace()):
# Check if the previous line was a line break.
# Note: For the first actor this condition is set to
# be True.
line_break = False
actor_full_name = extract_actor_name(line)
# Get the movie's name which is on the same line with the actor's
# name. If the actor doesn't exist in the actor_dict then add
# the actor's name to the dictionary with the first movie's name
# as a list. If the actor already exists in the dictionary then
# append the movie to the list of movies to which actor maps onto
# in the dictionary.
movie = find_movie_name(line)
if not (actor_full_name) in actor_dict:
actor_dict[actor_full_name] = [movie]
else:
actor_dict[actor_full_name].append(movie)
line = actor_data.readline()
# As long as there is no line break, keep appending the movies
# found on each line to the actor's list of movies.
while not (line.isspace()):
if line.startswith("\t"):
movie = find_movie_name(line)
# Check if the movie is not present in the list
# of movies of the actor.
if not (movie in actor_dict[actor_full_name]):
actor_dict[actor_full_name].append(movie)
line = actor_data.readline()
line_break = True
line = actor_data.readline()
count += 1
return actor_dict
def invert_actor_dict(actor_dict):
'''Return a dictionary that is the inverse of dictionary 'actor_dict'.
The original actor_dict maps actors (string) to lists of movies (string)
in which they have appeared. The returned dictionary maps movies (string)
to lists of actors (string) appearing in the movie.
'''
inverted = {}
# Iterate through the dictionary 'actor_dict', check if
# the movie in the list of movies of the actor has occured
# in the dictionary 'movie_dict'. If it hasn't then add the
# movie to 'movie_dict' (as a key) and the actor to it.
# Else append the actor to the list of the already existing
# movie key.
for (actor, movies) in actor_dict.iteritems():
for movie in movies:
if not (movie in inverted):
inverted[movie] = [actor]
else:
# Check if the actor is not the list.
if not (actor in inverted[movie]):
inverted[movie].append(actor)
return inverted
def directly_linked_movie(actor, goal_actor, actor_dict, movie_dict):
'''Return the movie (a string) which directly links 'actor' (string to
'goal_actor' (string), from dictionaries: 'actor_dict' and 'movie_dict'.
Return None if no common movie is found.
'''
# Loop through the movies which 'actor' has acted in.
# Check if 'goal_actor' occurs in the actor list of
# any of the movies.
for movie in actor_dict[actor]:
if goal_actor in movie_dict[movie]:
return movie
def shortest_link(actor_name, actor_dict, movie_dict):
'''Return a list of actors (actors are strings)that represents the shortest
connection between 'actor_name' and Kevin Bacon that can be found in the
dictionaries: 'actor_dict' and 'movie_dict'.
'''
# Note about the algorithm:
# Type: Breadth first
# First, check if the actor's name is 'Kevin Bacon' or if the actor is
# not present in the 'actor_dict'. If either of them if True
# then return the empty list.
if actor_name == 'Kevin Bacon' or not (actor_name in actor_dict):
return []
investigated = [actor_name]
to_investigate = [[actor_name]]
distance = 0
# The loop condition checks if the list to_investigate has any remaining
# elements.
while to_investigate:
# Note: As the distance increases the size of each sublist in
# the nested list 'to_investigate' increases proportionally.
# The last actor of each sublist is the actor to be investigated
# The actors which occur before the actor in the sublist simply
# represent the link from 'actor_name' to that actor.
# Loop property: The zeroth index changes on every iteration.
actor_link = to_investigate[0]
actor = actor_link[distance]
for movie in actor_dict[actor]:
for co_star in movie_dict[movie]:
if not (co_star in investigated):
if co_star == "Kevin Bacon":
actor_link.append("Kevin Bacon")
return actor_link
# If the co_star is not present in the list of
# investigated actors then make a list containing
# the entire link from 'actor_name' to the co_star
# and add it to the nested list 'to_investigate'.
elif not (co_star in investigated):
investigated.append(co_star)
full_link = actor_link[:]
full_link.append(co_star)
to_investigate.append(full_link)
# Remove the actor_link (sublist) from the
# to_investigate (nested list)
to_investigate.remove(actor_link)
# Check if all the actor_links of the current distance
# number is investigated. If that's True then increase
# the distance by 1.
if minimum(to_investigate) == distance + 2:
distance += 1
return []
def minimum(L):
'''Return the length of the smallest sublist in then nested list 'L'.
'''
# Initiate smallest as 'z' as any number is smaller than it.
smallest = 'z'
for sublist in L:
smallest = min(smallest, len(sublist))
return smallest
def find_connection(actor_name, actor_dict, movie_dict):
'''Return a list of (movie, actor) tuples (both elements of type string)
that represent a shortest connection between 'actor_name' and 'Kevin Bacon'
that can be found in the actor_dict and movie_dict. Each tuple in the
returned list has a special property: the actor from the previous tuple and
the actor from the current tuple both appeared in the stated movie. If there
is no connection between 'actor_name' and 'Kevin Bacon', or the 'actor_name'
is 'Kevin Bacon', the returned list is empty. Note: The actor_dict is the
inverse of movie_dict.
'''
# The shortest_link function will return the shortest link
# of actors from 'actor_name' to 'Kevin Bacon'.
# If there is no connection found or 'actor_name' is
# 'Kevin Bacon' then it will return an empty list.
link = shortest_link(actor_name, actor_dict, movie_dict)
if not link:
return link
count = 0
complete_link = []
# Loop through the the list 'link' and find the common movie between
# adjacent actors in the list. Store the common movie and the leading
# actor as a tuple in the list 'complete_link'.
for actor in link:
if count > 0:
present_actor = link[count]
previous_actor = link[count - 1]
common_movie = directly_linked_movie(previous_actor, present_actor, actor_dict, movie_dict)
movie_actor_tuple = (common_movie, present_actor)
complete_link.append(movie_actor_tuple)
count += 1
return complete_link