Skip to content

Commit

Permalink
#6 Changed top_cast in TitleScrape to list of CreditScrape objects
Browse files Browse the repository at this point in the history
  • Loading branch information
zembrodt committed Nov 21, 2019
1 parent 8e58615 commit 7b77e1b
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 65 deletions.
16 changes: 15 additions & 1 deletion pymdb/models/name.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
is_int,
to_datetime
)
from functools import total_ordering


class NameBasics:
Expand Down Expand Up @@ -87,6 +88,7 @@ def __str__(self):
f'{"" if self._death_year is None else self._death_year}'


@total_ordering
class CreditScrape:
"""Object to represent information for each person scraped from IMDb's `fullcredits` page for a title.
Expand Down Expand Up @@ -162,8 +164,20 @@ def episode_year_end(self, value):
if is_int(value):
self._episode_year_end = int(value)

def __eq__(self, other):
    """Return whether two credits describe the same person/title/role entry.

    Two credits are equal when all seven identifying fields match:
    `name_id`, `title_id`, `job_title`, `credit`, `episode_count`,
    `episode_year_start` and `episode_year_end`.

    Args:
        other: The object to compare against.

    Returns:
        :obj:`bool`: The comparison result when `other` is the same kind of object.
        `NotImplemented` otherwise, so Python can try the reflected comparison and
        finally fall back to `False` instead of raising `AttributeError`.
    """
    # Without this guard, `credit == None` or `credit in ['nm0000434', ...]`
    # fails on the first attribute lookup on `other`.
    if not isinstance(other, type(self)):
        return NotImplemented
    return (
        self.name_id,
        self.title_id,
        self.job_title,
        self.credit,
        self.episode_count,
        self.episode_year_start,
        self.episode_year_end,
    ) == (
        other.name_id,
        other.title_id,
        other.job_title,
        other.credit,
        other.episode_count,
        other.episode_year_start,
        other.episode_year_end,
    )

def __lt__(self, other):
    """Order credits field by field, treating a missing (`None`) value as smallest.

    Fields are compared in this order: `name_id`, `title_id`, `job_title`,
    `credit`, `episode_count`, `episode_year_start`, `episode_year_end`.

    Args:
        other: The object to compare against.

    Returns:
        :obj:`bool`: `True` when this credit sorts before `other`.
        `NotImplemented` when `other` is not the same kind of object.
    """
    if not isinstance(other, type(self)):
        return NotImplemented

    def sort_key(entry):
        fields = (
            entry.name_id,
            entry.title_id,
            entry.job_title,
            entry.credit,
            entry.episode_count,
            entry.episode_year_start,
            entry.episode_year_end,
        )
        # Pair every value with a "has a value" flag. A plain tuple comparison
        # raises TypeError as soon as the first differing field is None on one
        # side only (e.g. episode_count None vs 124); with the flag, None is
        # decided by False < True and never compared with a str/int.
        return tuple((value is not None, value) for value in fields)

    return sort_key(self) < sort_key(other)

def __str__(self):
    """Return a one-line, human-readable summary of the credit.

    Returns:
        :obj:`str`: `'<name_id>: <job_title> in <title_id> as <credit>'`, followed by
        `' in <episode_count> episodes'` when an episode count is set.
    """
    # The older single-line return that preceded this logic made the
    # episode-count suffix unreachable; only the extended form is kept.
    summary = f'{self.name_id}: {self.job_title} in {self.title_id} as {self.credit}'
    if self.episode_count is not None:
        summary += f' in {self.episode_count} episodes'
    return summary

def __repr__(self):
    """Return the same text as `str()` so credits are readable inside lists and test output."""
    return str(self)


class NameScrape:
Expand Down
2 changes: 1 addition & 1 deletion pymdb/models/title.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,7 @@ class TitleScrape:
storyline (:obj:`str`): The title's storyline.
production_companies (:obj:`list` of :obj:`str`): A list of company IDs, used by IMDb and
prefixed with `co`, that are credited with producing the title.
top_cast (:obj:`list` of :obj:`str`): A list of person IDs, used by IMDb and prefixed with `nm`,
top_cast (:obj:`list` of :obj:`~.models.name.CreditScrape`): A list of `CreditScrape` objects
that are the top cast in the title.
budget (:obj:`int`): The estimated budget for the title.
budget_denomination (:obj:`str`): The denomination the budget value is listed as
Expand Down
51 changes: 26 additions & 25 deletions pymdb/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
get_category,
get_company_id,
get_denomination,
get_episode_info,
get_name_id,
get_ref_marker,
get_title_id,
Expand Down Expand Up @@ -80,7 +81,7 @@ def get_title(self, title_id, include_taglines=False):
plot = None
storyline = None
production_companies = []
cast_members = []
top_cast = []
budget = None
budget_denomination = None
opening_weekend_gross = None
Expand Down Expand Up @@ -189,14 +190,28 @@ def get_title(self, title_id, include_taglines=False):
for cast_member in cast_node.css('tr.odd, tr.even'):
cast_member_node = cast_member.css_first('td:nth-of-type(2) > a')
if cast_member_node:
cast_member_id = get_name_id(cast_member_node)
# TODO: should this be modified to store a list of Credit objects?
#cast_member_name = cast_member_node.text().strip()
#character_nodes = cast_member.css('td.character > a')
#characters = []
#for c_node in character_nodes:
# characters.append(c_node.text().strip())
cast_members.append(cast_member_id)
character_credit = None
episode_count = None
episode_year_start = None
episode_year_end = None
character_node = cast_member.css_first('td.character')
if character_node:
# Check if there is episode information, save it, then remove it
episode_info_node = character_node.css_first('a.toggle-episodes')
if episode_info_node:
episode_count, episode_year_start, episode_year_end = get_episode_info(episode_info_node)
episode_info_node.decompose()
character_credit = re.sub(r'\s+', ' ', character_node.text().strip())
top_cast.append(
CreditScrape(
name_id=get_name_id(cast_member_node),
title_id=title_id,
job_title='actor',
credit=character_credit,
episode_count=episode_count,
episode_year_start=episode_year_start,
episode_year_end=episode_year_end
))

# Get season and episode numbers if TV episode
heading_nodes = tree.css('div.bp_heading')
Expand Down Expand Up @@ -229,7 +244,7 @@ def get_title(self, title_id, include_taglines=False):
plot=plot,
storyline=storyline,
production_companies=production_companies,
top_cast=cast_members,
top_cast=top_cast,
budget=budget,
budget_denomination=budget_denomination,
opening_weekend_gross=opening_weekend_gross,
Expand Down Expand Up @@ -273,21 +288,7 @@ def get_full_cast(self, title_id, include_episodes=False):
# Check if this is a TV series
toggle_episodes_node = cast_member.css_first('a.toggle-episodes')
if toggle_episodes_node:
episode_info = re.sub(
r'<\s*span.*?<\s*/\s*span\s*>', '', toggle_episodes_node.text()
).strip().split(',')
if len(episode_info) > 1:
episode_count, episode_year_info = episode_info
episode_year_info = episode_year_info.split('-')
if len(episode_year_info) > 1:
episode_year_start, episode_year_end = episode_year_info
else:
episode_year_start, = episode_year_info
else:
episode_count, = episode_info
episode_count_match = re.search(r'\d+', episode_count)
if episode_count_match:
episode_count = episode_count_match.group(0)
episode_count, episode_year_start, episode_year_end = get_episode_info(toggle_episodes_node)

# Include all individual episodes an actor is in
if include_episodes:
Expand Down
76 changes: 38 additions & 38 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,21 +49,21 @@ def test_get_title_movie(self):
'''
production_companies = ['co0071326', 'co0000756']
top_cast = [
'nm0000434',
'nm0000148',
'nm0000402',
'nm0001088',
'nm0000027',
'nm0000355',
'nm0048652',
'nm0562679',
'nm0001190',
'nm0114436',
'nm0292235',
'nm0701023',
'nm0567018',
'nm0125952',
'nm0377120'
CreditScrape('nm0000434', title_id, 'actor', 'Luke Skywalker', None, None, None),
CreditScrape('nm0000148', title_id, 'actor', 'Han Solo', None, None, None),
CreditScrape('nm0000402', title_id, 'actor', 'Princess Leia Organa', None, None, None),
CreditScrape('nm0001088', title_id, 'actor', 'Grand Moff Tarkin', None, None, None),
CreditScrape('nm0000027', title_id, 'actor', 'Ben Obi-Wan Kenobi', None, None, None),
CreditScrape('nm0000355', title_id, 'actor', 'C-3PO', None, None, None),
CreditScrape('nm0048652', title_id, 'actor', 'R2-D2', None, None, None),
CreditScrape('nm0562679', title_id, 'actor', 'Chewbacca', None, None, None),
CreditScrape('nm0001190', title_id, 'actor', 'Darth Vader', None, None, None),
CreditScrape('nm0114436', title_id, 'actor', 'Uncle Owen', None, None, None),
CreditScrape('nm0292235', title_id, 'actor', 'Aunt Beru', None, None, None),
CreditScrape('nm0701023', title_id, 'actor', 'Chief Jawa', None, None, None),
CreditScrape('nm0567018', title_id, 'actor', 'General Dodonna', None, None, None),
CreditScrape('nm0125952', title_id, 'actor', 'General Willard', None, None, None),
CreditScrape('nm0377120', title_id, 'actor', 'Red Leader (as Drewe Hemley)', None, None, None)
]
budget = 11000000
budget_denomination = 'USD'
Expand Down Expand Up @@ -128,14 +128,14 @@ def test_get_title_tv_series(self):
'''
production_companies = ['co0223402', 'co0056447', 'co0159275']
top_cast = [
'nm0921942',
'nm0005408',
'nm0224007',
'nm0534134',
'nm0482851',
'nm0866300',
'nm0005606',
'nm0379114'
CreditScrape('nm0921942', title_id, 'actor', 'Philip J. Fry / ...', 124, 1999, 2013),
CreditScrape('nm0005408', title_id, 'actor', 'Turanga Leela / ...', 124, 1999, 2013),
CreditScrape('nm0224007', title_id, 'actor', 'Bender / ...', 124, 1999, 2013),
CreditScrape('nm0534134', title_id, 'actor', 'Linda / ...', 124, 1999, 2013),
CreditScrape('nm0482851', title_id, 'actor', 'Hermes Conrad / ...', 117, 1999, 2013),
CreditScrape('nm0866300', title_id, 'actor', 'Amy Wong / ...', 115, 1999, 2013),
CreditScrape('nm0005606', title_id, 'actor', 'Morbo / ...', 114, 1999, 2013),
CreditScrape('nm0379114', title_id, 'actor', 'Scruffy / ...', 107, 1999, 2013)
]
budget = None
budget_denomination = None
Expand Down Expand Up @@ -201,21 +201,21 @@ def test_get_title_tv_episode(self):
'''
production_companies = ['co0335036', 'co0418998', 'co0343278']
top_cast = [
'nm0227759',
'nm3229685',
'nm3592338',
'nm0192377',
'nm3849842',
'nm0318821',
'nm0396924',
'nm2812026',
'nm1970465',
'nm3701064',
'nm0654295',
'nm0401264',
'nm2760664',
'nm2247629',
'nm1613839'
CreditScrape('nm0227759', title_id, 'actor', 'Tyrion Lannister', None, None, None),
CreditScrape('nm3229685', title_id, 'actor', 'Jon Snow', None, None, None),
CreditScrape('nm3592338', title_id, 'actor', 'Daenerys Targaryen', None, None, None),
CreditScrape('nm0192377', title_id, 'actor', 'Davos Seaworth', None, None, None),
CreditScrape('nm3849842', title_id, 'actor', 'Sansa Stark', None, None, None),
CreditScrape('nm0318821', title_id, 'actor', 'Petyr \'Littlefinger\' Baelish', None, None, None),
CreditScrape('nm0396924', title_id, 'actor', 'Melisandre (as Carice Van Houten)', None, None, None),
CreditScrape('nm2812026', title_id, 'actor', 'Missandei', None, None, None),
CreditScrape('nm1970465', title_id, 'actor', 'Tormund Giantsbane', None, None, None),
CreditScrape('nm3701064', title_id, 'actor', 'Ramsay Bolton', None, None, None),
CreditScrape('nm0654295', title_id, 'actor', 'Theon Greyjoy', None, None, None),
CreditScrape('nm0401264', title_id, 'actor', 'Daario Naharis', None, None, None),
CreditScrape('nm2760664', title_id, 'actor', 'Grey Worm', None, None, None),
CreditScrape('nm2247629', title_id, 'actor', 'Yara Greyjoy', None, None, None),
CreditScrape('nm1613839', title_id, 'actor', 'Wun Wun', None, None, None)
]
budget = 10000000
budget_denomination = 'GBP'
Expand Down

0 comments on commit 7b77e1b

Please sign in to comment.