#6 Changed top_cast in TitleScrape to list of CreditScrape objects

zembrodt · Nov 21, 2019 · 7b77e1b
1 parent 8e58615
commit 7b77e1b
Show file tree

Hide file tree

Showing 4 changed files with 80 additions and 65 deletions.
diff --git a/pymdb/models/name.py b/pymdb/models/name.py
@@ -10,6 +10,7 @@
     is_int,
     to_datetime
 )
+from functools import total_ordering
 
 
 class NameBasics:
@@ -87,6 +88,7 @@ def __str__(self):
             f'{"" if self._death_year is None else self._death_year}'
 
 
+@total_ordering
 class CreditScrape:
     """Object to represent information for each person scraped from IMDb's `fullcredits` page for a title.
 
@@ -162,8 +164,20 @@ def episode_year_end(self, value):
         if is_int(value):
             self._episode_year_end = int(value)
 
+    def __eq__(self, other):
+        """Check whether `other` is a `CreditScrape` holding the same field values.
+
+        The fields compared are `name_id`, `title_id`, `job_title`, `credit`,
+        `episode_count`, `episode_year_start`, and `episode_year_end`.
+
+        Args:
+            other: The object to compare against.
+
+        Returns:
+            :obj:`bool`: Whether every compared field is equal. `NotImplemented` is returned
+            when `other` is not a `CreditScrape`, so Python falls back to its default handling
+            (`==` evaluates to `False`) instead of raising `AttributeError`.
+        """
+        # NOTE(review): unless `__hash__` is defined elsewhere in this class, overriding
+        # `__eq__` leaves instances unhashable (unusable in sets or as dict keys).
+        if not isinstance(other, CreditScrape):
+            return NotImplemented
+        return (self.name_id, self.title_id, self.job_title, self.credit, self.episode_count, self.episode_year_start, self.episode_year_end) == \
+            (other.name_id, other.title_id, other.job_title, other.credit, other.episode_count, other.episode_year_start, other.episode_year_end)
+
+    def __lt__(self, other):
+        """Order credits field by field, sorting a missing (`None`) field before any value.
+
+        Fields are compared in the order `name_id`, `title_id`, `job_title`, `credit`,
+        `episode_count`, `episode_year_start`, `episode_year_end`; the first field that
+        differs decides the result. Together with `__eq__` this is the method that
+        `functools.total_ordering` uses to derive the remaining comparisons.
+
+        Args:
+            other: The object to compare against.
+
+        Returns:
+            :obj:`bool`: `True` if this credit sorts strictly before `other`. `NotImplemented`
+            is returned when `other` is not a `CreditScrape`.
+        """
+        if not isinstance(other, CreditScrape):
+            return NotImplemented
+
+        def sort_key(credit_scrape):
+            # Pair each field with a "has a value" flag: comparing `None` directly against
+            # a `str` or `int` raises `TypeError`, and fields such as `credit` and the
+            # episode information are frequently `None`.
+            fields = (
+                credit_scrape.name_id,
+                credit_scrape.title_id,
+                credit_scrape.job_title,
+                credit_scrape.credit,
+                credit_scrape.episode_count,
+                credit_scrape.episode_year_start,
+                credit_scrape.episode_year_end,
+            )
+            return tuple((field is not None, field) for field in fields)
+
+        return sort_key(self) < sort_key(other)
+
     def __str__(self):
-        return f'{self.name_id}: {self.job_title} in {self.title_id} as {self.credit}'
+        """Describe the credit on one line, appending the episode count when it is known."""
+        summary = f'{self.name_id}: {self.job_title} in {self.title_id} as {self.credit}'
+        if self.episode_count is None:
+            # No episode information was recorded, so there is no suffix to add
+            return summary
+        return summary + f" in {self.episode_count} episodes"
+
+    def __repr__(self):
+        """Return the same text as `__str__`, so credits inside containers print readably."""
+        return str(self)
 
 
 class NameScrape:

diff --git a/pymdb/models/title.py b/pymdb/models/title.py
@@ -443,7 +443,7 @@ class TitleScrape:
         storyline (:obj:`str`): The title's storyline.
         production_companies (:obj:`list` of :obj:`str`): A list of company IDs, used by IMDb and
             prefixed with `co`, that are credited with producing the title.
-        top_cast (:obj:`list` of :obj:`str`): A list of person IDs, used by IMDb and prefixed with `nm`,
+        top_cast (:obj:`list` of :obj:`~.models.name.CreditScrape`): A list of `CreditScrape` objects
             that are the top cast in the title.
         budget (:obj:`int`): The estimated budget for the title.
         budget_denomination (:obj:`str`): The denomination the budget value is listed as

diff --git a/pymdb/scraper.py b/pymdb/scraper.py
@@ -17,6 +17,7 @@
     get_category,
     get_company_id,
     get_denomination,
+    get_episode_info,
     get_name_id,
     get_ref_marker,
     get_title_id,
@@ -80,7 +81,7 @@ def get_title(self, title_id, include_taglines=False):
         plot = None
         storyline = None
         production_companies = []
-        cast_members = []
+        top_cast = []
         budget = None
         budget_denomination = None
         opening_weekend_gross = None
@@ -189,14 +190,28 @@ def get_title(self, title_id, include_taglines=False):
             for cast_member in cast_node.css('tr.odd, tr.even'):
                 cast_member_node = cast_member.css_first('td:nth-of-type(2) > a')
                 if cast_member_node:
-                    cast_member_id = get_name_id(cast_member_node)
-                    # TODO: should this be modified to store a list of Credit objects?
-                    #cast_member_name = cast_member_node.text().strip()
-                    #character_nodes = cast_member.css('td.character > a')
-                    #characters = []
-                    #for c_node in character_nodes:
-                    #    characters.append(c_node.text().strip())
-                    cast_members.append(cast_member_id)
+                    character_credit = None
+                    episode_count = None
+                    episode_year_start = None
+                    episode_year_end = None
+                    character_node = cast_member.css_first('td.character')
+                    if character_node:
+                        # Check if there is episode information, save it, then remove it
+                        episode_info_node = character_node.css_first('a.toggle-episodes')
+                        if episode_info_node:
+                            episode_count, episode_year_start, episode_year_end = get_episode_info(episode_info_node)
+                            episode_info_node.decompose()
+                        character_credit = re.sub(r'\s+', ' ', character_node.text().strip())
+                    top_cast.append(
+                        CreditScrape(
+                            name_id=get_name_id(cast_member_node),
+                            title_id=title_id,
+                            job_title='actor',
+                            credit=character_credit,
+                            episode_count=episode_count,
+                            episode_year_start=episode_year_start,
+                            episode_year_end=episode_year_end
+                    ))
 
         # Get season and episode numbers if TV episode
         heading_nodes = tree.css('div.bp_heading')
@@ -229,7 +244,7 @@ def get_title(self, title_id, include_taglines=False):
             plot=plot,
             storyline=storyline,
             production_companies=production_companies,
-            top_cast=cast_members,
+            top_cast=top_cast,
             budget=budget,
             budget_denomination=budget_denomination,
             opening_weekend_gross=opening_weekend_gross,
@@ -273,21 +288,7 @@ def get_full_cast(self, title_id, include_episodes=False):
                 # Check if this is a TV series
                 toggle_episodes_node = cast_member.css_first('a.toggle-episodes')
                 if toggle_episodes_node:
-                    episode_info = re.sub(
-                        r'<\s*span.*?<\s*/\s*span\s*>', '', toggle_episodes_node.text()
-                    ).strip().split(',')
-                    if len(episode_info) > 1:
-                        episode_count, episode_year_info = episode_info
-                        episode_year_info = episode_year_info.split('-')
-                        if len(episode_year_info) > 1:
-                            episode_year_start, episode_year_end = episode_year_info
-                        else:
-                            episode_year_start, = episode_year_info
-                    else:
-                        episode_count, = episode_info
-                    episode_count_match = re.search(r'\d+', episode_count)
-                    if episode_count_match:
-                        episode_count = episode_count_match.group(0)
+                    episode_count, episode_year_start, episode_year_end = get_episode_info(toggle_episodes_node)
 
                     # Include all individual episodes an actor is in
                     if include_episodes:

diff --git a/tests/test_scraper.py b/tests/test_scraper.py
@@ -49,21 +49,21 @@ def test_get_title_movie(self):
             '''
         production_companies = ['co0071326', 'co0000756']
         top_cast = [
-            'nm0000434',
-            'nm0000148',
-            'nm0000402',
-            'nm0001088',
-            'nm0000027',
-            'nm0000355',
-            'nm0048652',
-            'nm0562679',
-            'nm0001190',
-            'nm0114436',
-            'nm0292235',
-            'nm0701023',
-            'nm0567018',
-            'nm0125952',
-            'nm0377120'
+            CreditScrape('nm0000434', title_id, 'actor', 'Luke Skywalker', None, None, None),
+            CreditScrape('nm0000148', title_id, 'actor', 'Han Solo', None, None, None),
+            CreditScrape('nm0000402', title_id, 'actor', 'Princess Leia Organa', None, None, None),
+            CreditScrape('nm0001088', title_id, 'actor', 'Grand Moff Tarkin', None, None, None),
+            CreditScrape('nm0000027', title_id, 'actor', 'Ben Obi-Wan Kenobi', None, None, None),
+            CreditScrape('nm0000355', title_id, 'actor', 'C-3PO', None, None, None),
+            CreditScrape('nm0048652', title_id, 'actor', 'R2-D2', None, None, None),
+            CreditScrape('nm0562679', title_id, 'actor', 'Chewbacca', None, None, None),
+            CreditScrape('nm0001190', title_id, 'actor', 'Darth Vader', None, None, None),
+            CreditScrape('nm0114436', title_id, 'actor', 'Uncle Owen', None, None, None),
+            CreditScrape('nm0292235', title_id, 'actor', 'Aunt Beru', None, None, None),
+            CreditScrape('nm0701023', title_id, 'actor', 'Chief Jawa', None, None, None),
+            CreditScrape('nm0567018', title_id, 'actor', 'General Dodonna', None, None, None),
+            CreditScrape('nm0125952', title_id, 'actor', 'General Willard', None, None, None),
+            CreditScrape('nm0377120', title_id, 'actor', 'Red Leader (as Drewe Hemley)', None, None, None)
         ]
         budget = 11000000
         budget_denomination = 'USD'
@@ -128,14 +128,14 @@ def test_get_title_tv_series(self):
             '''
         production_companies = ['co0223402', 'co0056447', 'co0159275']
         top_cast = [
-            'nm0921942',
-            'nm0005408',
-            'nm0224007',
-            'nm0534134',
-            'nm0482851',
-            'nm0866300',
-            'nm0005606',
-            'nm0379114'
+            CreditScrape('nm0921942', title_id, 'actor', 'Philip J. Fry / ...', 124, 1999, 2013),
+            CreditScrape('nm0005408', title_id, 'actor', 'Turanga Leela / ...', 124, 1999, 2013),
+            CreditScrape('nm0224007', title_id, 'actor', 'Bender / ...', 124, 1999, 2013),
+            CreditScrape('nm0534134', title_id, 'actor', 'Linda / ...', 124, 1999, 2013),
+            CreditScrape('nm0482851', title_id, 'actor', 'Hermes Conrad / ...', 117, 1999, 2013),
+            CreditScrape('nm0866300', title_id, 'actor', 'Amy Wong / ...', 115, 1999, 2013),
+            CreditScrape('nm0005606', title_id, 'actor', 'Morbo / ...', 114, 1999, 2013),
+            CreditScrape('nm0379114', title_id, 'actor', 'Scruffy / ...', 107, 1999, 2013)
         ]
         budget = None
         budget_denomination = None
@@ -201,21 +201,21 @@ def test_get_title_tv_episode(self):
             '''
         production_companies = ['co0335036', 'co0418998', 'co0343278']
         top_cast = [
-            'nm0227759',
-            'nm3229685',
-            'nm3592338',
-            'nm0192377',
-            'nm3849842',
-            'nm0318821',
-            'nm0396924',
-            'nm2812026',
-            'nm1970465',
-            'nm3701064',
-            'nm0654295',
-            'nm0401264',
-            'nm2760664',
-            'nm2247629',
-            'nm1613839'
+            CreditScrape('nm0227759', title_id, 'actor', 'Tyrion Lannister', None, None, None),
+            CreditScrape('nm3229685', title_id, 'actor', 'Jon Snow', None, None, None),
+            CreditScrape('nm3592338', title_id, 'actor', 'Daenerys Targaryen', None, None, None),
+            CreditScrape('nm0192377', title_id, 'actor', 'Davos Seaworth', None, None, None),
+            CreditScrape('nm3849842', title_id, 'actor', 'Sansa Stark', None, None, None),
+            CreditScrape('nm0318821', title_id, 'actor', 'Petyr \'Littlefinger\' Baelish', None, None, None),
+            CreditScrape('nm0396924', title_id, 'actor', 'Melisandre (as Carice Van Houten)', None, None, None),
+            CreditScrape('nm2812026', title_id, 'actor', 'Missandei', None, None, None),
+            CreditScrape('nm1970465', title_id, 'actor', 'Tormund Giantsbane', None, None, None),
+            CreditScrape('nm3701064', title_id, 'actor', 'Ramsay Bolton', None, None, None),
+            CreditScrape('nm0654295', title_id, 'actor', 'Theon Greyjoy', None, None, None),
+            CreditScrape('nm0401264', title_id, 'actor', 'Daario Naharis', None, None, None),
+            CreditScrape('nm2760664', title_id, 'actor', 'Grey Worm', None, None, None),
+            CreditScrape('nm2247629', title_id, 'actor', 'Yara Greyjoy', None, None, None),
+            CreditScrape('nm1613839', title_id, 'actor', 'Wun Wun', None, None, None)
         ]
         budget = 10000000
         budget_denomination = 'GBP'