Skip to content

Commit

Permalink
#4 Add new method get_full_credits_as_dict to replace as_generators
Browse files Browse the repository at this point in the history
  • Loading branch information
zembrodt committed Nov 29, 2019
1 parent 380aa69 commit c2ac3c5
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 69 deletions.
3 changes: 2 additions & 1 deletion docs/modules/scraper.rst
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ pymdb.scraper module
PyMDbScraper
------------
.. autoclass:: PyMDbScraper
:members:
:members:
:private-members:
104 changes: 36 additions & 68 deletions pymdb/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import re
import requests
from collections import defaultdict
from selectolax.parser import HTMLParser
from pymdb.exceptions import InvalidCompanyId
from pymdb.models import (
Expand Down Expand Up @@ -42,10 +43,6 @@ class PyMDbScraper:
"""Scrapes various information from IMDb web pages.
Contains functions for various IMDb pages and scrapes information into Python classes.
Args:
as_generators (:obj:`bool`): Determines if the objects scraped from IMDb should be returned as
generators or all at once as lists or dictionaries.
"""

_headers = {
Expand All @@ -54,8 +51,8 @@ class PyMDbScraper:
'accept': 'text/html,application/xhtml+xml,application/xml'
}

def __init__(self, as_generators=True):
self._as_generators = as_generators
def __init__(self):
pass

def get_title(self, title_id, include_taglines=False):
"""Scrapes information from the IMDb web page for the specified title.
Expand Down Expand Up @@ -284,16 +281,6 @@ def get_full_cast(self, title_id, include_episodes=False):
HTTPError: If a request failed.
"""

if self._as_generators:
for credit in self._get_full_cast(title_id, include_episodes):
yield credit
else:
actors = []
for credit in self._get_full_cast(title_id, include_episodes):
actors.append(credit)
return actors

def _get_full_cast(self, title_id, include_episodes):
request = f'https://www.imdb.com/title/{title_id}/fullcredits'
tree = self._get_tree(request)

Expand Down Expand Up @@ -381,19 +368,6 @@ def get_full_crew(self, title_id):
HTTPError: If the request failed.
"""

if self._as_generators:
for crew_member in self._get_full_crew(title_id):
yield crew_member
else:
crew = dict()
for crew_member in self._get_full_crew(title_id):
if crew_member.job_title in crew:
crew[crew_member.job_title].append(crew_member)
else:
crew[crew_member.job_title] = [crew_member]
return crew

def _get_full_crew(self, title_id):
request = f'https://www.imdb.com/title/{title_id}/fullcredits'
tree = self._get_tree(request)

Expand All @@ -404,6 +378,10 @@ def _get_full_crew(self, title_id):
for node in credits_node.iter():
if not found_title:
if node.tag == 'h4' and node.id is None:
# Remove any extra info in a span next to the job title
title_span_node = node.css_first('span')
if title_span_node:
title_span_node.decompose()
title = node.text().strip()
if len(title) > 0:
found_title = True
Expand Down Expand Up @@ -483,20 +461,40 @@ def get_full_credits(self, title_id, include_episodes=False):
HTTPError: If the request failed.
"""

if self._as_generators:
for credit in self._get_full_credits(title_id, include_episodes):
yield credit
else:
full_credits = self._get_full_crew(title_id)
full_credits[_ACTOR] = self._get_full_cast(title_id, include_episodes)
return full_credits

def _get_full_credits(self, title_id, include_episodes):
for cast_member in self.get_full_cast(title_id, include_episodes=include_episodes):
yield cast_member
for crew_member in self.get_full_crew(title_id):
yield crew_member

def get_full_credits_as_dict(self, title_id, include_episodes=False):
"""Scrapes the full list of credited people for a title into a dictionary.
Builds a dictionary with `job_title` as key of lists of :class:`~.models.title.CreditScrape`
objects. Uses the results of the :obj:`~.scraper.PyMDbScraper.get_full_credits` method
to gather the objects.
An optional argument `include_episodes` will also scrape each episode an actor is in
if the title is a TV series.
Args:
title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
include_episodes (:obj:`bool`, optional): Specify if individual episodes of a
TV series should also be scraped.
Returns:
:obj:`dict` of :obj:`list` of :class:`~.models.title.CreditScrape`: A dictionary where
each key is a :obj:`str` of a `job_title` and the value is a :obj:`list` of
:class:`~.models.title.CreditScrape` objects who's `job_title` value is the same
as the key.
Raises:
HTTPError: If the request failed.
"""

full_credits = defaultdict(list)
for credit in self.get_full_credits(title_id, include_episodes):
full_credits[credit.job_title].append(credit)
return full_credits

def get_name(self, name_id, include_known_for_titles=False):
"""Scrapes detailed information from a person's personal IMDb web page.
Expand Down Expand Up @@ -618,16 +616,6 @@ def get_name_credits(self, name_id, include_episodes=False):
HTTPError: If a request failed.
"""

if self._as_generators:
for credit in self._get_name_credits(name_id, include_episodes):
yield credit
else:
name_credits = []
for credit in self._get_name_credits(name_id, include_episodes):
name_credits.append(credit)
return name_credits

def _get_name_credits(self, name_id, include_episodes):
request = f'https://www.imdb.com/name/{name_id}/'
tree = self._get_tree(request)

Expand Down Expand Up @@ -739,16 +727,6 @@ def get_company(self, company_id):
InvalidCompanyId: If an invalid company ID was given.
"""

if self._as_generators:
for title in self._get_company(company_id):
yield title
else:
company_titles = []
for title in self._get_company(company_id):
company_titles.append(title)
return company_titles

def _get_company(self, company_id):
index = 1
finding_titles = True
while finding_titles:
Expand Down Expand Up @@ -832,16 +810,6 @@ def get_company_credits(self, title_id):
HTTPError: If the request failed.
"""

if self._as_generators:
for company in self._get_company_credits(title_id):
yield company
else:
company_credits = []
for company in self._get_company_credits(title_id):
company_credits.append(company)
return company_credits

def _get_company_credits(self, title_id):
request = f'https://www.imdb.com/title/{title_id}/companycredits'
tree = self._get_tree(request)

Expand Down
32 changes: 32 additions & 0 deletions tests/test_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
from pymdb.models.name import (
_ACTOR,
_ART_DEPARTMENT,
_ASSISTANT_DIRECTOR,
_CAMERA_AND_ELECTRICAL_DEPARTMENT,
_CINEMATOGRAPHY,
_DIRECTOR,
_MUSIC,
_MUSIC_DEPARTMENT,
Expand Down Expand Up @@ -866,6 +869,35 @@ def test_get_full_credits_tv_episode(self):
self.assertEqual(credit.credit, correct_credit.credit)
self.assertEqual(credit_types_count, len(credit_types))

def test_get_full_credits_as_dict(self):
title_id = 'tt3359412'
scraper = PyMDbScraper()

# Correct results
directors = ['nm6113438']
writers = ['nm6113440']
cast = ['nm4043618', 'nm6113439']
cinematographers = ['nm6113438']
assistant_directors = ['nm6113441']
art_department = ['nm6294206']
camera_department = ['nm6113442']

actual_results = scraper.get_full_credits_as_dict(title_id)
for director in actual_results[_DIRECTOR]:
self.assertTrue(director.name_id in directors)
for writer in actual_results[_WRITER]:
self.assertTrue(writer.name_id in writers)
for cast_member in actual_results[_ACTOR]:
self.assertTrue(cast_member.name_id in cast)
for cinematographer in actual_results[_CINEMATOGRAPHY]:
self.assertTrue(cinematographer.name_id in cinematographers)
for assistant_director in actual_results[_ASSISTANT_DIRECTOR]:
self.assertTrue(assistant_director.name_id in assistant_directors)
for crew_member in actual_results[_ART_DEPARTMENT]:
self.assertTrue(crew_member.name_id in art_department)
for crew_member in actual_results[_CAMERA_AND_ELECTRICAL_DEPARTMENT]:
self.assertTrue(crew_member.name_id in camera_department)

def test_get_full_credits_bad_request(self):
title_id = 'nm123456789'
scraper = PyMDbScraper()
Expand Down

0 comments on commit c2ac3c5

Please sign in to comment.