diff --git a/.gitignore b/.gitignore index 0c2217c..f5f8987 100644 --- a/.gitignore +++ b/.gitignore @@ -21,7 +21,7 @@ hs_err_pid* backend/node_modules/ # config file and various api keys -data/config.py +data/public_data/config.py # python compiles data/__pycache__/ diff --git a/.travis.yml b/.travis.yml index 1172a3f..49b183e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,19 +24,13 @@ before_script: - cd .. before_install: - cd data - - python3 -V - - python3 -c "import sys;print('\n'.join(sys.path))" - sudo apt-get -y install python3-pip - pip3 install bs4 - pip3 install lxml - - pip3 list - - pip3 show lxml - cd .. install: - cd backend - npm install script: - npm test - - cd .. - - cd data - - python3 -m unittest test/test_moviedata.py + diff --git a/data/cron.py b/data/cron.py index ee24d6d..949f194 100644 --- a/data/cron.py +++ b/data/cron.py @@ -1,14 +1,23 @@ -import etl.etlprocessor as processor +from controller import ETLController from apscheduler.schedulers.blocking import BlockingScheduler +import logging + + if __name__ == '__main__': + logging.basicConfig(level=logging.WARNING) scheduler = BlockingScheduler() - processor = processor.ETLProcessor() - scheduler.add_job(processor.update_movie_data, args=[1, 1000000, 0]) - scheduler.add_job(processor.update_movie_data, args=[1000000, 2000000, 5]) - # scheduler.add_job(processor.update_movie_data, args=[2000000, 3000000, 10]) - # scheduler.add_job(processor.update_movie_data, args=[3000000, 4000000, 15]) - # scheduler.add_job(processor.update_movie_data, args=[4000000, 5000000, 20]) - # scheduler.add_job(processor.update_movie_data, args=[5000000, 6000000, 25]) - # scheduler.add_job(processor.update_movie_data, args=[6000000, 7000000, 30]) + + controller = ETLController() + scheduler.add_job(controller.update_movie_data, args=[321535, 1000000, 0]) + # scheduler.add_job(controller.update_movie_data, args=[1172158, 2000000, 5]) + # scheduler.add_job(controller.update_movie_data, args=[2033967, 3000000, 
10]) + # scheduler.add_job(controller.update_movie_data, args=[3052760, 4000000, 15]) + scheduler.start() + +# +# if __name__ == '__main__': +# logging.basicConfig(level=logging.WARNING) +# controller = ETLController() +# controller.update_movie_rating() diff --git a/data/etl/cinemalist.py b/data/etl/cinemalist.py deleted file mode 100644 index 0c75255..0000000 --- a/data/etl/cinemalist.py +++ /dev/null @@ -1,96 +0,0 @@ -from bs4 import BeautifulSoup -from urllib import request, error -from selenium import webdriver -from string import capwords - - -class CinemaList: - - gv_cinema_list_home = "https://www.gv.com.sg/GVCinemas" - - cathay_cinema_list_home = "http://www.cathaycineplexes.com.sg/cinemas/" - - sb_cinema_list_home = "http://www.shaw.sg/sw_cinema.aspx" - - def __init__(self): - self.driver = webdriver.PhantomJS() - - def get_golden_village_cinema_list(self): - """Get a list of dictionaries contain all Golden Village - cinema names, and their corresponding url. - """ - url = self.gv_cinema_list_home - - cinema_list = [] - - # get raw cinema list - raw_cinema_url = [] - self.driver.get(url) - anchors = self.driver.find_element_by_class_name("cinemas-list").find_elements_by_class_name("ng-binding") - for anchor in anchors: - raw_cinema_url.append(anchor.get_attribute("href")) - - # get actual list, in each url it may contain more than one cinema - for cinema_url in raw_cinema_url: - self.driver = webdriver.PhantomJS() # reinstantiate to avoid detach from DOM - self.driver.get(cinema_url) - div = self.driver.find_elements_by_class_name("ng-binding") - for item in div: - if item.get_attribute("ng-bind-html") == "cinema.name": - cinema_name = item.text - self.insert_cinema_data(cinema_list, cinema_name, cinema_url) - return cinema_list - - def get_cathay_cinema_list(self): - """Get a list of dictionaries contain all cathay cinema names. 
- It's corresponding url is None because cathay does not show movies - schedule based on individual cinemas in their web page layouts. - """ - cinema_list = [] - - url = self.cathay_cinema_list_home - web_content = request.urlopen(url).read().decode("utf-8") - soup = BeautifulSoup(web_content, "lxml") - divs = soup.find_all("div", {"class": "description"}) - for div in divs: - cinema_name = capwords(div.find("h1").text) - self.insert_cinema_data(cinema_list, cinema_name, "http://www.cathaycineplexes.com.sg/showtimes/") - return cinema_list - - def get_shaw_brother_cinema_list(self): - """Get a list of dictionaries contain all SB cinema names, - and their corresponding urls - """ - name_list = [] - url_list = [] - cinema_list = [] - - url = self.sb_cinema_list_home - web_content = request.urlopen(url).read().decode("utf-8") - soup = BeautifulSoup(web_content, "lxml") - divs = soup.find_all("a", {"class": "txtHeaderBold"}) - for div in divs: - name_list.append(div.text) - - buy_tickets = soup.find_all("a", {"class": "txtNormalDim"}) - for item in buy_tickets: - current_link = item["href"] - if "buytickets" in current_link: - url_list.append("www.shaw.sg/" + item["href"]) - - assert len(name_list) == len(url_list) # check whether there is misake in matching cinema name and url - - for i in range(len(name_list)): - self.insert_cinema_data(cinema_list, name_list[i], url_list[i]) - - return cinema_list - - @staticmethod - def insert_cinema_data(cinema_list, cinema_name, cinema_url): - inserted_tuple = { - "url": cinema_url, - "cinema_name": cinema_name - } - cinema_list.append(inserted_tuple) - - diff --git a/data/etl/etlprocessor.py b/data/etl/etlprocessor.py deleted file mode 100644 index 92fe7f5..0000000 --- a/data/etl/etlprocessor.py +++ /dev/null @@ -1,102 +0,0 @@ -""" - Core objective of this etl framework. This is the highest level API. - Each one of them will be run in backend on server, at desginated - time intervals. 
- - It includes four main methods in total: - 1. update movie data - 2. update movie public rating - 3. update the list of cinemas in Singapore - 4. update cinema schedule for each cinema available -""" -import etl.extractor as extractor -import etl.transformer as transformer -import etl.loader as loader - -import utils -import psycopg2 -import time - - -from urllib import error - - -class ETLProcessor: - - def __init__(self): - self.logger = utils.initialise_logger() - self.logger.info("Initialise ETL process ...") - - self.extractor = extractor.Extractor(self.logger) - self.loader = loader.Loader(self.logger) - self.transformer = transformer.Transformer(self.logger) - - def update_movie_data(self, lower, upper, delay): - """updates movie data from databases (potentially more than one source) - it is a one time process, i.e. data will not be updated constantly - """ - self.logger.info("Initialise movie data retrieval process ...") - - time.sleep(delay) # delay to avoid conflict - existing_movies_id = self.loader.get_movie_id_list() - - for index in range(lower, upper): # iterate all possible titles - imdb_id = utils.imdb_id_builder(index) - - if imdb_id in existing_movies_id: - continue - - try: - movie_data = self.extractor.extract_movie_data(imdb_id) - except error.HTTPError: - self.logger.error("Movie ID is not valid." + imdb_id) - continue - except Exception as e: # need to find out the exact error type - self.logger.error("Movie ID type is not registered." + imdb_id) - self.logger.error(e) - continue - - try: - self.loader.load_movie_data(movie_data) - except psycopg2.DataError: - self.logger.error("Invalid insertion! 
Due to the subtext are partially parsed.") - continue - - self.logger.info("Movie data update process complete.") - - def update_movie_rating(self): - """updates movie rating from popcorn movies (may have to change to raaw implementation in the future) - it is a continuous process and data will be updated constantly - """ - self.logger.info("Initialise movie rating update process ...") - - # get list of existing movies - id_list = self.loader.get_movie_id_list() - - self.logger.info("Movie rating update process complete.") - - def update_cinema_schedule(self): - """ - updates movie rating from various theatres official page - it is a continuous process and data will be updated constantly - """ - self.logger.info("Initialise movie showing update process ...") - # get a list of cinemas - cinema_list = self.loader.get_cinema_list() # [0]:cimena_id, [1]:cinema_name, [2]:url - # for each cinema - for cinema in cinema_list: - self.extractor.extract_cinema_schedule(cinema) - - # get all schedules - # load into database based on cinema id and imdb id - break - - self.logger.info("Movie showing update process complete.") - - def update_cinema_list(self): - """update cinema list from various theatres websites""" - self.logger.info("Initialise cinema list update process ...") - cinema_list = self.extractor.extract_cinema_list() - self.loader.load_cinema_list(cinema_list) - self.logger.info("Cinema list update process complete.") - diff --git a/data/etl/extractor.py b/data/etl/extractor.py deleted file mode 100644 index b25c108..0000000 --- a/data/etl/extractor.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Façade class for various lower level extractors""" -from etl.moviedata import MovieData -from etl.movierating import MovieRating -from etl.cinemalist import CinemaList -from etl.movieshowing import MovieShowing - - -class Extractor: - - def __init__(self, logger): - self.logger = logger - - @staticmethod - def extract_movie_data(movie_id): - """given imdb_id, return the metadata of 
that movie from imdb""" - data_model = MovieData(movie_id) - data_model.build_soup(data_model.get_html_content()) - data_model.extract_process() - return data_model.get_movie_data() - - @staticmethod - def extract_movie_rating(movie_id): - """given imdb_id, return a list of dictionaries that contain respective - rating and votes from each ratings sources - """ - data_model = MovieRating(movie_id) - return data_model.get_movie_ratings() - - @staticmethod - def extract_cinema_list(): - """return a list of dictionaries contains all the cinema names and its - respective urls - """ - data_model = CinemaList() - final_list = [] - final_list.extend(data_model.get_golden_village_cinema_list()) - final_list.extend(data_model.get_cathay_cinema_list()) - final_list.extend(data_model.get_shaw_brother_cinema_list()) - return final_list - - @staticmethod - def extract_cinema_schedule(cinema): - data_model = MovieShowing(cinema) - data_model.extract_cinema_schedule() - return - - - - - - - diff --git a/data/etl/loader.py b/data/etl/loader.py deleted file mode 100644 index e6252ba..0000000 --- a/data/etl/loader.py +++ /dev/null @@ -1,77 +0,0 @@ -"""handles all interactions with database""" -import config -import psycopg2 -import logging - - -class Loader: - - def __init__(self, logger): - self.cursor, self.conn = config.database_connection() - self.logger = logger - - # ======== - # LOAD - # ======== - def load_movie_data(self, movie_data): - try: - self.cursor.execute("INSERT INTO movies (movie_id, title, production_year, rated, plot, actors, " - "language, country, runtime, poster_url, genre, director, released, type) " - "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", - (movie_data['movie_id'], movie_data['title'], movie_data['production_year'], - movie_data['rated'], movie_data['plot'], movie_data['actors'], movie_data['language'], - movie_data['country'], movie_data['runtime'], movie_data['poster_url'], - movie_data['genre'], movie_data['director'], 
movie_data['released'], movie_data['type'])) - self.conn.commit() - except psycopg2.IntegrityError: - logging.error("UNIQUE CONSTRAINT violated in Table: movies " + movie_data['movie_id']) - - def load_movie_rating(self, movie_rating): - self.cursor.execute("INSERT INTO public_ratings (vote, score, movie_id, source_id) VALUES (%s, %s, %s, %s) " - "ON CONFLICT (movie_id, source_id) " - "DO UPDATE SET (vote, score) = (%s, %s) " - "WHERE public_ratings.movie_id=%s AND public_ratings.source_id=%s", - (movie_rating['votes'], movie_rating['score'], movie_rating['movie_id'], - movie_rating['source_id'], movie_rating['votes'], movie_rating['score'], - movie_rating['movie_id'], movie_rating['source_id'])) - self.conn.commit() - - def load_cinema_list(self, cinema_list): - for cinema in cinema_list: - self.cursor.execute("INSERT INTO cinemas (cinema_name, url) VALUES (%s, %s) " - "ON CONFLICT (cinema_name) " - "DO UPDATE SET (cinema_name, url) = (%s, %s)" - "WHERE cinemas.cinema_name=%s", - (cinema['cinema_name'], cinema['url'], cinema['cinema_name'], cinema['url'], - cinema['cinema_name'])) - - self.conn.commit() - - def load_cinema_schedule(self, cinema_schedule): - pass - - # ======== - # GET - # ======== - def get_movie_id_list(self): - self.cursor.execute("SELECT movie_id FROM movies") - data_object = self.cursor.fetchall() - id_list = [] - for item in data_object: - id_list.append(item[0]) - return id_list - - def get_movie_validation_info(self, movie_id): - self.cursor.execute("SELECT title, released, director FROM movies WHERE movie_id=%s", (movie_id, )) - data_object = self.cursor.fetchone() - return data_object - - def get_cinema_list(self): - """return a list of tuples that contains the information of - each cinema""" - self.cursor.execute("SELECT * FROM cinemas") - data_object= self.cursor.fetchall() - cinema_list = [] - for item in data_object: - cinema_list.append(item) - return cinema_list diff --git a/data/etl/moviedata.py b/data/etl/moviedata.py deleted file 
mode 100644 index 5bc8670..0000000 --- a/data/etl/moviedata.py +++ /dev/null @@ -1,235 +0,0 @@ -""" - data class for all imdb movies -""" -from bs4 import BeautifulSoup -from urllib import request, error - -import lxml -import html -import utils - - -class MovieData: - - # statics - IMDB_URL_FORMAT = "http://www.imdb.com/title/{}/" - - title = None - production_year = None - rated = None - plot = None - actors = None - language = None - country = None - genre = None - poster_url = None - released = None - runtime = None - director = None - type = None - subtext = None - soup = None - - def __init__(self, imdb_id): - """ - it takes an imdb_id to instantiate a MovieData object, upon instantiation, - it will get relevant html content and store as instance attribute - :param imdb_id: - """ - self.imdb_id = imdb_id - - # main logic - def get_html_content(self): - """ - get html source based on imdb_id - :return: string - """ - url = self.IMDB_URL_FORMAT.format(self.imdb_id) - request_result = html.unescape(request.urlopen(url).read().decode("utf-8")) - return request_result - - def build_soup(self, request_result): - """ - build soup based on html content in string format - :param request_result: - :return: - """ - self.soup = BeautifulSoup(request_result, "lxml") # soup builder - - def build_soup_for_test(self, html_file_io_wrapper): - self.soup = BeautifulSoup(html_file_io_wrapper, "lxml") - - def extract_process(self): - """ - main logic for extraction of imdb data - :return: - """ - self.extract_title_and_year() - self.extract_poster() - self.extract_credits() - self.extract_plot() - self.extract_subtext() - self.extract_rated() - self.extract_genre() - self.extract_release() - self.extract_runtime() - - # get - def get_movie_data(self): - """ - return a dict that contains all data to extractor - :return: dictionary of data in various type - """ - movie_data = utils.get_movie_data_dict(self.actors, self.country, self.director, self.genre, self.imdb_id, - None, 
self.plot, self.poster_url, self.production_year, self.rated, - self.released, self.runtime, self.title, self.type) - return movie_data - - # extraction nodes - def extract_title_and_year(self): - """ - return title and production year of a movie - :return: title in string, production year in integer or None - """ - title_wrapper = self.soup.find("h1").text.split("\xa0") - self.title = title_wrapper[0] - self.production_year = title_wrapper[1].replace("(", "").replace(")", "").replace(" ", "") - if self.production_year == "": - self.production_year = None - return self.title, self.production_year - return self.title, int(self.production_year) - - def extract_poster(self): - """ - return the url of poster of one movie - :return: - """ - poster = self.soup.find("div", {"class": "poster"}) - try: - self.poster_url = poster.find("img")['src'] - except AttributeError: - self.poster_url = None - return self.poster_url - - def extract_credits(self): - """ - return the directors and actors of the movie. 
If there is more than - one director or actor, it will display a string with multiple tokens, - separated by comma - :return: credits info in string format or None - """ - credits_text = self.soup.find_all("div", {"class": "credit_summary_item"}) - for item in credits_text: - current_text = item.text - if "Directors:" in current_text: - self.director = current_text.replace("Directors:", "").split("|")[0]\ - .replace("\n", "").replace(" ", "").strip() - elif "Director:" in current_text: - self.director = current_text.replace("Director:", "").strip() - elif "Stars" in current_text: - self.actors = current_text.replace("Stars:", "").split("|")[0]\ - .replace("\n", "").replace(" ", "").strip() - elif "Star" in current_text: - self.actors = current_text.replace("Star:", "").strip() - return self.actors, self.director - - def extract_plot(self): - """ - return the plot of one movie - :return: plot in string format or None - """ - self.plot = self.soup.find("div", {"class": "summary_text"}).text.replace("\n", "").strip().split(" ")[0] - if "Add a Plot" in self.plot: - self.plot = None - return self.plot - - def extract_subtext(self): - """ - retrieve the subtext tag for other extraction nodes - :return: - """ - self.subtext = self.soup.find("div", {"class": "subtext"}) - - def extract_rated(self): - """ - return the rating of a movie - :return: - """ - metas = self.subtext.find_all("meta") - for meta in metas: - if meta['itemprop'] == "contentRating": - self.rated = meta['content'] - return self.rated - - def extract_release(self): - """ - parse the last token in subtext element. 
it determines the type of the object, - it may also determine the release date and country - :return: - """ - self.type = 'movie' # default movie type - anchors = self.subtext.find_all("a") - for anchor in anchors: - if anchor.has_attr('title'): - release_text = anchor.text - if "Episode aired" in release_text: - self.type = "episode" - release_text = release_text.replace("Episode aired", "").replace("\n", "").strip() - self.released = utils.transform_date_imdb(release_text) - elif "TV Series" in release_text: - self.type = "tv" - elif "TV Episode" in release_text: - self.type = "episode" - elif "TV Special" in release_text: - self.type = "tv-special" - release_text = release_text.replace("TV Special", "").replace("\n", "").strip() - self.released = utils.transform_date_imdb(release_text) - elif "Video Game" in release_text: - self.type = "video-game" - elif "Video game released" in release_text: - self.type = "video-game" - release_text = release_text.replace("Video game released", "").replace("\n", "").strip() - self.released = utils.transform_date_imdb(release_text) - elif "Video" in release_text: - self.type = "video" - release_text = release_text.replace("Video", "").replace("\n", "").strip() - self.released = utils.transform_date_imdb(release_text) - elif "TV Mini-Series" in release_text: - self.type = "tv-mini" - elif "TV Movie" in release_text: - self.type = "tv-movie" - release_text = release_text.replace("TV Movie", "").replace("\n", "").strip() - self.released = utils.transform_date_imdb(release_text) - elif "TV Short" in release_text: - self.type = "tv-short" - else: - release_text = release_text.replace("\n", "").strip() - self.released, self.country = utils.split_release_and_country_imdb(release_text) - self.released = utils.transform_date_imdb(self.released) - return self.released, self.country, self.type - - def extract_genre(self): - """ - parse the html content and return the genre of the movie - :return: - """ - genre_list = [] - spans = 
self.subtext.find_all("span", {"class": "itemprop"}) - for span in spans: - genre_list.append(span.text) - if len(genre_list) > 0: - self.genre = ", ".join(genre_list) - return self.genre - - def extract_runtime(self): - """ - parse the html content and return the runtime of the movie - :return: - """ - time_tag = self.subtext.find("time") - try: - time_text = time_tag['datetime'] - self.runtime = int(time_text.replace("PT", "").replace("M", "").replace(",", "")) - except TypeError: - return None - return self.runtime diff --git a/data/etl/movierating.py b/data/etl/movierating.py deleted file mode 100644 index cc68575..0000000 --- a/data/etl/movierating.py +++ /dev/null @@ -1,111 +0,0 @@ -from urllib import request, error -from bs4 import BeautifulSoup - -import utils -import json - - -class MovieRating: - - trakt_header = { - 'Content-Type': 'application/json', - 'trakt-api-version': '2', - 'trakt-api-key': '411a8f0219456de5e3e10596486c545359a919b6ebb10950fa86896c1a8ac99b' - } - - wemakesites_api_key = "5a7e0693-af96-4d43-89a3-dc8ca00cf355" - - imdb_url_format = "http://www.imdb.com/title/{}/" - - # omdb setup - omdb_plot_option = "full" # attribute for omdb - - omdb_content_type = "json" # return type for omdb requests - - # douban - douban_url_format = "https://movie.douban.com/subject_search?search_text={}" - metacritic_url_format = "http://www.metacritic.com/search/movie/{}/results" - - def __init__(self, movie_id): - self.movie_id = movie_id - - def get_movie_ratings(self): - movie_ratings = [] - - rating, votes = self.extract_trakt_rating() - movie_ratings.append(utils.get_movie_rating_dict(rating, votes, self.movie_id, 'Trakt')) - - rating, votes = self.extract_imdb_rating() - movie_ratings.append(utils.get_movie_rating_dict(rating, votes, self.movie_id, 'IMDb')) - - rating, votes = self.extract_douban_rating() - movie_ratings.append(utils.get_movie_rating_dict(rating, votes, self.movie_id, 'Douban')) - return movie_ratings - - def 
extract_trakt_rating(self): - """ - given imdb_id, return the current rating and total number of votes of this movie in trakt.tv database - :param movie_id: - :return: rating and votes in STRING format - """ - request_result = request.Request('https://api.trakt.tv/movies/{}/ratings'.format(self.movie_id), - headers=self.trakt_header) - try: - json_result = json.loads(request.urlopen(request_result).read().decode("utf-8")) - except error.HTTPError: - return None, None - - return str(json_result['rating']), str(json_result['votes']) - - def extract_imdb_rating(self): - """ - given imdb_id, return the current rating and total number of votes of this movie in imdb database - :param movie_id: - :return: rating and votes in STRING format - """ - url = self.imdb_url_format.format(self.movie_id) - request_result = request.urlopen(url).read() - soup = BeautifulSoup(request_result, "lxml") - div = soup.find('div', {'class': 'ratingValue'}) - - try: - parse_list = div.find("strong")['title'].split(" based on ") - except AttributeError: - return None, None - - rating = parse_list[0] - votes = parse_list[1].split(" ")[0].replace(",", "") - return rating, votes - - def extract_douban_rating(self): - """ - given imdb_id, return the current rating and total number of votes of this movie in douban database - :param movie_id: - :return: rating and votes in STRING format - """ - url = self.douban_url_format.format(self.movie_id) - request_result = request.urlopen(url).read() - soup = BeautifulSoup(request_result, "lxml") - - try: - rating = soup.find("span", {'class': 'rating_nums'}).text - votes = soup.find("span", {'class': 'pl'}).text.replace("人评价","")[1: -1].replace(",", "") # remove parenthesis and words - except AttributeError: - return None, None - - return rating, votes - - # def extract_metacritic_rating(self, imdb_id, search_string, director, release_date): - # # bad request, on hold, need to use selenium - # url = 
self.metacritic_url_format.format(html.escape(search_string)) - # call_result = request.urlopen(url).read() - # soup = BeautifulSoup(call_result, "lxml") - # results = soup.find('li', {'class': 'result'}) - # print(results) - # pass - # - # def extract_rotten_tomatoes_rating(self, imdb_id): - # pass - # - # def extract_letterboxd_rating(self, movie_id): - # pass diff --git a/data/etl/movieshowing.py b/data/etl/movieshowing.py deleted file mode 100644 index 7fc7996..0000000 --- a/data/etl/movieshowing.py +++ /dev/null @@ -1,49 +0,0 @@ -from urllib import request, error -from bs4 import BeautifulSoup -from selenium import webdriver - - -import html - - -class MovieShowing: - - imdb_search_format = "http://www.imdb.com/find?&q={}" - - def __init__(self, cinema): - self.driver = webdriver.PhantomJS() - self.cinema_id, self.cinema_name, self.cinema_url = cinema - - def extract_cinema_schedule(self): - """retrieve one cinema schedule based on the given url, - return a list of dictionaries contains """ - # retrieve title, (type like 3D) and schedule time - print(self.cinema_url) - self.driver.get(self.cinema_url) - - # find imdb id - # create tuple cinema_id, movie_id, type, schedule - - def extract(self): - url = "http://www.imdb.com/find?&q=harry+potter+and+deathly+hallows" - soup = BeautifulSoup(request.urlopen(url).read().decode("utf-8"), "lxml") - anchors = soup.find_all("a") - for item in anchors: - try: - current_href = item['href'] - except KeyError: - continue - if "/title" in current_href: - print(current_href) - - def match(self): - print(self.build_search_url("Tu ying dang an")) - - def build_search_url(self, search_title): - search_query = html.escape(search_title.lower()) - return self.imdb_search_format.format(search_query) - - -if __name__ == '__main__': - app = MovieShowing() - app.match() diff --git a/data/etl/transformer.py b/data/etl/transformer.py deleted file mode 100644 index b21931b..0000000 --- a/data/etl/transformer.py +++ /dev/null @@ -1,77 +0,0 
@@ -import datetime - - -class Transformer: - - def __init__(self, logger): - self.logger = logger - - # ============== - # Movie Data - # ============== - @staticmethod - def split_release_and_country_imdb(release_country): - """ - given a string containing released date and country of a movie, return both fields - :param release_country: string - :return: string, string - """ - released, country = release_country.replace(")", "").split("(") - released = released.strip() # remove last white space - return released, country - - @staticmethod - def transform_time_imdb(runtime): - """ - given a string of time in various format from imdb, return in minutes - :param runtime: string - :return: string - """ - runtime = runtime.replace(" ", "").replace("min", "") - if "h" in runtime: - [hours, minutes] = runtime.split("h") - if minutes == "": - minutes = 0 - runtime = int(hours) * 60 + int(minutes) - return str(runtime) - - @staticmethod - def transform_date_imdb(input_text): - """ - given a date of string from imdb, return date in %Y-%m-%d format - :param input_text: string - :return: string - """ - length_of_date = len(input_text.split(" ")) - if length_of_date == 3: - input_text = datetime.datetime.strptime(input_text, '%d %B %Y').strftime('%Y-%m-%d') - elif length_of_date == 2: - input_text = datetime.datetime.strptime(input_text, '%B %Y').strftime('%Y-%m-%d') - elif length_of_date == 1: - if input_text == "": - return None - else: - input_text = datetime.datetime.strptime(input_text, '%Y').strftime('%Y-%m-%d') - return input_text - # ================ - # Movie Rating - # ================ - - @staticmethod - def movie_rating_votes(votes): - votes = votes.replace(",", "") - return votes - - @staticmethod - def is_hour(input_text): - if 'h' in input_text: - try: - time = int(input_text.strip().replace("h", "")) - except ValueError: - return False - return True - return False - - # ================ - # Now Showing - # ================ diff --git a/data/algo/__init__.py 
class MovieIDMatcher:
    """Match a cinema-listing movie title to an IMDb id.

    The title is submitted to the IMDb title search and the first few
    hits are filtered by the tags IMDb prints in brackets after each
    title (year, "Short", "TV Series", ...).
    """

    _IMDB_SEARCH_URL_FORMAT = "http://www.imdb.com/find?&q={}&s=tt&ttype=ft&exact=true"

    # Bracket tags containing any of these words mark a search hit that is
    # not a regular feature film (e.g. "Short", "TV Episode", "TV Series").
    _NON_FEATURE_MARKERS = ("Short", "TV")

    def __init__(self):
        self.driver = webdriver.PhantomJS()

    def match_imdb_id_for_cinema_schedule(self, title):
        """Return the most plausible IMDb id for a currently showing movie.

        Candidates are examined in the order returned by the IMDb search and
        the first acceptable one wins (see ``_is_plausible_showing``).

        :param title: movie title as shown on the cinema schedule
        :return: IMDb id string, or None when no candidate is acceptable
        """
        # The year does not change between candidates, so compute it once.
        current_year = datetime.now().year

        for movie_id, movie_text in self._extract_imdb_possible(title):
            _, infos = self._parse_imdb_search_text(movie_text)
            if self._is_plausible_showing(infos, current_year):
                return movie_id
        return None

    @classmethod
    def _is_plausible_showing(cls, infos, current_year):
        """Decide whether a search hit could be a movie showing in cinemas.

        A hit is accepted when it carries a year tag within one year of
        ``current_year``, or when none of its tags marks it as a short or a
        TV production.

        :param infos: list of bracket tags parsed from the search result
        :param current_year: reference year as an integer
        :return: True when the hit should be accepted
        """
        recent_years = {str(current_year + offset) for offset in (-1, 0, 1)}
        is_recent = any(tag in recent_years for tag in infos)
        # Substring test on each tag: the tags are e.g. "TV Episode", so an
        # exact-match (or identity) comparison would never fire.
        is_short_or_tv = any(
            marker in tag for tag in infos for marker in cls._NON_FEATURE_MARKERS
        )
        return is_recent or not is_short_or_tv

    def _extract_imdb_possible(self, title):
        """Search IMDb and return up to three ``(imdb_id, result_text)`` tuples.

        :param title: movie title to search for
        :return: list of at most three tuples, in search-result order
        """
        # Some listings write "Name : Subtitle"; drop the space before the colon.
        title = title.replace(" :", ":")
        search_query = self._imdb_search_query_builder(title)
        url = self._IMDB_SEARCH_URL_FORMAT.format(search_query)
        self.driver.get(url)

        possible_list = []
        for element in self.driver.find_elements_by_class_name("findResult"):
            td = element.find_element_by_class_name("result_text")
            # The id is taken as the fifth "/"-separated piece of the link.
            href = td.find_element_by_css_selector("a").get_attribute("href")
            possible_list.append((href.split("/")[4], td.text.strip()))

        return possible_list[:3]

    @staticmethod
    def _parse_imdb_search_text(text):
        """Split the text of one IMDb search result into titles and tags.

        The text looks like ``Title (2017) (Short) aka "Other Title"``.
        Two lists are returned: the titles found (the main one and any
        "aka" alternatives, with double quotes removed) and every tag that
        was written in brackets, such as the year or the type.

        :param text: text of one search result
        :return: (list of titles, list of tags)
        """
        title_list = []
        info_list = []

        # Collapse newlines and repeated blanks so that "aka" can be matched
        # as a separate word rather than as a substring of a title.
        normalised = " ".join(text.split())

        for segment in normalised.split(" aka "):
            segment = segment.strip()
            first_bracket_index = segment.find("(")

            if first_bracket_index == -1:
                # No bracket: the whole segment is the title. Slicing with
                # -1 here would silently cut off the last character.
                title_part, tag_part = segment, ""
            else:
                title_part = segment[:first_bracket_index]
                tag_part = segment[first_bracket_index:]

            title_list.append(title_part.strip().replace("\"", ""))

            # "(I) (2016)" -> ["(I", " (2016", ""] -> ["I", "2016"]
            tags = tag_part.split(")")[:-1]
            info_list.extend(tag.replace("(", "").strip() for tag in tags)
        return title_list, info_list

    @staticmethod
    def _build_soup(url):
        """Download ``url`` and return its content parsed with lxml."""
        soup = BeautifulSoup(request.urlopen(url).read().decode("utf-8"), "lxml")
        return soup

    @staticmethod
    def _imdb_search_query_builder(movie_title):
        """Return the search query for a title (currently just lower-cased)."""
        return movie_title.lower()
+ self.matcher = MovieIDMatcher() + + def test_extract_imdb_possible(self): + + def helper(title, expect_result): + matcher = MovieIDMatcher() + test_result = matcher._extract_imdb_possible(title) + self.assertEqual(test_result, expect_result) + + helper("Collide", [ + ("tt2126235", "Collide (I) (2016)"), + ("tt2834052", "Collide"), + ("tt1230120", "Collide (II) (2010)") + ]) + + helper("Cook up a storm", [ + ("tt6315750", "Cook Up a Storm (2017)") + ]) + + helper("Kung Fu Yoga", [ + ('tt4217392', 'Kung-Fu Yoga (2017)\naka "Kung Fu Yoga"') + ]) + + helper("The Lego Batman Movie", [ + ('tt4116284', 'The LEGO Batman Movie (2017)') + ]) + + helper("Rings", [ + ('tt0498381', 'Rings (2017)'), + ('tt0152191', 'Rings (1993)') + ]) + + helper("Hidden Figures", [ + ('tt4846340', 'Hidden Figures (2016)') + ]) + + helper("Sleepless", [ + ('tt2072233', 'Sleepless (III) (2017)'), + ('tt0220827', 'Sleepless (2001)'), + ('tt5039992', 'Sleepless (II) (2017)') + ]) + + helper("Fist Fight", [ + ('tt3401882', 'Fist Fight (2017)') + ]) + + helper("Siew Lup", [ + ('tt6550794', 'Siew Lup (2017)') + ]) + + helper("Jackie", [ + ('tt1619029', 'Jackie (V) (2016)'), + ('tt2108546', 'Jackie (II) (2012)'), + ('tt5249954', 'Jackie') + ]) + + helper("John Wick: Chapter 2", [ + ('tt4425200', 'John Wick: Chapter 2 (2017)') + ]) + + helper("John Wick : Chapter 2", [ + ('tt4425200', 'John Wick: Chapter 2 (2017)') + ]) + + helper("Resident Evil: The Final Chapter", [ + ('tt2592614', 'Resident Evil: The Final Chapter (2016)') + ]) + + def test_parse_imdb_search_text(self): + self.assertEqual( + self.matcher._parse_imdb_search_text("Collide (I) (2016)"), (["Collide"], ["I", "2016"])) + self.assertEqual( + self.matcher._parse_imdb_search_text("Collide (2017) (Short)"), (["Collide"], ["2017", "Short"])) + self.assertEqual( + self.matcher._parse_imdb_search_text("Cook Up a Storm (2017)"), (["Cook Up a Storm"], ["2017"])) + self.assertEqual( + self.matcher._parse_imdb_search_text("Cooking Up a Storm (2015) 
(TV Episode)"), + (["Cooking Up a Storm"], ["2015", "TV Episode"])) + self.assertEqual( + self.matcher._parse_imdb_search_text('The King of Queens (1998) (TV Series) aka "Kung av Queens"'), + (["The King of Queens", "Kung av Queens"], ["1998", "TV Series"])) + self.assertEqual( + self.matcher._parse_imdb_search_text('Kung-Fu Yoga (2017)\naka "Kung Fu Yoga"'), + (["Kung-Fu Yoga", "Kung Fu Yoga"], ["2017"])) + + def test_match_imdb_id(self): + + def helper(title, expect_result): + matcher = MovieIDMatcher() + test_result = matcher.match_imdb_id_for_cinema_schedule(title) + self.assertEqual(test_result, expect_result) + + helper("Collide", "tt2126235") + helper("Cook up a storm", "tt6315750") + helper("Kung Fu Yoga", 'tt4217392') + helper("The Lego Batman Movie", 'tt4116284') + helper("Rings", 'tt0498381') + + diff --git a/data/test/__init__.py b/data/public_data/__init__.py similarity index 100% rename from data/test/__init__.py rename to data/public_data/__init__.py diff --git a/data/public_data/cinema.py b/data/public_data/cinema.py new file mode 100644 index 0000000..323baa6 --- /dev/null +++ b/data/public_data/cinema.py @@ -0,0 +1,273 @@ +""" + This class retrieves movie schedule from different sources and + parse all data into required format +""" +from datetime import datetime +from bs4 import BeautifulSoup +from urllib import request +from selenium import webdriver +from string import capwords +from transformer import CinemaScheduleTransformer, GeneralTransformer + + +class CinemaList: + + GOLDEN_VILLAGE_LIST_HOME = "https://www.gv.com.sg/GVCinemas" + + CATHAY_LIST_HOME = "http://www.cathaycineplexes.com.sg/cinemas/" + + SHAW_BROTHER_LIST_HOME = "http://www.shaw.sg/sw_cinema.aspx" + + def __init__(self): + self.driver = webdriver.PhantomJS() + + def get_latest_cinema_list(self): + """ + return the latest cinema list to the processor in the format of + [{ + "url": ... + "cinema_name: ... + "provider": ... + }, { + "url": ... + "cinema_name: ... 
+ "provider": ... + }] + :return: list + """ + cinema_list = [] + cinema_list.extend(self._extract_cathay_cinema_list()) + cinema_list.extend(self._extract_sb_cinema_list()) + cinema_list.extend(self._extract_gv_cinema_list()) + return cinema_list + + def _extract_gv_cinema_list(self): + """ + return a list of dictionaries contain all Golden Village + cinema names, and their corresponding url. + """ + url = self.GOLDEN_VILLAGE_LIST_HOME + + cinema_list = [] + + # get raw cinema list + raw_cinema_url = [] + self.driver.get(url) + anchors = self.driver.find_element_by_class_name("cinemas-list").find_elements_by_class_name("ng-binding") + for anchor in anchors: + raw_cinema_url.append(anchor.get_attribute("href")) + + # get actual list, in each url it may contain more than one cinema + for cinema_url in raw_cinema_url: + self.driver = webdriver.PhantomJS() # reinstantiate to avoid detach from DOM + self.driver.get(cinema_url) + div = self.driver.find_elements_by_class_name("ng-binding") + for item in div: + if item.get_attribute("ng-bind-html") == "cinema.name": + cinema_name = item.text + self.insert_cinema_data(cinema_list, cinema_name, cinema_url, "gv") + return cinema_list + + def _extract_cathay_cinema_list(self): + """Get a list of dictionaries contain all cathay cinema names. + It's corresponding url is None because cathay does not show movies + schedule based on individual cinemas in their web page layouts. 
+ """ + cinema_list = [] + + url = self.CATHAY_LIST_HOME + web_content = request.urlopen(url).read().decode("utf-8") + soup = BeautifulSoup(web_content, "lxml") + divs = soup.find_all("div", {"class": "description"}) + for div in divs: + cinema_name = capwords(div.find("h1").text) + self.insert_cinema_data(cinema_list, cinema_name, "http://www.cathaycineplexes.com.sg/showtimes/", "cathay") + return cinema_list + + def _extract_sb_cinema_list(self): + """Get a list of dictionaries contain all SB cinema names, + and their corresponding urls + """ + name_list = [] + url_list = [] + cinema_list = [] + + # get names + url = self.SHAW_BROTHER_LIST_HOME + web_content = request.urlopen(url).read().decode("utf-8") + soup = BeautifulSoup(web_content, "lxml") + divs = soup.find_all("a", {"class": "txtHeaderBold"}) + for div in divs: + name_list.append(div.text) + + # get url + buy_tickets = soup.find_all("a", {"class": "txtNormalDim"}) + for item in buy_tickets: + current_link = item["href"] + if "buytickets" in current_link: + url_list.append("http://" + "www.shaw.sg/" + item["href"]) + + assert len(name_list) == len(url_list) # check whether there is mistake in matching cinema name and url + + for i in range(len(name_list)): + self.insert_cinema_data(cinema_list, name_list[i], url_list[i], "sb") + return cinema_list + + @staticmethod + def insert_cinema_data(cinema_list, cinema_name, cinema_url, provider): + inserted_tuple = { + "url": cinema_url, + "cinema_name": cinema_name, + "provider": provider + } + cinema_list.append(inserted_tuple) + + +class CinemaSchedule: + """ + This class handles all operations related to the extraction + of movie schedules in cinemas + """ + def __init__(self, cinema_name, cinema_url, cinema_provider): + self.driver = webdriver.PhantomJS() + self.driver.set_window_size(1124, 850) # set browser size + + self.cinema_name = cinema_name + self.cinema_url = cinema_url + self.provider = cinema_provider + + def extract_cinema_schedule(self): + """ + 
it will auto select the extract method based on the url + or cinema name given, return the formatted data object + that can be used by Loader + :return: a list of dictionary + """ + if self.provider == "gv": + cinema_object = self._extract_golden_village() + elif self.provider == "sb": + cinema_object = self._extract_shaw_brother() + elif self.provider == "cathay": + cinema_object = self._extract_cathay() + else: + raise Exception("Invalid Cinema provider") + + return CinemaScheduleTransformer.parse_cinema_object_to_data(cinema_object) + + def _extract_golden_village(self): + self.driver.get(self.cinema_url) + # retrieve title, (type like 3D) and schedule time raw data + tabs = self.driver.find_elements_by_class_name("ng-binding") + + cinema_schedule = {} + date_counter = 0 + for tab in tabs: + if tab.get_attribute("ng-bind-html") == "day.day": + current_date = GeneralTransformer.get_singapore_date(date_counter) + if tab.text == "Advance Sales": # reach the end of tabs + break + + tab.click() + rows = self.driver.find_elements_by_class_name("row") + + for row in rows: + # get movie title + current_title = None + current_time = [] + + # get movie title + anchors = row.find_elements_by_class_name("ng-binding") + for anchor in anchors: + if anchor.get_attribute("ng-bind-html") == "getFilmTitle(movie)": + current_title = anchor.text + + # get movie schedule + buttons = row.find_elements_by_css_selector("button") + for button in buttons: + if button.get_attribute("ng-bind-html") == "time.time": + current_time.append(current_date + " " + + GeneralTransformer.convert_12_to_24_hour_time(button.text)) + + # store + if current_title is not None: + if current_title in cinema_schedule: + cinema_schedule[current_title].extend(current_time) + else: + cinema_schedule[current_title] = current_time + + date_counter += 1 + return cinema_schedule + + def _extract_cathay(self): + self.driver.get(self.cinema_url) + cathay_id = 
CinemaScheduleTransformer.get_id_from_cathay_cinema_name(self.cinema_name) + outer_div = self.driver.find_element_by_id("ContentPlaceHolder1_wucST{}_tabs".format(cathay_id)) + tabbers = outer_div.find_elements_by_class_name("tabbers") + + date_counter = 0 + cinema_schedule = {} + for tabber in tabbers: # for each day + current_date = GeneralTransformer.get_singapore_date(date_counter) + rows = tabber.find_elements_by_class_name("movie-container") + for row in rows: + try: + row_content = row.get_attribute("innerHTML") + soup = BeautifulSoup(row_content, "lxml") + current_title = soup.find("strong").text + + current_time = [] + times = soup.find_all("a", {"class": "cine_time"}) + for show_time in times: + current_time.append(current_date + " " + show_time.text + ":00") + + if current_title is not None: + if current_title in cinema_schedule: + cinema_schedule[current_title].extend(current_time) + else: + cinema_schedule[current_title] = current_time + except AttributeError: + break + + date_counter += 1 + return cinema_schedule + + def _extract_shaw_brother(self): + self.driver.get(self.cinema_url) + show_dates = [] + options = self.driver.find_element_by_id("ctl00_Content_ddlShowDate").find_elements_by_css_selector( + "option") + for show_date in options: + show_dates.append(show_date.get_attribute("value")) + + cinema_schedule = {} + for show_date in show_dates: # each day + current_date = datetime.strptime(show_date, "%m/%d/%Y").strftime("%Y-%m-%d") + self.driver.find_element_by_xpath( + "//select[@id='ctl00_Content_ddlShowDate']/option[@value='{}']".format(show_date)).click() + rows = self.driver.find_elements_by_class_name("panelSchedule") + for row in rows[2:]: # remove table header + current_title, schedule = row.text.strip().split("\n", 1) + if "PM" in schedule or "AM" in schedule: + # title + current_title = current_title.split(" ")[1] + + # time + current_time = [] + schedule = schedule.replace("+", "").replace("*", "") + schedule = schedule.replace(" PM", 
"PM").replace(" AM", "AM").replace("\n", " ") + if "(" in schedule: + bracket_index = schedule.find("(") + schedule = schedule[:bracket_index] # remove anything behind bracket + schedule = schedule.split(" ") + + for item in schedule: + if item != "": + current_time.append(current_date + " " + + GeneralTransformer.convert_12_to_24_hour_time(item)) + + if current_title is not None: + if current_title in cinema_schedule: + cinema_schedule[current_title].extend(current_time) + else: + cinema_schedule[current_title] = current_time + return cinema_schedule diff --git a/data/public_data/config.py b/data/public_data/config.py new file mode 100644 index 0000000..8dec046 --- /dev/null +++ b/data/public_data/config.py @@ -0,0 +1,15 @@ +import psycopg2 + + +def database_connection(): + try: + connect_str = "dbname='production' " \ + "user='postgres' " \ + "host='128.199.231.190' " + \ + "password=''" + + conn = psycopg2.connect(connect_str) + cursor = conn.cursor() + return cursor, conn + except Exception as e: + print(e) diff --git a/data/public_data/controller.py b/data/public_data/controller.py new file mode 100644 index 0000000..82b5621 --- /dev/null +++ b/data/public_data/controller.py @@ -0,0 +1,196 @@ +""" + Core objective of this etl framework. This is the highest level API. + + Each one of them will be run in backend on server, at designated + time intervals. + + It includes four main methods in total: + 1. update movie data + 2. update movie public rating + 3. update the list of cinemas in Singapore + 4. 
update cinema schedule for each cinema available +""" +from cinema import CinemaList, CinemaSchedule +from movie import MovieData, MovieRating +from loader import Loader +from movie_id_matcher.matcher import MovieIDMatcher +from urllib import error +from transformer import GeneralTransformer +from http import client + +import utils +import time +import logging +import psycopg2 + + +class ETLController: + + def __init__(self): + self.loader = Loader() + + def update_movie_data(self, lower, upper, delay): + """ + updates movie data from IMDb + :param lower: integer + :param upper: integer + :param delay: integer + :return: None + """ + logging.warning("Initialise movie data retrieval process ...") + logging.warning("Range: " + str(lower) + " to " + str(upper) + ", starting in " + str(delay) + "s ...") + + time.sleep(delay) # delay to avoid database transaction lock during multi-thread process + existing_movies_id = self.loader.get_movie_id_list() + + for index in range(lower, upper): # iterate all possible titles + current_imdb_id = GeneralTransformer.build_imdb_id(index) + + if index % 1000 == 0: # id monitor + logging.warning("Currently at: " + current_imdb_id) + + if current_imdb_id in existing_movies_id: + continue + + try: + self._update_single_movie_data(current_imdb_id) + except error.HTTPError: # invalid id will cause an 404 error + continue + except utils.InvalidMovieTypeException: # ignore all non-movie types + continue + except psycopg2.InterfaceError: # database connection lost after a long time + logging.error("Reestablishing database connection") + self.loader = Loader() + continue + except ConnectionResetError or TimeoutError or client.IncompleteRead: + logging.error("Connection reset by remote host, reconnecting in 5s ...") + time.sleep(5) + + # try again + try: + self._update_single_movie_data(current_imdb_id) + except: # skip any error + continue + except Exception as e: # unknown error + logging.error("Unknown error occurs. 
Please examine.") + logging.error(e) + logging.error(current_imdb_id) + + logging.warning("Movie data update process complete.") + + def update_movie_rating(self): + """ + updates movie rating from various websites + """ + logging.warning("Initialise movie rating update process ...") + + existing_movies_id = self.loader.get_movie_id_list() + for current_id in existing_movies_id: + self._update_single_movie_rating(current_id) + + logging.warning("Movie rating update process complete.") + + def update_cinema_list(self): + """ + Update cinema list from various theatres websites + :return: None + """ + logging.warning("Initialise cinema list update process ...") + + cinema_list_object = CinemaList() + cinema_list = cinema_list_object.get_latest_cinema_list() + self.loader.load_cinema_list(cinema_list) + + logging.warning("Cinema list update process complete.") + + def update_cinema_schedule(self): + """ + Update latest cinema schedule from cinema list. + + It passes an empty dictionary to each cinema schedule object, + every iteration it will append that cinema's schedule to the + dictionary. + + IMDb ID is obtained using MovieMatcher module in the process. + + + The dictionary should be structured using title and imdb_id + as the top level keys, follow by other data. + + { + title: { + "imdb_id": ... + "content": [{ + "cinema_id": ... + "schedule": [...] + "type": ... 
+ } + ] + } + } + """ + logging.warning("Initialise cinema schedule update process ...") + + cinema_schedule_data = {} + + # retrieve schedule + cinema_list = self.loader.get_cinema_list() + self._cinema_schedule_retrieve(cinema_list, cinema_schedule_data) + + # match id and check existence + matcher = MovieIDMatcher() + for title, content in cinema_schedule_data.items(): + imdb_id = matcher.match_imdb_id_for_cinema_schedule(title) + content['imdb_id'] = imdb_id + self.movie_list = self.loader.get_movie_id_list() + self._update_single_movie_data(imdb_id) + + # load data + self.loader.load_cinema_schedule(cinema_schedule_data) + + logging.warning("Cinema schedule update process complete.") + + def _update_single_movie_data(self, imdb_id): + """ + given imdb id, extract movie data and store it in database + :param imdb_id: string + :return: None + """ + data_model = MovieData(imdb_id) + current_movie_data = data_model.get_movie_data() + self.loader.load_movie_data(current_movie_data) + + def _update_single_movie_rating(self, current_id): + """ + given imdb id, extract movie ratings from various sources and + store them in database + :param current_id: string + :return: None + """ + data_model = MovieRating(current_id) + movie_rating = data_model.get_movie_ratings() + self.loader.load_movie_rating(movie_rating) + + @staticmethod + def _cinema_schedule_retrieve(cinema_list, cinema_schedule_data): + for cinema in cinema_list: + cinema_id, cinema_name, provider, cinema_url = cinema + cinema_schedule = CinemaSchedule(cinema_name, cinema_url, provider) + current_schedules = cinema_schedule.extract_cinema_schedule() + + # parse schedules and update data + for movie in current_schedules: + current_title = movie['title'] + if movie['title'] not in cinema_schedule_data: + cinema_schedule_data[current_title] = {} + current_title = cinema_schedule_data[current_title] + current_title['content'] = [] + else: + current_title = cinema_schedule_data[current_title] + + del 
movie['title'] + movie['cinema_id'] = cinema_id + current_title['content'].append(movie) + + + diff --git a/data/public_data/loader.py b/data/public_data/loader.py new file mode 100644 index 0000000..336e466 --- /dev/null +++ b/data/public_data/loader.py @@ -0,0 +1,106 @@ +"""handles all interactions with database""" +import logging + +import psycopg2 + +import config + + +class Loader: + + def __init__(self): + self.cursor, self.conn = config.database_connection() + + # ======== + # LOAD + # ======== + def load_movie_data(self, movie_data): + """ + load movie data into database, if movie_id exists, it will update accordingly + :param movie_data: dictionary + :return: None + """ + if movie_data['type'] != "movie": # does not store any non movie content + return + + self.cursor.execute("INSERT INTO movies (movie_id, title, production_year, rated, plot, actors, " + "language, country, runtime, poster_url, genre, director, released, type) " + "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) " + "ON CONFLICT (movie_id) " + "DO UPDATE SET (title, production_year, rated, plot, actors, " + "language, country, runtime, poster_url, genre, director, released, type) = " + "(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" + "WHERE movies.movie_id=%s", + (movie_data['movie_id'], movie_data['title'], movie_data['production_year'], + movie_data['rated'], movie_data['plot'], movie_data['actors'], movie_data['language'], + movie_data['country'], movie_data['runtime'], movie_data['poster_url'], + movie_data['genre'], movie_data['director'], movie_data['released'], + movie_data['type'], + movie_data['title'], movie_data['production_year'], + movie_data['rated'], movie_data['plot'], movie_data['actors'], movie_data['language'], + movie_data['country'], movie_data['runtime'], movie_data['poster_url'], + movie_data['genre'], movie_data['director'], movie_data['released'], + movie_data['type'], + movie_data['movie_id'])) + self.conn.commit() + + def 
load_movie_rating(self, movie_ratings): + for movie_rating in movie_ratings: + self.cursor.execute("INSERT INTO public_ratings (vote, score, movie_id, source_id) VALUES (%s, %s, %s, %s) " + "ON CONFLICT (movie_id, source_id) " + "DO UPDATE SET (vote, score) = (%s, %s) " + "WHERE public_ratings.movie_id=%s AND public_ratings.source_id=%s", + (movie_rating['votes'], movie_rating['score'], movie_rating['movie_id'], + movie_rating['source_id'], movie_rating['votes'], movie_rating['score'], + movie_rating['movie_id'], movie_rating['source_id'])) + self.conn.commit() + + def load_cinema_list(self, cinema_list): + for cinema in cinema_list: + self.cursor.execute("INSERT INTO cinemas (cinema_name, url, provider) VALUES (%s, %s, %s) " + "ON CONFLICT (cinema_name) " + "DO UPDATE SET (cinema_name, url, provider) = (%s, %s, %s)" + "WHERE cinemas.cinema_name=%s", + (cinema['cinema_name'], cinema['url'], cinema['provider'], cinema['cinema_name'], + cinema['url'], cinema['provider'], cinema['cinema_name'])) + + self.conn.commit() + + def load_cinema_schedule(self, cinema_schedule): + for title, cinema_content in cinema_schedule.items(): + movie_id = cinema_content['imdb_id'] + for cinema in cinema_content['content']: + cinema_id = cinema['cinema_id'] + additional_info = cinema['type'] + schedule_list = cinema['schedule'] + for timing in schedule_list: + try: + self.cursor.execute("INSERT INTO showings (cinema_id, movie_id, type, schedule) " + "VALUES (%s, %s, %s, %s)", (cinema_id, movie_id, additional_info, timing)) + except psycopg2.IntegrityError: + continue + except psycopg2.InternalError: + continue + self.conn.commit() + + # ======== + # GET + # ======== + def get_movie_id_list(self): + self.cursor.execute("SELECT movie_id FROM movies") + data_object = self.cursor.fetchall() + id_list = [] + for item in data_object: + id_list.append(item[0]) + return id_list + + def get_movie_validation_info(self, movie_id): + self.cursor.execute("SELECT title, released, director FROM movies 
WHERE movie_id=%s", (movie_id, )) + data_object = self.cursor.fetchone() + return data_object + + def get_cinema_list(self): + self.cursor.execute("SELECT * FROM cinemas") + data_object = self.cursor.fetchall() + return data_object + diff --git a/data/public_data/movie.py b/data/public_data/movie.py new file mode 100644 index 0000000..7d4a5a1 --- /dev/null +++ b/data/public_data/movie.py @@ -0,0 +1,322 @@ +from bs4 import BeautifulSoup +from urllib import request, error + +import html +import utils +import json + + +class MovieData: + """ + This class handles all operations related to movie data + extraction + """ + + title = None + production_year = None + rated = None + plot = None + actors = None + language = None + country = None + genre = None + poster_url = None + released = None + runtime = None + director = None + type = None + subtext = None + soup = None + + def __init__(self, imdb_id): + """ + It takes an imdb_id to instantiate a MovieData object, upon instantiation, + it will get relevant html content and store as instance attribute + :param imdb_id: + """ + self.imdb_id = imdb_id + if imdb_id != "mock-id": # special identifier for test cases. i.e. 
normal instantiation + self._build_soup(self._get_html_content()) + self._extract_process() + + def get_movie_data(self): + """ + return a dict that contains all data to extractor + :return: dictionary of data in various type + """ + movie_data = utils.get_movie_data_dict(self.actors, self.country, self.director, self.genre, self.imdb_id, + None, self.plot, self.poster_url, self.production_year, self.rated, + self.released, self.runtime, self.title, self.type) + return movie_data + + def _extract_process(self): + """ + main logic for extraction of imdb data + :return: + """ + self._extract_subtext() + self._extract_release() + self._extract_rated() + self._extract_genre() + self._extract_release() + self._extract_runtime() + self._extract_title_and_year() + self._extract_poster() + self._extract_credits() + self._extract_plot() + + def _get_html_content(self): + """ + get html source based on imdb_id + :return: string + """ + url = utils.UrlFormatter.IMDB_URL_FORMAT.value.format(self.imdb_id) + request_result = html.unescape(request.urlopen(url).read().decode("utf-8")) + return request_result + + def _build_soup(self, request_result): + """ + build soup based on html content in string format + :param request_result: + :return: None + """ + self.soup = BeautifulSoup(request_result, "lxml") # soup builder + + def _build_soup_for_test(self, html_file_io_wrapper): + """ + build soup based on imported html source code file + :param html_file_io_wrapper: + :return: None + """ + self.soup = BeautifulSoup(html_file_io_wrapper, "lxml") + + def _extract_title_and_year(self): + """ + return title and production year of a movie + :return: string or None, int or None + """ + title_wrapper = self.soup.find("h1").text.split("\xa0") + self.title = title_wrapper[0] + self.production_year = title_wrapper[1].replace("(", "").replace(")", "").replace(" ", "") + if self.production_year == "": + self.production_year = None + return self.title, self.production_year + return self.title, 
int(self.production_year) + + def _extract_poster(self): + """ + return the url of poster of one movie + :return: string or None + """ + poster = self.soup.find("div", {"class": "poster"}) + try: + self.poster_url = poster.find("img")['src'] + except AttributeError: + self.poster_url = None + return self.poster_url + + def _extract_credits(self): + """ + return the directors and actors of the movie. If there is more than + one director or actor, it will display a string with multiple tokens, + separated by comma + :return: string or None, string or None + """ + credits_text = self.soup.find_all("div", {"class": "credit_summary_item"}) + for item in credits_text: + current_text = item.text + if "Directors:" in current_text: + self.director = current_text.replace("Directors:", "").split("|")[0]\ + .replace("\n", "").replace(" ", "").strip() + elif "Director:" in current_text: + self.director = current_text.replace("Director:", "").strip() + elif "Stars" in current_text: + self.actors = current_text.replace("Stars:", "").split("|")[0]\ + .replace("\n", "").replace(" ", "").strip() + elif "Star" in current_text: + self.actors = current_text.replace("Star:", "").strip() + return self.actors, self.director + + def _extract_plot(self): + """ + return the plot of one movie + :return: string or None + """ + try: + self.plot = self.soup.find("div", {"class": "summary_text"}).text.replace("\n", "").strip().split(" ")[0] + except AttributeError: + self.plot = None + + if self.plot is not None and "Add a Plot" in self.plot: + self.plot = None + return self.plot + + def _extract_subtext(self): + """ + retrieve the subtext tag for other extraction nodes + :return: None + """ + self.subtext = self.soup.find("div", {"class": "subtext"}) + + def _extract_rated(self): + """ + return the rating(i.e. 
PG, R, M) of a movie + Not to confused with user rating + :return: string or None + """ + metas = self.subtext.find_all("meta") + for meta in metas: + if meta['itemprop'] == "contentRating": + self.rated = meta['content'] + return self.rated + + def _extract_release(self): + """ + parse the last token in subtext element, + determine the release date and country + If it is not a movie, raise an exception + :return: datetime or None, string or None, string + """ + self.type = 'movie' # default movie type + anchors = self.subtext.find_all("a") + for anchor in anchors: + if anchor.has_attr('title'): + release_text = anchor.text + if "Episode aired" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "TV Series" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "TV Episode" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "TV Special" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "Video Game" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "Video game released" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "Video" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "TV Mini-Series" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "TV Movie" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + elif "TV Short" in release_text: + raise utils.InvalidMovieTypeException("Invalid movie type.") + release_text = release_text.replace("\n", "").strip() + self.released, self.country = utils.split_release_and_country_imdb(release_text) + self.released = utils.transform_date_imdb(self.released) + return self.released, self.country, self.type + + def _extract_genre(self): + """ + parse the html content and return the genre of the movie + :return: string or 
None + """ + genre_list = [] + spans = self.subtext.find_all("span", {"class": "itemprop"}) + for span in spans: + genre_list.append(span.text) + if len(genre_list) > 0: + self.genre = ", ".join(genre_list) + return self.genre + + def _extract_runtime(self): + """ + parse the html content and return the runtime of the movie + :return: int or None + """ + time_tag = self.subtext.find("time") + try: + time_text = time_tag['datetime'] + self.runtime = int(time_text.replace("PT", "").replace("M", "").replace(",", "")) + except TypeError: + return None + return self.runtime + + +class MovieRating: + + TRAKT_HEADER = { + 'Content-Type': 'application/json', + 'trakt-api-version': '2', + 'trakt-api-key': '411a8f0219456de5e3e10596486c545359a919b6ebb10950fa86896c1a8ac99b' + } + + imdb_url_format = "http://www.imdb.com/title/{}/" + + douban_url_format = "https://movie.douban.com/subject_search?search_text={}" + + metacritic_url_format = "http://www.metacritic.com/search/movie/{}/results" + + def __init__(self, movie_id): + self.movie_id = movie_id + + def get_movie_ratings(self): + """ + get a list of votes and ratings from each source + :return: list + """ + movie_ratings = [] + + rating, votes = self._extract_trakt_rating() + movie_ratings.append(utils.get_movie_rating_dict(rating, votes, self.movie_id, 'Trakt')) + + rating, votes = self._extract_imdb_rating() + movie_ratings.append(utils.get_movie_rating_dict(rating, votes, self.movie_id, 'IMDb')) + + rating, votes = self._extract_douban_rating() + movie_ratings.append(utils.get_movie_rating_dict(rating, votes, self.movie_id, 'Douban')) + return movie_ratings + + def _extract_trakt_rating(self): + """ + given imdb_id, return the current rating and total number of votes of this movie in trakt.tv database + :return: string or None, string or None + """ + request_result = request.Request('https://api.trakt.tv/movies/{}/ratings'.format(self.movie_id), + headers=self.TRAKT_HEADER) + try: + json_result = 
json.loads(request.urlopen(request_result).read().decode("utf-8")) + except error.HTTPError: + return None, None + + return str(json_result['rating']), str(json_result['votes']) + + def _extract_imdb_rating(self): + """ + given imdb_id, return the current rating and total number of votes of this movie in imdb database + :return: string or None, string or None + """ + url = self.imdb_url_format.format(self.movie_id) + request_result = request.urlopen(url).read() + soup = BeautifulSoup(request_result, "lxml") + div = soup.find('div', {'class': 'ratingValue'}) + + try: + parse_list = div.find("strong")['title'].split(" based on ") + except AttributeError: + return None, None + + rating = parse_list[0] + votes = parse_list[1].split(" ")[0].replace(",", "") + return rating, votes + + def _extract_douban_rating(self): + """ + given imdb_id, return the current rating and total number of votes of this movie in douban database + :return: string or None, string or None + """ + url = self.douban_url_format.format(self.movie_id) + request_result = request.urlopen(url).read() + soup = BeautifulSoup(request_result, "lxml") + + try: + rating = soup.find("span", {'class': 'rating_nums'}).text + + # remove parenthesis and words + votes = soup.find("span", {'class': 'pl'}).text.replace("人评价","")[1: -1].replace(",", "") + except AttributeError: + return None, None + + return rating, votes + diff --git a/data/public_data/test/__init__.py b/data/public_data/test/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/public_data/test/data_movie_data/__init__.py b/data/public_data/test/data_movie_data/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/test/test_data_moviedata/get_html_source.py b/data/public_data/test/data_movie_data/get_html_source.py similarity index 100% rename from data/test/test_data_moviedata/get_html_source.py rename to data/public_data/test/data_movie_data/get_html_source.py diff --git 
a/data/test/test_data_moviedata/tt0000001.html b/data/public_data/test/data_movie_data/tt0000001.html similarity index 100% rename from data/test/test_data_moviedata/tt0000001.html rename to data/public_data/test/data_movie_data/tt0000001.html diff --git a/data/test/test_data_moviedata/tt0000004.html b/data/public_data/test/data_movie_data/tt0000004.html similarity index 100% rename from data/test/test_data_moviedata/tt0000004.html rename to data/public_data/test/data_movie_data/tt0000004.html diff --git a/data/test/test_data_moviedata/tt0000007.html b/data/public_data/test/data_movie_data/tt0000007.html similarity index 100% rename from data/test/test_data_moviedata/tt0000007.html rename to data/public_data/test/data_movie_data/tt0000007.html diff --git a/data/test/test_data_moviedata/tt0000012.html b/data/public_data/test/data_movie_data/tt0000012.html similarity index 100% rename from data/test/test_data_moviedata/tt0000012.html rename to data/public_data/test/data_movie_data/tt0000012.html diff --git a/data/test/test_data_moviedata/tt0000019.html b/data/public_data/test/data_movie_data/tt0000019.html similarity index 100% rename from data/test/test_data_moviedata/tt0000019.html rename to data/public_data/test/data_movie_data/tt0000019.html diff --git a/data/test/test_data_moviedata/tt0000025.html b/data/public_data/test/data_movie_data/tt0000025.html similarity index 100% rename from data/test/test_data_moviedata/tt0000025.html rename to data/public_data/test/data_movie_data/tt0000025.html diff --git a/data/test/test_data_moviedata/tt0000399.html b/data/public_data/test/data_movie_data/tt0000399.html similarity index 100% rename from data/test/test_data_moviedata/tt0000399.html rename to data/public_data/test/data_movie_data/tt0000399.html diff --git a/data/test/test_data_moviedata/tt0000481.html b/data/public_data/test/data_movie_data/tt0000481.html similarity index 100% rename from data/test/test_data_moviedata/tt0000481.html rename to 
data/public_data/test/data_movie_data/tt0000481.html diff --git a/data/test/test_data_moviedata/tt0000502.html b/data/public_data/test/data_movie_data/tt0000502.html similarity index 100% rename from data/test/test_data_moviedata/tt0000502.html rename to data/public_data/test/data_movie_data/tt0000502.html diff --git a/data/test/test_data_moviedata/tt0000869.html b/data/public_data/test/data_movie_data/tt0000869.html similarity index 100% rename from data/test/test_data_moviedata/tt0000869.html rename to data/public_data/test/data_movie_data/tt0000869.html diff --git a/data/test/test_data_moviedata/tt0001304.html b/data/public_data/test/data_movie_data/tt0001304.html similarity index 100% rename from data/test/test_data_moviedata/tt0001304.html rename to data/public_data/test/data_movie_data/tt0001304.html diff --git a/data/test/test_data_moviedata/tt0010781.html b/data/public_data/test/data_movie_data/tt0010781.html similarity index 100% rename from data/test/test_data_moviedata/tt0010781.html rename to data/public_data/test/data_movie_data/tt0010781.html diff --git a/data/test/test_data_moviedata/tt0030298.html b/data/public_data/test/data_movie_data/tt0030298.html similarity index 100% rename from data/test/test_data_moviedata/tt0030298.html rename to data/public_data/test/data_movie_data/tt0030298.html diff --git a/data/test/test_data_moviedata/tt0039445.html b/data/public_data/test/data_movie_data/tt0039445.html similarity index 100% rename from data/test/test_data_moviedata/tt0039445.html rename to data/public_data/test/data_movie_data/tt0039445.html diff --git a/data/test/test_data_moviedata/tt0039624.html b/data/public_data/test/data_movie_data/tt0039624.html similarity index 100% rename from data/test/test_data_moviedata/tt0039624.html rename to data/public_data/test/data_movie_data/tt0039624.html diff --git a/data/test/test_data_moviedata/tt0395865.html b/data/public_data/test/data_movie_data/tt0395865.html similarity index 100% rename from 
data/test/test_data_moviedata/tt0395865.html rename to data/public_data/test/data_movie_data/tt0395865.html diff --git a/data/test/test_data_moviedata/tt0460648.html b/data/public_data/test/data_movie_data/tt0460648.html similarity index 100% rename from data/test/test_data_moviedata/tt0460648.html rename to data/public_data/test/data_movie_data/tt0460648.html diff --git a/data/test/test_data_moviedata/tt1234567.html b/data/public_data/test/data_movie_data/tt1234567.html similarity index 100% rename from data/test/test_data_moviedata/tt1234567.html rename to data/public_data/test/data_movie_data/tt1234567.html diff --git a/data/test/test_data_moviedata/tt2345678.html b/data/public_data/test/data_movie_data/tt2345678.html similarity index 100% rename from data/test/test_data_moviedata/tt2345678.html rename to data/public_data/test/data_movie_data/tt2345678.html diff --git a/data/test/test_data_moviedata/tt3107288.html b/data/public_data/test/data_movie_data/tt3107288.html similarity index 100% rename from data/test/test_data_moviedata/tt3107288.html rename to data/public_data/test/data_movie_data/tt3107288.html diff --git a/data/test/test_data_moviedata/tt3783958.html b/data/public_data/test/data_movie_data/tt3783958.html similarity index 100% rename from data/test/test_data_moviedata/tt3783958.html rename to data/public_data/test/data_movie_data/tt3783958.html diff --git a/data/test/test_data_moviedata/tt4346792.html b/data/public_data/test/data_movie_data/tt4346792.html similarity index 100% rename from data/test/test_data_moviedata/tt4346792.html rename to data/public_data/test/data_movie_data/tt4346792.html diff --git a/data/public_data/test/test_cinema_schedule.py b/data/public_data/test/test_cinema_schedule.py new file mode 100644 index 0000000..960c271 --- /dev/null +++ b/data/public_data/test/test_cinema_schedule.py @@ -0,0 +1,53 @@ +from cinema import CinemaSchedule, CinemaList + +import unittest + + +class TestCinemaSchedule(unittest.TestCase): + + def 
setUp(self): + self.gv_schedule = CinemaSchedule(('1', 'GV Tiong Bahru', 'https://www.gv.com.sg/GVCinemaDetails#/cinema/03', "gv")) + self.cathay_schedule = CinemaSchedule(('32', 'The Cathay Cineplex', 'http://www.cathaycineplexes.com.sg/showtimes/', "cathay")) + self.shaw_schedule = CinemaSchedule(('39', 'Shaw Theatres Lido', 'http://www.shaw.sg/sw_buytickets.aspx?' + 'filmCode=&cplexCode=30 210 236 39 155 56 75 124 ' + '123 77 76 246 36 85 160 0&date=', "sb")) + self.cinema_list = CinemaList() + + def test_convert_12_to_24_hour(self): + # to be added in more test cases + self.assertEqual(self.gv_schedule._convert_12_to_24_hour_time("8:25pm"), "20:25:00") + + def test_movie_title_parser(self): + # gv + self.assertEqual(self.gv_schedule._movie_title_parser("Logan*"), ("Logan", ["No free pass"])) + + # cathay + self.assertEqual(self.cathay_schedule._movie_title_parser("*Hidden Figures PG (Dolby Digital)"), + ("Hidden Figures", ["Dolby Digital"])) + self.assertEqual(self.cathay_schedule._movie_title_parser("*T2 Trainspotting R21 (Dolby Digital)"), + ("T2 Trainspotting", ["Dolby Digital"])) + self.assertEqual(self.cathay_schedule._movie_title_parser("Fifty Shades Darker R21 (Dolby Digital)"), + ("Fifty Shades Darker", ["Dolby Digital"])) + self.assertEqual(self.cathay_schedule._movie_title_parser("John Wick : Chapter 2 M18 (Dolby Digital)"), + ("John Wick : Chapter 2", ["Dolby Digital"])) + self.assertEqual(self.cathay_schedule._movie_title_parser("*Before I Fall PG13 (Dolby Digital)"), + ("Before I Fall", ["Dolby Digital"])) + + # shaw + self.assertEqual(self.shaw_schedule._movie_title_parser("Logan [D]"), + ("Logan", ["Digital"])) + self.assertEqual(self.shaw_schedule._movie_title_parser("Siew Lup [M] [D]"), + ("Siew Lup", ['Digital'])) + self.assertEqual(self.shaw_schedule._movie_title_parser("Jackie [D]"), + ("Jackie", ["Digital"])) + self.assertEqual(self.shaw_schedule._movie_title_parser("Hidden Figures [D]"), + ("Hidden Figures", ["Digital"])) + 
self.assertEqual(self.shaw_schedule._movie_title_parser("Logan [IMAX]"), + ("Logan", ["IMAX"])) + self.assertEqual(self.shaw_schedule._movie_title_parser("John Wick: Chapter 2 [D]"), + ("John Wick: Chapter 2", ["Digital"])) + self.assertEqual(self.shaw_schedule._movie_title_parser("The Lego Batman Movie [D]"), + ("The Lego Batman Movie", ["Digital"])) + + + diff --git a/data/test/test_moviedata.py b/data/public_data/test/test_movie_data.py similarity index 58% rename from data/test/test_moviedata.py rename to data/public_data/test/test_movie_data.py index 102f00e..f4c605b 100644 --- a/data/test/test_moviedata.py +++ b/data/public_data/test/test_movie_data.py @@ -1,7 +1,7 @@ import unittest import os -from etl.moviedata import MovieData +from public_data.movie import MovieData class TestMovieData(unittest.TestCase): @@ -11,9 +11,6 @@ class TestMovieData(unittest.TestCase): 'tt0000025', 'tt0010781', 'tt0000481', 'tt0000012', 'tt0000399', 'tt0039624', 'tt0030298', 'tt0039445'] - def __init__(self, *args, **kwargs): - super(TestMovieData, self).__init__(*args, **kwargs) - def test_extract_title_and_year(self): """ test the extractor of movie title and production year @@ -22,7 +19,7 @@ def test_extract_title_and_year(self): :return: """ - def helper_test(imdb_id, expected): + def helper(imdb_id, expected): """ takes in imdb id and the tuple of expected result :param imdb_id: @@ -31,18 +28,15 @@ def helper_test(imdb_id, expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_title_and_year(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + 
self.assertEqual(data_model._extract_title_and_year(), expected) io_wrapper.close() - helper_test(self.test_id_list[0], ('Carmencita', 1894)) - helper_test(self.test_id_list[1], ('The Top 14 Perform', None)) - helper_test(self.test_id_list[2], ('Hot Properties', None)) - helper_test(self.test_id_list[3], ('Episode dated 24 March 2004', None)) - helper_test(self.test_id_list[7], ('La La Land', 2016)) + helper(self.test_id_list[0], ('Carmencita', 1894)) + helper(self.test_id_list[7], ('La La Land', 2016)) def test_extract_poster(self): """ @@ -51,7 +45,7 @@ def test_extract_poster(self): :return: """ - def helper_test(imdb_id, expected): + def helper(imdb_id, expected): """ takes in imdb id and the tuple of expected result :param imdb_id: @@ -61,21 +55,18 @@ def helper_test(imdb_id, expected): data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_poster(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_poster(), expected) io_wrapper.close() - helper_test(self.test_id_list[0], + helper(self.test_id_list[0], "https://images-na.ssl-images-amazon.com/images/" "M/MV5BMjAzNDEwMzk3OV5BMl5BanBnXkFtZTcwOTk4OTM5Ng@@._V1_UY268_CR6,0,182,268_AL_.jpg") - helper_test(self.test_id_list[1], - "https://images-na.ssl-images-amazon.com/images/" - "M/MV5BMTMxMjU0MTMxMl5BMl5BanBnXkFtZTcwNjY4Mjc3MQ@@._V1_UY268_CR2,0,182,268_AL_.jpg") - helper_test(self.test_id_list[13], None) - helper_test(self.test_id_list[14], None) + helper(self.test_id_list[13], None) + helper(self.test_id_list[14], None) def test_extract_credits(self): """ @@ -94,20 +85,17 @@ def helper_test(imdb_id, 
expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_credits(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_credits(), expected) io_wrapper.close() helper_test(self.test_id_list[16], (None, None)) helper_test(self.test_id_list[14], (None, "Birt Acres")) helper_test(self.test_id_list[17], (None, "Auguste Lumière, Louis Lumière")) - helper_test(self.test_id_list[3], ("Agustín Bravo", None)) - helper_test(self.test_id_list[5], ("Grant Gustin, Candice Patton, Danielle Panabaker", None)) helper_test(self.test_id_list[0], ("Carmencita", "William K.L. Dickson")) - helper_test(self.test_id_list[1], ("Joshua Allen, Stephen Boss, Cat Deeley", "Don Weiner")) helper_test(self.test_id_list[18], ("Thomas White", "George S. Fleming, Edwin S. Porter")) helper_test(self.test_id_list[15], ("Ruth Roland, George Larkin, Mark Strong", "Robert Ellis, Louis J. 
Gasnier")) @@ -126,20 +114,16 @@ def helper_test(imdb_id, expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_plot(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_plot(), expected) io_wrapper.close() helper_test(self.test_id_list[0], "Performing on what looks like a small wooden stage, wearing a dress with a " "hoop skirt and white high-heeled pumps, Carmencita does a dance with kicks " "and twirls, a smile always on her face.") - helper_test(self.test_id_list[1], "Host Cat Deeley promised at the outset that the final 14 dancers will face " - "some changes and the competition would get more difficult for the final " - "seven couples...") - helper_test(self.test_id_list[3], None) def test_extract_rated(self): """ @@ -156,16 +140,14 @@ def helper_test(imdb_id, expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_rated(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_rated(), expected) io_wrapper.close() - helper_test(self.test_id_list[4], "TV-14") helper_test(self.test_id_list[0], "NOT RATED") - helper_test(self.test_id_list[1], None) def test_extract_release(self): """ @@ -182,24 +164,13 @@ def 
helper_test(imdb_id, expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_release(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_release(), expected) io_wrapper.close() - # episodes - helper_test(self.test_id_list[1], ('2008-07-02', None, 'episode')) - helper_test(self.test_id_list[3], ('2004-03-24', None, 'episode')) - helper_test(self.test_id_list[4], ('2015-10-06', None, 'episode')) - - # tv - helper_test(self.test_id_list[2], (None, None, 'tv')) - helper_test(self.test_id_list[5], (None, None, 'tv')) - helper_test(self.test_id_list[6], (None, None, 'tv')) - - # # movies helper_test(self.test_id_list[0], ('1894-03-10', 'USA', 'movie')) helper_test(self.test_id_list[7], ('2016-12-25', 'USA', 'movie')) helper_test(self.test_id_list[8], ('1892-10-28', 'France', 'movie')) @@ -208,11 +179,6 @@ def helper_test(imdb_id, expected): helper_test(self.test_id_list[11], ('1913-01-10', 'Germany', 'movie')) helper_test(self.test_id_list[12], (None, None, 'movie')) - # tv-movies - helper_test(self.test_id_list[19], (None, None, 'tv-movie')) - helper_test(self.test_id_list[20], ('1938-07-24', None, 'tv-movie')) - helper_test(self.test_id_list[21], ('1947-12-09', None, 'tv-movie')) - def test_extract_genre(self): """ test the genre token of subtext @@ -228,16 +194,14 @@ def helper_test(imdb_id, expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = 
open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_genre(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_genre(), expected) io_wrapper.close() helper_test(self.test_id_list[0], 'Documentary, Short') - helper_test(self.test_id_list[1], 'Game-Show, Music, Reality-TV') - helper_test(self.test_id_list[2], 'Comedy') helper_test(self.test_id_list[12], None) def test_extract_runtime(self): @@ -255,19 +219,12 @@ def helper_test(imdb_id, expected): """ data_model = MovieData("mock-id") test_data_directory = os.path.realpath( - os.path.join(os.getcwd(), "test/test_data_moviedata/{}.html".format(imdb_id))) + os.path.join(os.getcwd(), "data_movie_data/{}.html".format(imdb_id))) io_wrapper = open(test_data_directory, encoding="utf8") - data_model.build_soup_for_test(io_wrapper) - data_model.extract_process() - self.assertEqual(data_model.extract_runtime(), expected) + data_model._build_soup_for_test(io_wrapper) + data_model._extract_process() + self.assertEqual(data_model._extract_runtime(), expected) io_wrapper.close() helper_test(self.test_id_list[0], 1) - helper_test(self.test_id_list[1], 60) - helper_test(self.test_id_list[2], 30) - helper_test(self.test_id_list[3], 75) - helper_test(self.test_id_list[4], 43) helper_test(self.test_id_list[12], None) - -if __name__ == '__main__': - unittest.main() diff --git a/data/public_data/test/test_movie_rating.py b/data/public_data/test/test_movie_rating.py new file mode 100644 index 0000000..da598f6 --- /dev/null +++ b/data/public_data/test/test_movie_rating.py @@ -0,0 +1,56 @@ +import unittest +import random +import data.utils as utils +from bs4 import BeautifulSoup +from urllib import request, error +from data.etl.movierating import MovieRating + + +class TestMovieRating(unittest.TestCase): + + test_id_list = ['tt0000001', 'tt1234567', 'tt0460648', 
'tt2345678', 'tt4346792', 'tt3107288', 'tt0395865', + 'tt3783958', 'tt0000004', 'tt0000007', 'tt0000502', 'tt0001304', 'tt0000869', 'tt0000019', + 'tt0000025', 'tt0010781', 'tt0000481', 'tt0000012', 'tt0000399', 'tt0039624', 'tt0030298', + 'tt0039445'] + + def __init__(self, *args, **kwargs): + super(TestMovieRating, self).__init__(*args, **kwargs) + + def test_extract_trakt_tv_ratings(self): + self.assertEqual(MovieRating(self.test_id_list[0])._extract_trakt_rating(), ('4.66667', '9')) + self.assertEqual(MovieRating(self.test_id_list[1])._extract_trakt_rating(), ('0.0', '0')) + self.assertEqual(MovieRating(self.test_id_list[2])._extract_trakt_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[3])._extract_trakt_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[4])._extract_trakt_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[7])._extract_trakt_rating(), ('7.92902', '4973')) + + # new movie data for type validation + rating, votes = MovieRating(self.test_id_list[7])._extract_douban_rating() + self.assertTrue(utils.is_numeric(rating.isnumeric())) + self.assertTrue(utils.is_numeric(votes.isnumeric())) + + def test_extract_imdb_rating(self): + self.assertEqual(MovieRating(self.test_id_list[0])._extract_imdb_rating(), ('5.8', '1232')) + self.assertEqual(MovieRating(self.test_id_list[1])._extract_imdb_rating(), ('5.3', '13')) + self.assertEqual(MovieRating(self.test_id_list[2])._extract_imdb_rating(), ('6.4', '227')) + self.assertEqual(MovieRating(self.test_id_list[3])._extract_imdb_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[4])._extract_imdb_rating(), ('8.5', '4265')) + self.assertEqual(MovieRating(self.test_id_list[7])._extract_imdb_rating(), ('8.5', '170403')) + + # new movie data for type validation + rating, votes = MovieRating(self.test_id_list[7])._extract_douban_rating() + self.assertTrue(utils.is_numeric(rating.isnumeric())) + 
self.assertTrue(utils.is_numeric(votes.isnumeric())) + + def test_extract_douban_rating(self): + # old movie data for value validation + self.assertEqual(MovieRating(self.test_id_list[0])._extract_douban_rating(), ('7.1', '164')) + self.assertEqual(MovieRating(self.test_id_list[1])._extract_douban_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[2])._extract_douban_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[3])._extract_douban_rating(), (None, None)) + self.assertEqual(MovieRating(self.test_id_list[4])._extract_douban_rating(), ('7.5', '5416')) + + # new movie data for type validation + rating, votes = MovieRating(self.test_id_list[7])._extract_douban_rating() + self.assertTrue(utils.is_numeric(rating.isnumeric())) + self.assertTrue(utils.is_numeric(votes.isnumeric())) diff --git a/data/test/test_extractor.py b/data/public_data/test/test_something.py similarity index 86% rename from data/test/test_extractor.py rename to data/public_data/test/test_something.py index d1bb9de..ea9b0e6 100644 --- a/data/test/test_extractor.py +++ b/data/public_data/test/test_something.py @@ -3,7 +3,7 @@ import unittest -class TestExtractor(unittest.TestCase): +class TestMovieRating(unittest.TestCase): test_id_list = ['tt0000001', 'tt1234567', 'tt0460648', 'tt2345678', 'tt4346792', 'tt3107288', 'tt0395865', 'tt3783958', 'tt0000004', 'tt0000007', 'tt0000502', 'tt0001304', 'tt0000869', 'tt0000019', @@ -11,7 +11,7 @@ class TestExtractor(unittest.TestCase): 'tt0039445'] def __init__(self, *args, **kwargs): - super(TestExtractor, self).__init__(*args, **kwargs) + super(TestMovieRating, self).__init__(*args, **kwargs) def test_extract_movie_rating(self): data_model = Extractor(None).extract_movie_rating(self.test_id_list[0]) @@ -25,4 +25,4 @@ def test_extract_movie_rating(self): self.assertEqual(item['score'], '7.1') if item['source_id'] == 3: self.assertEqual(item['votes'], '9') - self.assertEqual(item['score'], '4.66667') + 
self.assertEqual(item['score'], '4.66667') \ No newline at end of file diff --git a/data/test/test_transformer.py b/data/public_data/test/test_utils.py similarity index 94% rename from data/test/test_transformer.py rename to data/public_data/test/test_utils.py index 848eec5..e11a042 100644 --- a/data/test/test_transformer.py +++ b/data/public_data/test/test_utils.py @@ -1,5 +1,6 @@ import unittest -from data.etl.transformer import Transformer + +from transformer import Transformer class TestTransformer(unittest.TestCase): @@ -33,7 +34,4 @@ def test_transform_time_imdb(self): self.assertEqual(self.transformer.transform_time_imdb("1h"), "60") self.assertEqual(self.transformer.transform_time_imdb("2h40min"), "160") self.assertEqual(self.transformer.transform_time_imdb("1h 40min "), "100") - self.assertEqual(self.transformer.transform_time_imdb("1h 40min"), "100") - -if __name__ == '__main__': - unittest.main() + self.assertEqual(self.transformer.transform_time_imdb("1h 40min"), "100") \ No newline at end of file diff --git a/data/public_data/transformer.py b/data/public_data/transformer.py new file mode 100644 index 0000000..f001526 --- /dev/null +++ b/data/public_data/transformer.py @@ -0,0 +1,209 @@ +from pytz import timezone +from datetime import datetime, timedelta + +import time + + +class Transformer: + + @staticmethod + def split_release_and_country_imdb(release_country): + """ + given a string containing released date and country of a movie, return both fields + :param release_country: string + :return: string, string + """ + released, country = release_country.replace(")", "").split("(") + released = released.strip() # remove last white space + return released, country + + @staticmethod + def transform_time_imdb(runtime): + """ + given a string of time in various format from imdb, return in minutes + :param runtime: string + :return: string + """ + runtime = runtime.replace(" ", "").replace("min", "") + if "h" in runtime: + [hours, minutes] = 
runtime.split("h") + if minutes == "": + minutes = 0 + runtime = int(hours) * 60 + int(minutes) + return str(runtime) + + @staticmethod + def transform_date_imdb(input_text): + """ + given a date of string from imdb, return date in %Y-%m-%d format + :param input_text: string + :return: string + """ + length_of_date = len(input_text.split(" ")) + if length_of_date == 3: + input_text = datetime.strptime(input_text, '%d %B %Y').strftime('%Y-%m-%d') + elif length_of_date == 2: + input_text = datetime.strptime(input_text, '%B %Y').strftime('%Y-%m-%d') + elif length_of_date == 1: + if input_text == "": + return None + else: + input_text = datetime.strptime(input_text, '%Y').strftime('%Y-%m-%d') + return input_text + + @staticmethod + def movie_rating_votes(votes): + votes = votes.replace(",", "") + return votes + + +class GeneralTransformer: + + @staticmethod + def get_singapore_date(n): + """get the date of n days from now in SGT""" + today = (datetime.fromtimestamp(time.time(), timezone("Singapore")) + timedelta(days=n)).strftime( + "%Y-%m-%d") + return today + + @staticmethod + def convert_12_to_24_hour_time(time_string): + """ + convert time in 12 hour string format to 24 hour string format + :param time_string: string + :return: string + """ + return datetime.strptime(time_string, "%I:%M%p").strftime("%H:%M:%S") + + @staticmethod + def build_imdb_id(i): + """ + this function takes in an integer and converts it to an imdb id + :param i: integer + :return: string + """ + current_imdb_number = "{0:0=7d}".format(i) + imdb_id = "tt" + current_imdb_number + return imdb_id + + +class CinemaScheduleTransformer: + + @staticmethod + def get_id_from_cathay_cinema_name(cinema_name): + """get cathay internal id from their cinema name for web elements""" + mapper = { + "Cathay Cineplex Amk Hub": "", + "Cathay Cineplex Causeway Point": "1", + "Cathay Cineplex Cineleisure Orchard": "2", + "Cathay Cineplex Downtown East": "3", + "Cathay Cineplex Jem": "4", + "The Cathay Cineplex": 
"5", + "Cathay Cineplex West Mall": "6" + } + return mapper[cinema_name] + + def parse_cinema_object_to_data(self, cinema_object): + """ + parse the cinema object in the format: + (based on self.provider, parsing strategy may vary) + { + movie_title: a list of movie schedule + } + to the format that can be consumed by loader class and + subsequently being stored into the database + { + "title": ..., + "schedule": [...], + "type": ... + + In the process, it will complete 2 additional tasks + besides rearranging the dictionary -- parse the movie + title into title and additional information such as + "3D" "Dolby Digital", and match the title to imdb id + + It will also return another list of imdb id found in this + process and subjected to movie data extraction process if + imdb id is not present in database + :return: dictionary + """ + data_object = [] + + # parse title + for key, value in cinema_object.items(): + if "Zen Zone" in key: # strange thing in gv + continue + title, additional_info = self._movie_title_parser(key) + data_object.append( + { + "title": title, + "schedule": value, + "type": additional_info + }) + return data_object + + def _movie_title_parser(self, title): + additional_info = [] + if self.provider == "gv": + if "`" in title: + title = title.replace("`", "\'") + if "*" in title: + title = title.replace("*", "") + additional_info.append("No free pass") + if "(Eng Sub)" in title: + title = title.replace("(Eng Sub)", "") + additional_info.append("English sub only") + if "(Atmos)" in title: + title = title.replace("(Atmos)", "") + additional_info.append("Atmos") + if "Dessert Set" in title: + title = title.replace("Dessert Set", "") + additional_info.append("Dessert Set") + if "(D-Box)" in title: + title = title.replace("(D-Box)", "") + additional_info.append("(D-Box)") + elif self.provider == "cathay": + if "*" in title: + title = title.replace("*", "") + # have not figure out the meaning of * + if "(Dolby Digital)" in title: + tokens = 
title.split(" ") + splitter = tokens.index("(Dolby") + title = " ".join(tokens[:splitter - 1]) + additional_info.append("Dolby Digital") + if "(Dolby Atmos)" in title: + tokens = title.split(" ") + splitter = tokens.index("(Dolby") + title = " ".join(tokens[:splitter - 1]) + additional_info.append("Dolby Atmos") + title = title.replace("Atmos", "") + elif self.provider == "sb": + # special rules + if "Kungfu" in title: + title = title.replace("Kungfu", "Kung-fu") + + # general rules + if "`" in title: + title = title.replace("`", "\'") + if "[D]" in title: + title = title.replace("[D]", "") + additional_info.append("Digital") + if "[IMAX]" in title: + title = title.replace("[IMAX]", "") + additional_info.append("IMAX") + if "[M]" in title: + title = title.replace("[M]", "") + if "[IMAX 3D]" in title: + title = title.replace("[IMAX 3D]", "") + additional_info.append("IMAX") + additional_info.append("3D") + + else: + raise Exception("Invalid cinema provider") + + title = title.strip() + additional_info = ",".join(additional_info) + return title, additional_info + + + diff --git a/data/utils.py b/data/public_data/utils.py similarity index 56% rename from data/utils.py rename to data/public_data/utils.py index de70119..e634608 100644 --- a/data/utils.py +++ b/data/public_data/utils.py @@ -1,43 +1,80 @@ -""" - This file contains miscellaneous functions used by all classes -""" -import logging -import datetime +from enum import Enum +from datetime import datetime -# ============== -# Logger -# ============== -def initialise_logger(): +class UrlFormatter(Enum): + + IMDB_URL_FORMAT = "http://www.imdb.com/title/{}/" + + +class InvalidMovieTypeException(Exception): + pass + + +def split_release_and_country_imdb(release_country): """ - initialise general logger, create general.log file in current directory + given a string containing released date and country of a movie, return both fields + :param release_country: string + :return: string, string """ - logger = 
logging.getLogger("general_logger") - logger.setLevel(logging.INFO) - file_handler = logging.FileHandler('general.log', mode='w') - file_handler.setLevel(logging.DEBUG) - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - file_handler.setFormatter(formatter) - logger.addHandler(file_handler) - return logger + released, country = release_country.replace(")", "").split("(") + released = released.strip() # remove last white space + return released, country -def initialise_test_logger(): +def transform_time_imdb(runtime): """ - initialise a console logger for testing classes + given a string of time in various format from imdb, return in minutes + :param runtime: string + :return: string """ - logger = logging.getLogger("test_logger") - logger.setLevel(logging.DEBUG) - return logger + runtime = runtime.replace(" ", "").replace("min", "") + if "h" in runtime: + [hours, minutes] = runtime.split("h") + if minutes == "": + minutes = 0 + runtime = int(hours) * 60 + int(minutes) + return str(runtime) -# ============== -# Movie Data -# ============== -def get_movie_data_dict(actors, country, director, genre, imdb_id, language, plot, poster_url, - production_year, rated, released, runtime, title, type): +def transform_date_imdb(input_text): + """ + given a date of string from imdb, return date in %Y-%m-%d format + :param input_text: string + :return: string """ - this is the data model of movie data. 
+ length_of_date = len(input_text.split(" ")) + if length_of_date == 3: + input_text = datetime.strptime(input_text, '%d %B %Y').strftime('%Y-%m-%d') + elif length_of_date == 2: + input_text = datetime.strptime(input_text, '%B %Y').strftime('%Y-%m-%d') + elif length_of_date == 1: + if input_text == "": + return None + else: + input_text = datetime.strptime(input_text, '%Y').strftime('%Y-%m-%d') + return input_text + + +def get_movie_data_dict(actors, country, director, genre, imdb_id, language, plot, poster_url, production_year, rated, + released, runtime, title, type): + """ + this is the data model of movie data. + :param actors: string + :param country: string + :param director: string + :param genre: string + :param imdb_id: string + :param language: string + :param plot: string + :param poster_url: string + :param production_year: integer + :param rated: string + :param released: datetime + :param runtime: string + :param title: string + :param type: string + :return: dictionary """ movie_data = { "movie_id": imdb_id, @@ -77,63 +114,6 @@ def get_movie_rating_dict(score, votes, imdb_id, rating_source): return movie_rating -def imdb_id_builder(i): - """ - this function takes in an integer and converts it to an imdb id - """ - current_imdb_number = "{0:0=7d}".format(i) - imdb_id = "tt" + current_imdb_number - return imdb_id - - -def split_release_and_country_imdb(release_country): - """ - given a string containing released date and country of a movie, return both fields - :param release_country: string - :return: string, string - """ - released, country = release_country.replace(")", "").split("(") - released = released.strip() # remove last white space - return released, country -def transform_time_imdb(runtime): - """ - given a string of time in various format from imdb, return in minutes - :param runtime: string - :return: string - """ - runtime = runtime.replace(" ", "").replace("min", "") - if "h" in runtime: - [hours, minutes] = runtime.split("h") - if 
minutes == "": - minutes = 0 - runtime = int(hours) * 60 + int(minutes) - return str(runtime) - - -def transform_date_imdb(input_text): - """ - given a date of string from imdb, return date in %Y-%m-%d format - :param input_text: string - :return: string - """ - length_of_date = len(input_text.split(" ")) - if length_of_date == 3: - input_text = datetime.datetime.strptime(input_text, '%d %B %Y').strftime('%Y-%m-%d') - elif length_of_date == 2: - input_text = datetime.datetime.strptime(input_text, '%B %Y').strftime('%Y-%m-%d') - elif length_of_date == 1: - if input_text == "": - return None - else: - input_text = datetime.datetime.strptime(input_text, '%Y').strftime('%Y-%m-%d') - return input_text - -def is_numeric(number): - try: - float(number) - except ValueError: - return False - return True diff --git a/data/recommedation_algo/__init__.py b/data/recommedation_algo/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data/test/test_cinemalist.py b/data/test/test_cinemalist.py deleted file mode 100644 index 54e6373..0000000 --- a/data/test/test_cinemalist.py +++ /dev/null @@ -1,19 +0,0 @@ -from data.etl.cinemalist import CinemaList - -import unittest - - -class TestCinemaList(unittest.TestCase): - - def setUp(self): - self.cinema_list = CinemaList() - - # def test_get_golden_village(self): - # cinemalist = CinemaList() - # cinemalist.get_golden_village_cinema_list() - - # def test_get_cathay(self): - # self.cinema_list.get_cathay() - - # def test_get_shaw_brother(self): - # self.cinema_list.get_shaw_brother() diff --git a/data/test/test_movierating.py b/data/test/test_movierating.py deleted file mode 100644 index fe2c5b1..0000000 --- a/data/test/test_movierating.py +++ /dev/null @@ -1,56 +0,0 @@ -import unittest -import random -import data.utils as utils -from bs4 import BeautifulSoup -from urllib import request, error -from data.etl.movierating import MovieRating - - -class TestMovieRating(unittest.TestCase): - - test_id_list = ['tt0000001', 
'tt1234567', 'tt0460648', 'tt2345678', 'tt4346792', 'tt3107288', 'tt0395865', - 'tt3783958', 'tt0000004', 'tt0000007', 'tt0000502', 'tt0001304', 'tt0000869', 'tt0000019', - 'tt0000025', 'tt0010781', 'tt0000481', 'tt0000012', 'tt0000399', 'tt0039624', 'tt0030298', - 'tt0039445'] - - def __init__(self, *args, **kwargs): - super(TestMovieRating, self).__init__(*args, **kwargs) - - def test_extract_trakt_tv_ratings(self): - self.assertEqual(MovieRating(self.test_id_list[0]).extract_trakt_rating(), ('4.66667', '9')) - self.assertEqual(MovieRating(self.test_id_list[1]).extract_trakt_rating(), ('0.0', '0')) - self.assertEqual(MovieRating(self.test_id_list[2]).extract_trakt_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[3]).extract_trakt_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[4]).extract_trakt_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[7]).extract_trakt_rating(), ('7.92902', '4973')) - - # new movie data for type validation - rating, votes = MovieRating(self.test_id_list[7]).extract_douban_rating() - self.assertTrue(utils.is_numeric(rating.isnumeric())) - self.assertTrue(utils.is_numeric(votes.isnumeric())) - - def test_extract_imdb_rating(self): - self.assertEqual(MovieRating(self.test_id_list[0]).extract_imdb_rating(), ('5.8', '1232')) - self.assertEqual(MovieRating(self.test_id_list[1]).extract_imdb_rating(), ('5.3', '13')) - self.assertEqual(MovieRating(self.test_id_list[2]).extract_imdb_rating(), ('6.4', '227')) - self.assertEqual(MovieRating(self.test_id_list[3]).extract_imdb_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[4]).extract_imdb_rating(), ('8.5', '4265')) - self.assertEqual(MovieRating(self.test_id_list[7]).extract_imdb_rating(), ('8.5', '170403')) - - # new movie data for type validation - rating, votes = MovieRating(self.test_id_list[7]).extract_douban_rating() - self.assertTrue(utils.is_numeric(rating.isnumeric())) - 
self.assertTrue(utils.is_numeric(votes.isnumeric())) - - def test_extract_douban_rating(self): - # old movie data for value validation - self.assertEqual(MovieRating(self.test_id_list[0]).extract_douban_rating(), ('7.1', '164')) - self.assertEqual(MovieRating(self.test_id_list[1]).extract_douban_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[2]).extract_douban_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[3]).extract_douban_rating(), (None, None)) - self.assertEqual(MovieRating(self.test_id_list[4]).extract_douban_rating(), ('7.5', '5416')) - - # new movie data for type validation - rating, votes = MovieRating(self.test_id_list[7]).extract_douban_rating() - self.assertTrue(utils.is_numeric(rating.isnumeric())) - self.assertTrue(utils.is_numeric(votes.isnumeric()))