From c1eb14e5ace4710f55bf1502bcfc5773757950d1 Mon Sep 17 00:00:00 2001
From: Franco Manca
Date: Sun, 19 Jul 2020 09:38:13 -0300
Subject: [PATCH] Add functions to build a link and fetch an article, use them
 in _news_scraper, and print the title of each fetched article
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 web-scrapper/main.py | 56 ++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 52 insertions(+), 4 deletions(-)

diff --git a/web-scrapper/main.py b/web-scrapper/main.py
index 667c37d..1a96447 100644
--- a/web-scrapper/main.py
+++ b/web-scrapper/main.py
@@ -1,28 +1,76 @@
 import argparse  # Standard library module for parsing command-line options and arguments
 import logging  # Standard library module for status, error, and informational messages
 logging.basicConfig(level=logging.INFO)
+import re  # Regular expression module
+
+from requests.exceptions import HTTPError
+from urllib3.exceptions import MaxRetryError
 
 import news_page_objects as news
-from common import config
+import common
 
 logger = logging.getLogger(__name__)
+is_well_formed_link = re.compile(r'^https?://.+/.+$')  # e.g. https://example.com/hello
+is_root_path = re.compile(r'^/.+$')  # e.g. /some-text
 
 
 def _news_scraper(news_site):
-    host = config()['news_sites'][news_site]['url']
+    host = common.config()['news_sites'][news_site]['url']
 
     logging.info('Beginning scraper for {}'.format(host))
     homepage = news.HomePage(news_site, host)
 
+    articles = []
     for link in homepage.article_links:
-        print(link)
+        article = _fetch_article(news_site, host, link)
+
+        if article:
+            logger.info('Article fetched!')
+            articles.append(article)
+            print(article.title)
+
+    print(len(articles))
+
+
+def _fetch_article(news_site, host, link):
+    '''Fetch a single article.
+    @param news_site: news site the article will be fetched from
+    @param host: base URL of the news site
+    '''
+    logger.info('Start fetching article at {}'.format(link))  # Console message
+
+    article = None
+    try:
+        article = news.ArticlePage(news_site, _build_link(host, link))
+    except (HTTPError, MaxRetryError):
+        logger.warning('Error while fetching the article', exc_info=False)
+
+    if article and not article.body:
+        logger.warning('Invalid article. There is no body')
+        return None
+
+    return article
+
+
+def _build_link(host, link):
+    '''Build a usable absolute link.
+    @param host: base URL of the news site
+    @param link: link as found on the homepage
+    @return: the absolute link; the link form is detected with regular expressions.
+    '''
+    if is_well_formed_link.match(link):
+        return link
+    elif is_root_path.match(link):
+        return '{}{}'.format(host, link)
+    else:
+        return '{host}/{uri}'.format(host=host, uri=link)
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()  # Argument parser
 
-    news_site_choices = list(config()['news_sites'].keys())
+    news_site_choices = list(common.config()['news_sites'].keys())
     parser.add_argument('news_site',
                         help='The news site that you want to scrape',
                         type=str,
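
Note: both _news_scraper and the argparse setup read site definitions through
common.config(). The config file itself is not shown in this patch; the lookups
config()['news_sites'][news_site]['url'] and list(config()['news_sites'].keys())
only imply its shape. A minimal sketch of what the parsed config could look
like, assuming common.config() returns a plain dict (the site name and URL
below are illustrative, not taken from the project):

    # Hypothetical parsed config, inferred from the lookups in main.py
    config = {
        'news_sites': {
            'examplenews': {'url': 'https://news.example.com'},
        }
    }

    host = config['news_sites']['examplenews']['url']       # -> 'https://news.example.com'
    news_site_choices = list(config['news_sites'].keys())   # -> ['examplenews']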
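
Note: the core of this change is _build_link, which normalizes the three link
forms a homepage can emit (absolute, root-relative, page-relative) using the
two module-level regexes. A standalone sketch of that behavior, with
https://example.com as an assumed host rather than a real configured site:

    import re

    is_well_formed_link = re.compile(r'^https?://.+/.+$')  # absolute URL with a path
    is_root_path = re.compile(r'^/.+$')                    # root-relative path

    def _build_link(host, link):
        if is_well_formed_link.match(link):
            return link                                        # already absolute
        elif is_root_path.match(link):
            return '{}{}'.format(host, link)                   # prepend the host
        else:
            return '{host}/{uri}'.format(host=host, uri=link)  # relative to the site root

    host = 'https://example.com'
    assert _build_link(host, 'https://example.com/a/story') == 'https://example.com/a/story'
    assert _build_link(host, '/a/story') == 'https://example.com/a/story'
    assert _build_link(host, 'a/story') == 'https://example.com/a/story'

The standard library's urllib.parse.urljoin could cover the same cases, but
the explicit regexes keep each branch of the normalization visible.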