-
-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added Parool (NL), Revista 22 (RO), Dilema (RO) and Internazionale (IT); updated Volkskrant to download cover
- Loading branch information
Showing
10 changed files
with
419 additions
and
3 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python | ||
from calibre.web.feeds.recipes import BasicNewsRecipe | ||
from bs4 import BeautifulSoup | ||
|
||
class Volkskrant(BasicNewsRecipe):
    # NOTE(review): the class is named Volkskrant but implements the Dilema
    # (RO) recipe — presumably copy-pasted from the Volkskrant recipe. Name
    # kept unchanged so any external references keep working.
    title = 'Dilema'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'politics, culture, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'post'}
    remove_tags_after = {'class': 'post_content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'single_meta_category',
                    'avatar',
                    'jm-post-like',
                    'fa',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['like', 'dlik']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        """Scrape the Dilema homepage and return [(section, articles)].

        Returns a single section ("Numărul curent") containing the banner
        article followed by the articles from the 3-grid containers.
        """
        from urllib.parse import urljoin
        homepage_url = 'https://www.dilema.ro/'
        soup = self.index_to_soup(homepage_url)

        articles = []

        # Lead article: .banner-container. Guard against layout changes so a
        # missing banner does not abort the whole fetch.
        banner_container = soup.find('div', attrs={'class': 'banner-container'})
        if banner_container is not None:
            container = banner_container.find('h5')
            a = container.find('a')
            # urljoin resolves relative, root-relative and absolute hrefs;
            # plain concatenation corrupted absolute URLs.
            url = urljoin(homepage_url, a.attrs['href'])
            articles.append(
                dict(
                    title=self.tag_to_string(container).strip(),
                    url=url,
                    date=self.tag_to_string(
                        banner_container.find(attrs={'class': 'post-date'})
                    ).strip(),
                    description='',
                    content=''
                )
            )

        # Article grids: .homepage_builder_3grid_post
        containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'})
        for container in containers:
            # Skip the "most recent" / "most read" aggregation boxes — they
            # duplicate articles listed elsewhere on the page.
            if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']:
                continue
            for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}):
                title_container = article.find('h3')
                if not title_container:
                    continue
                url = urljoin(homepage_url, title_container.find('a')['href'])
                article_title = self.tag_to_string(title_container).strip()
                author = self.tag_to_string(
                    article.find('a', attrs={'rel': 'author'})
                ).strip()
                summary = self.tag_to_string(article.find('p')).strip()
                pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'}))
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=author + ' - ' + summary,
                        content=''
                    )
                )

        return [('Numărul curent', articles)]

    def preprocess_html(self, soup):
        """For pages carrying a #main-carousel, keep only its image."""
        main_carousel = soup.find(attrs={'id': 'main-carousel'})
        if main_carousel:
            img = main_carousel.find('img')
            body = soup.find('body')
            body.clear()
            body.append(img)
        return soup

    def get_cover_url(self):
        """Return the absolute URL of the current issue's cover image."""
        from urllib.parse import urljoin
        url = 'https://www.dilema.ro/coperta-saptaminii/'
        soup = self.index_to_soup(url)
        img = soup.find(attrs={'id': 'main-carousel'}).find('img')
        # urljoin resolves the src against the page URL; the original
        # `url + src` produced broken URLs for root-relative or absolute
        # src attributes.
        return urljoin(url, img.attrs['src'])
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#!/usr/bin/env python | ||
from calibre.web.feeds.recipes import BasicNewsRecipe | ||
|
||
class Volkskrant(BasicNewsRecipe):
    # NOTE(review): class name kept from the recipe this file was copied
    # from; it actually implements the Internazionale (IT) recipe.
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    cover_url = None  # filled in by parse_index, served by get_cover_url

    def extract_article(self, article):
        """Convert one .abstract-article node into a calibre article dict."""
        url = article.find('a')['href']
        # startswith avoids an IndexError on an empty href
        if url.startswith('/'):
            url = self.home_url + url
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag:
            title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author:
            description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary:
            description_parts.append(self.tag_to_string(summary))
        return dict(
            title=article_title,
            url=url,
            date='',
            description=' \u2022 '.join(description_parts),
            content=''
        )

    def parse_index(self):
        """Scrape the table of contents and return [(section, articles)].

        Section headers (.abstract-testatina*) and article containers are
        interleaved siblings inside .content_data; articles are collected
        into the most recently seen section.
        """
        soup = self.index_to_soup(self.current_number_url)
        self.cover_url = soup.find('span', {'class': 'img_expand'})['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            classes = container['class']
            if 'abstract-testatina' in classes or 'abstract-testatina-cultura' in classes:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in classes:
                if current_section is None:
                    # Guard: articles may precede the first section header;
                    # the original crashed on current_section[1] here.
                    current_section = ('Internazionale', [])
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in classes:
                if current_section is None:
                    current_section = ('Internazionale', [])
                current_section[1].append(self.extract_article(container))
                continue

        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        """Materialize lazy-loaded figure images and flatten figure markup."""
        for node in soup.findAll('figure'):
            img_src = None
            # Prefer the largest available responsive variant.
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the cover URL captured during parse_index (may be None)."""
        return self.cover_url
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#!/usr/bin/env python | ||
from calibre.web.feeds.recipes import BasicNewsRecipe | ||
import uuid | ||
from mechanize import Request | ||
from contextlib import closing | ||
import json | ||
|
||
class Parool(BasicNewsRecipe):
    title = 'Het Parool'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Het Parool - Vrij, Onverveerd'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(attrs={'class': [
            'article-footer__sharing',
            'artstyle__editorial-tips',
            'artstyle__advertisement',
            'artstyle__container__icon',
            'artstyle__disabled-embed',
            'container__title__icon',
        ]}),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        """Scrape today's edition overview and return [(section, articles)].

        The privacy-wall URL with a random auth id bypasses the consent
        interstitial and redirects to the edition page.
        """
        soup = self.index_to_soup(
            'https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                # startswith avoids an IndexError on an empty href
                if url.startswith('/'):
                    url = 'https://www.parool.nl' + url
                # Only articles from the paper edition are wanted.
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
                # Comic strips are skipped here and trimmed again in
                # preprocess_html for articles reached via other paths.
                ignore = {'dirkjan', 's1ngle', 'pukkels', 'hein de kort'}
                if teaser_label.lower() in ignore:
                    continue
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                articles.append(dict(title=article_title,
                                     url=url,
                                     content=''))

            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        """Absolutize image URLs, unwrap <picture>, and trim comic pages."""
        for tag in soup.findAll('img'):
            # .get() guards against <img> tags without a src attribute,
            # which raised KeyError in the original.
            src = tag.get('src')
            if src and src.startswith('/'):
                tag['src'] = 'https://www.parool.nl' + src
        for tag in soup.findAll('picture'):
            tag.replaceWith(tag.find('img'))
        # Comic pages list every past strip; keep only the first figure.
        comic_articles = {
            'Alle strips van Dirkjan',
            'S1NGLE',
            'Pukkels',
            'Bekijk hier alle cartoons van Hein de Kort',
        }
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        """Fetch the cover image URL from the e-pages folder API."""
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.parool.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            # The newest folder's medium teaser image is the cover.
            return folders['objects'][0]['teaser_medium']
        # (dead `return None` after the with-block removed: unreachable)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.