Added Parool (NL), Revista 22 (RO), Dilema (RO) and Internazionale (IT); updated Volkskrant to download cover
itsirc committed Apr 19, 2024
1 parent c581bad commit 24befe4
Showing 10 changed files with 419 additions and 3 deletions.
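All of the new recipes follow the same calibre pattern: a BasicNewsRecipe subclass whose parse_index() scrapes the site's front page or current-issue page with index_to_soup() and returns a list of (section_title, articles) tuples, optionally cleaning up article pages in preprocess_html() and supplying an issue cover via get_cover_url(). A minimal, illustrative sketch of that shared skeleton (the class name, URL and CSS selector are placeholders, not taken from this commit):

#!/usr/bin/env python
# Illustrative only: the structure shared by the recipes added below.
from calibre.web.feeds.recipes import BasicNewsRecipe


class ExampleRecipe(BasicNewsRecipe):
    title = 'Example'
    language = 'en'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        # Fetch the front page and collect article links into one section.
        soup = self.index_to_soup('https://example.com/')
        articles = []
        for a in soup.findAll('a', attrs={'class': 'teaser'}):  # placeholder selector
            articles.append(dict(
                title=self.tag_to_string(a).strip(),
                url=a['href'],
                date='',
                description='',
                content='',
            ))
        return [('Front page', articles)]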
Binary file added recipes/dilema.png
107 changes: 107 additions & 0 deletions recipes/dilema.recipe
@@ -0,0 +1,107 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
from bs4 import BeautifulSoup

class Dilema(BasicNewsRecipe):
    title = 'Dilema'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'politics, culture, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'post'}
    remove_tags_after = {'class': 'post_content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'single_meta_category',
                    'avatar',
                    'jm-post-like',
                    'fa',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['like', 'dlik']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ["class", "id", "name", "style"]
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        homepage_url = 'https://www.dilema.ro/'
        soup = self.index_to_soup(homepage_url)

        articles = []

        # .banner-container
        banner_container = soup.find('div', attrs={'class': 'banner-container'})
        container = banner_container.find('h5')
        a = container.find('a')
        url = homepage_url + a.attrs['href']
        articles.append(
            dict(
                title=self.tag_to_string(container).strip(),
                url=url,
                date=self.tag_to_string(banner_container.find(attrs={'class': 'post-date'})).strip(),
                description='',
                content=''
            )
        )

        # .homepage_builder_3grid_post
        containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'})
        for container in containers:
            if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']:
                continue
            for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}):
                title_container = article.find('h3')
                if not title_container:
                    continue
                url = title_container.find('a')['href']
                url = homepage_url + url
                article_title = self.tag_to_string(title_container).strip()
                author = self.tag_to_string(
                    article.find('a', attrs={'rel': 'author'})
                ).strip()
                summary = self.tag_to_string(article.find('p')).strip()
                pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'}))
                description = author + ' - ' + summary
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=description,
                        content=''
                    )
                )

        sections = [("Numărul curent", articles)]
        return sections

    def preprocess_html(self, soup):
        main_carousel = soup.find(attrs={'id': 'main-carousel'})
        if main_carousel:
            img = main_carousel.find('img')
            body = soup.find('body')
            body.clear()
            body.append(img)
        return soup

    def get_cover_url(self):
        url = 'https://www.dilema.ro/coperta-saptaminii/'
        soup = self.index_to_soup(url)
        img = soup.find(attrs={'id': 'main-carousel'}).find('img')
        return url + img.attrs['src']
Binary file added recipes/internazionale.png
117 changes: 117 additions & 0 deletions recipes/internazionale.recipe
@@ -0,0 +1,117 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe

class Internazionale(BasicNewsRecipe):
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ["class", "id", "name", "style"]
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = "https://www.internazionale.it/sommario"
    home_url = "https://www.internazionale.it"
    cover_url = None

    def extract_article(self, article):
        url = article.find('a')['href']
        if url[0] == '/':
            url = self.home_url + url
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag: title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        pubdate = ''
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author: description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary: description_parts.append(self.tag_to_string(summary))
        description = ' \u2022 '.join(description_parts)
        return dict(
            title=article_title,
            url=url,
            date=pubdate,
            description=description,
            content=''
        )

    def parse_index(self):
        soup = self.index_to_soup(self.current_number_url)
        self.cover_url = soup.find('span', {'class': 'img_expand'})['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            if 'abstract-testatina' in container['class'] or 'abstract-testatina-cultura' in container['class']:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in container['class']:
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in container['class']:
                current_section[1].append(self.extract_article(container))
                continue

            # print(container['class'])
        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        for node in soup.findAll('figure'):
            img_src = None
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        # if self.browser.cookiejar:
        #     self.browser.cookiejar.clear()
        return soup

    def get_cover_url(self):
        return self.cover_url
Binary file added recipes/parool.png
96 changes: 96 additions & 0 deletions recipes/parool.recipe
@@ -0,0 +1,96 @@
#!/usr/bin/env python
from calibre.web.feeds.recipes import BasicNewsRecipe
import uuid
from mechanize import Request
from contextlib import closing
import json

class Parool(BasicNewsRecipe):
    title = 'Het Parool'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Het Parool - Vrij, Onverveerd'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(attrs={'class': ['article-footer__sharing', 'artstyle__editorial-tips', 'artstyle__advertisement', 'artstyle__container__icon', 'artstyle__disabled-embed', 'container__title__icon']}),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ["class", "id", "name", "style"]
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        soup = self.index_to_soup('https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4()))
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                if url[0] == '/':
                    url = 'https://www.parool.nl' + url
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
                ignore = {"dirkjan", "s1ngle", "pukkels", "hein de kort"}
                if teaser_label.lower() in ignore:
                    continue
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                articles.append(dict(title=article_title,
                                     url=url,
                                     content=''))

            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        for tag in soup():
            if tag.name == 'img':
                if tag['src'][0] == '/':
                    tag['src'] = 'https://www.parool.nl' + tag['src']
        for tag in soup():
            if tag.name == "picture":
                tag.replaceWith(tag.find("img"))
        comic_articles = {
            "Alle strips van Dirkjan",
            "S1NGLE",
            "Pukkels",
            "Bekijk hier alle cartoons van Hein de Kort",
        }
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = "https://login-api.e-pages.dk/v1/krant.parool.nl/folders"
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            return folders["objects"][0]["teaser_medium"]
        return None
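Het Parool's get_cover_url() asks the e-pages API for the paper's folder list and returns the first folder's teaser image. A sketch of the JSON shape this lookup assumes (the field value is hypothetical; only the keys actually read by the recipe are shown):

# Hypothetical response from the /folders endpoint, reduced to the keys used above.
folders = {
    "objects": [
        {"teaser_medium": "https://login-api.e-pages.dk/.../teaser_medium.jpg"},
    ],
}
cover_url = folders["objects"][0]["teaser_medium"]  # what the recipe returns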
Binary file added recipes/revista22.png
