-
-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added Parool (NL), Revista 22 (RO), Dilema (RO) and Internazionale (IT); updated Volkskrant to download cover
- Loading branch information
Showing
10 changed files
with
419 additions
and
3 deletions.
There are no files selected for viewing
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,107 @@ | ||
#!/usr/bin/env python | ||
from calibre.web.feeds.recipes import BasicNewsRecipe | ||
from bs4 import BeautifulSoup | ||
|
||
class Volkskrant(BasicNewsRecipe):
    # NOTE(review): the class is named Volkskrant but implements the Dilema
    # (RO) recipe — presumably copy-pasted from the Volkskrant recipe. Name
    # kept unchanged so any external references keep working.
    title = 'Dilema'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = '"Sint vechi, domnule!" (I.L. Caragiale)'
    needs_subscription = False
    language = 'ro'
    country = 'RO'
    category = 'politics, culture, Romania'
    resolve_internal_links = True
    remove_tags_before = {'class': 'post'}
    remove_tags_after = {'class': 'post_content'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'single_meta_category',
                    'avatar',
                    'jm-post-like',
                    'fa',
                ]
            }
        ),
        dict(
            name=['div'],
            attrs={
                'class': ['mb-2']
            }
        ),
        dict(id=['like', 'dlik']),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        """Scrape the Dilema homepage and return [(section, articles)].

        Returns a single section ("Numărul curent") containing the banner
        article followed by the articles from the 3-grid containers.
        """
        from urllib.parse import urljoin
        homepage_url = 'https://www.dilema.ro/'
        soup = self.index_to_soup(homepage_url)

        articles = []

        # Lead article: .banner-container. Guard against layout changes so a
        # missing banner does not abort the whole fetch.
        banner_container = soup.find('div', attrs={'class': 'banner-container'})
        if banner_container is not None:
            container = banner_container.find('h5')
            a = container.find('a')
            # urljoin resolves relative, root-relative and absolute hrefs;
            # plain concatenation corrupted absolute URLs.
            url = urljoin(homepage_url, a.attrs['href'])
            articles.append(
                dict(
                    title=self.tag_to_string(container).strip(),
                    url=url,
                    date=self.tag_to_string(
                        banner_container.find(attrs={'class': 'post-date'})
                    ).strip(),
                    description='',
                    content=''
                )
            )

        # Article grids: .homepage_builder_3grid_post
        containers = soup.findAll('div', attrs={'class': 'homepage_builder_3grid_post'})
        for container in containers:
            # Skip the "most recent" / "most read" aggregation boxes — they
            # duplicate articles listed elsewhere on the page.
            if self.tag_to_string(container.find('h2')) in ['CELE MAI RECENTE', 'CELE MAI CITITE']:
                continue
            for article in container.findAll('div', attrs={'class': 'blog_grid_post_style'}):
                title_container = article.find('h3')
                if not title_container:
                    continue
                url = urljoin(homepage_url, title_container.find('a')['href'])
                article_title = self.tag_to_string(title_container).strip()
                author = self.tag_to_string(
                    article.find('a', attrs={'rel': 'author'})
                ).strip()
                summary = self.tag_to_string(article.find('p')).strip()
                pubdate = self.tag_to_string(article.find(attrs={'class': 'post-date'}))
                articles.append(
                    dict(
                        title=article_title,
                        url=url,
                        date=pubdate,
                        description=author + ' - ' + summary,
                        content=''
                    )
                )

        return [('Numărul curent', articles)]

    def preprocess_html(self, soup):
        """For pages carrying a #main-carousel, keep only its image."""
        main_carousel = soup.find(attrs={'id': 'main-carousel'})
        if main_carousel:
            img = main_carousel.find('img')
            body = soup.find('body')
            body.clear()
            body.append(img)
        return soup

    def get_cover_url(self):
        """Return the absolute URL of the current issue's cover image."""
        from urllib.parse import urljoin
        url = 'https://www.dilema.ro/coperta-saptaminii/'
        soup = self.index_to_soup(url)
        img = soup.find(attrs={'id': 'main-carousel'}).find('img')
        # urljoin resolves the src against the page URL; the original
        # `url + src` produced broken URLs for root-relative or absolute
        # src attributes.
        return urljoin(url, img.attrs['src'])
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#!/usr/bin/env python | ||
from calibre.web.feeds.recipes import BasicNewsRecipe | ||
|
||
class Volkskrant(BasicNewsRecipe):
    # NOTE(review): class name kept from the recipe this file was copied
    # from; it actually implements the Internazionale (IT) recipe.
    title = 'Internazionale'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Internazionale - Notizie dall’Italia e dal mondo'
    needs_subscription = False
    language = 'it'
    country = 'IT'
    category = 'news, politics, Italy, world'
    resolve_internal_links = True
    remove_tags_before = {'name': 'article'}
    remove_tags_after = {'name': 'article'}
    remove_tags = [
        dict(
            attrs={
                'class': [
                    'item-banner',
                    'hentryfeed__side',
                    'magazine-article-share-tools',
                    'magazine-article-share-popup',
                    'article_next',
                    'cta_nl_ext_container',
                ]
            }
        ),
        dict(name=['script', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    current_number_url = 'https://www.internazionale.it/sommario'
    home_url = 'https://www.internazionale.it'
    cover_url = None  # filled in by parse_index, served by get_cover_url

    def extract_article(self, article):
        """Convert one .abstract-article node into a calibre article dict."""
        url = article.find('a')['href']
        # startswith avoids an IndexError on an empty href
        if url.startswith('/'):
            url = self.home_url + url
        title_parts = []
        tag = article.find('div', {'class': 'abstract-article__tag'})
        if tag:
            title_parts.append(self.tag_to_string(tag).upper())
        title_parts.append(self.tag_to_string(article.find('div', {'class': 'abstract-article__title'})))
        article_title = ' \u2022 '.join(title_parts)
        description_parts = []
        author = article.find('div', {'class': 'abstract-article__author'})
        if author:
            description_parts.append(self.tag_to_string(author))
        summary = article.find('div', {'class': 'abstract-article__content'})
        if summary:
            description_parts.append(self.tag_to_string(summary))
        return dict(
            title=article_title,
            url=url,
            date='',
            description=' \u2022 '.join(description_parts),
            content=''
        )

    def parse_index(self):
        """Scrape the table of contents and return [(section, articles)].

        Section headers (.abstract-testatina*) and article containers are
        interleaved siblings inside .content_data; articles are collected
        into the most recently seen section.
        """
        soup = self.index_to_soup(self.current_number_url)
        self.cover_url = soup.find('span', {'class': 'img_expand'})['data-src']
        main_container = soup.find('div', {'class': 'content_data'})
        children = main_container.findAll('div', recursive=False)
        sections = []
        current_section = None
        for container in children:
            classes = container['class']
            if 'abstract-testatina' in classes or 'abstract-testatina-cultura' in classes:
                if current_section:
                    sections.append(current_section)
                current_section = (self.tag_to_string(container), [])
                continue

            if 'masonry-items' in classes:
                if current_section is None:
                    # Guard: articles may precede the first section header;
                    # the original crashed on current_section[1] here.
                    current_section = ('Internazionale', [])
                for article in container.findAll('div', {'class': 'abstract-article'}):
                    current_section[1].append(self.extract_article(article))
                continue

            if 'abstract-article' in classes:
                if current_section is None:
                    current_section = ('Internazionale', [])
                current_section[1].append(self.extract_article(container))
                continue

        if current_section:
            sections.append(current_section)
        return sections

    def preprocess_html(self, soup):
        """Materialize lazy-loaded figure images and flatten figure markup."""
        for node in soup.findAll('figure'):
            img_src = None
            # Prefer the largest available responsive variant.
            image_attributes = [
                'data-media1024',
                'data-media1025',
                'data-media641',
                'data-media321',
                'data-media',
            ]
            for attr in image_attributes:
                if node.has_attr(attr):
                    img_src = node[attr]
                    break
            node.name = 'div'
            if img_src:
                img = soup.new_tag('img', src=img_src)
                node.insert(0, img)
        for node in soup.findAll('figcaption'):
            node.name = 'div'
        return soup

    def get_cover_url(self):
        """Return the cover URL captured during parse_index (may be None)."""
        return self.cover_url
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
#!/usr/bin/env python | ||
from calibre.web.feeds.recipes import BasicNewsRecipe | ||
import uuid | ||
from mechanize import Request | ||
from contextlib import closing | ||
import json | ||
|
||
class Parool(BasicNewsRecipe):
    title = 'Het Parool'
    __author__ = 'Cristi Ghera'
    max_articles_per_feed = 100
    description = 'Het Parool - Vrij, Onverveerd'
    needs_subscription = False
    language = 'nl'
    country = 'NL'
    category = 'news, politics, Netherlands'
    resolve_internal_links = True
    remove_tags_before = dict(id='main-content')
    remove_tags_after = dict(id='main-content')
    remove_tags = [
        dict(attrs={'class': [
            'article-footer__sharing',
            'artstyle__editorial-tips',
            'artstyle__advertisement',
            'artstyle__container__icon',
            'artstyle__disabled-embed',
            'container__title__icon',
        ]}),
        dict(attrs={'data-element-id': ['article-element-authors']}),
        dict(name=['script', 'noscript', 'style']),
    ]
    remove_attributes = ['class', 'id', 'name', 'style']
    encoding = 'utf-8'
    no_stylesheets = True
    ignore_duplicate_articles = {'url'}

    def parse_index(self):
        """Scrape today's edition overview and return [(section, articles)].

        The privacy-wall URL with a random auth id bypasses the consent
        interstitial and redirects to the edition page.
        """
        soup = self.index_to_soup(
            'https://www.parool.nl/privacy-wall/accept?redirectUri=%2Feditie%2Fvandaag%2F&authId=' + str(uuid.uuid4())
        )
        containers = soup.findAll('section', attrs={'class': 'section--horizontal'})
        sections = []
        for container in containers:
            section_title = self.tag_to_string(container.find('h2')).strip()
            articles = []

            for art in container.findAll('article'):
                a = art.find('a')
                url = a['href']
                # startswith avoids an IndexError on an empty href
                if url.startswith('/'):
                    url = 'https://www.parool.nl' + url
                # Only articles from the paper edition are wanted.
                if '/editie/' not in url:
                    continue
                header = a.find('header')
                teaser_label = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__label'})).strip()
                teaser_sublabel = self.tag_to_string(header.find('h4').find('span', attrs={'class': 'teaser__sublabel'})).strip()
                teaser_title = self.tag_to_string(header.find('h3').find('span', attrs={'class': 'teaser__title__value--short'})).strip()
                # Comic strips are skipped here and trimmed again in
                # preprocess_html for articles reached via other paths.
                ignore = {'dirkjan', 's1ngle', 'pukkels', 'hein de kort'}
                if teaser_label.lower() in ignore:
                    continue
                parts = []
                if teaser_label:
                    parts.append(teaser_label.upper())
                if teaser_sublabel:
                    parts.append(teaser_sublabel)
                if teaser_title:
                    parts.append(teaser_title)
                article_title = ' \u2022 '.join(parts)
                articles.append(dict(title=article_title,
                                     url=url,
                                     content=''))

            sections.append((section_title, articles))
        return sections

    def preprocess_html(self, soup):
        """Absolutize image URLs, unwrap <picture>, and trim comic pages."""
        for tag in soup.findAll('img'):
            # .get() guards against <img> tags without a src attribute,
            # which raised KeyError in the original.
            src = tag.get('src')
            if src and src.startswith('/'):
                tag['src'] = 'https://www.parool.nl' + src
        for tag in soup.findAll('picture'):
            tag.replaceWith(tag.find('img'))
        # Comic pages list every past strip; keep only the first figure.
        comic_articles = {
            'Alle strips van Dirkjan',
            'S1NGLE',
            'Pukkels',
            'Bekijk hier alle cartoons van Hein de Kort',
        }
        if self.tag_to_string(soup.find('h1')).strip() in comic_articles:
            for node in soup.find('figure').find_next_siblings():
                node.extract()
        return soup

    def get_cover_url(self):
        """Fetch the cover image URL from the e-pages folder API."""
        headers = {
            'X-Requested-With': 'XMLHttpRequest',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'DNT': '1',
        }
        url = 'https://login-api.e-pages.dk/v1/krant.parool.nl/folders'
        with closing(self.browser.open(Request(url, None, headers))) as r:
            folders = json.loads(r.read())
            # The newest folder's medium teaser image is the cover.
            return folders['objects'][0]['teaser_medium']
        # (dead `return None` after the with-block removed: unreachable)
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.