From d46cbd6436d4d5556af4529438309b2a4cf242ce Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Thu, 21 Dec 2023 12:25:08 +1100 Subject: [PATCH 1/2] SDAAP-102 Modify Apple formatter and transmitter --- .../formatters/aap_apple_news_formatter.py | 842 ++++++++---------- .../aap_apple_news_formatter_tests.py | 759 ++++++++-------- .../transmitters/http_push_apple_news.py | 33 +- .../transmitters/http_push_apple_news_test.py | 17 +- 4 files changed, 800 insertions(+), 851 deletions(-) diff --git a/server/aap/publish/formatters/aap_apple_news_formatter.py b/server/aap/publish/formatters/aap_apple_news_formatter.py index 261337da9..f9d8e846c 100644 --- a/server/aap/publish/formatters/aap_apple_news_formatter.py +++ b/server/aap/publish/formatters/aap_apple_news_formatter.py @@ -10,33 +10,33 @@ import logging import json -import re from datetime import datetime +from pytz import timezone +from superdesk.utc import get_date from copy import deepcopy -from eve.utils import ParsedRequest, config +from eve.utils import config +import lxml.html as lxml_html +from draftjs_exporter.dom import DOM +from textwrap import dedent +from urllib.parse import urlparse, unquote from superdesk.publish.formatters import Formatter -from superdesk.metadata.item import FORMAT, FORMATS, ITEM_STATE, CONTENT_STATE +from superdesk.metadata.item import FORMAT, FORMATS from superdesk import get_resource_service from superdesk.utils import json_serialize_datetime_objectId from superdesk.utc import utc_to_local -from superdesk.etree import parse_html, to_string from superdesk.text_utils import get_text -from aap.text_utils import format_text_content -from aap.utils import is_fact_check +from superdesk.editor_utils import get_content_state_fields, Editor3Content, DraftJSHTMLExporter, render_fragment from aap.errors import AppleNewsError - logger = logging.getLogger(__name__) class AAPAppleNewsFormatter(Formatter): - name = 'AAP Apple News' type = 'AAP Apple News' APPLE_NEWS_VERSION = '1.8' - URL_REGEX = re.compile(r'(?:(?:https|http)://)[\w/\-?=%.]+\.[\w/\-?=%#@.\+:]+', re.IGNORECASE) def __init__(self): self.format_type = 'AAP Apple News' @@ -52,44 +52,123 @@ def format(self, article, subscriber, codes=None): except Exception as ex: raise AppleNewsError.AppleNewsFormatter(exception=ex) - def _format(self, article): - apple_news = {} - self._parse_content(article) - if not article.get('_title') or not article.get('_analysis_first_line') or not article.get('_analysis') \ - or not article.get('_statement') or not article.get('_statement_attribution') or \ - not article.get('_verdict1') or not article.get('_verdict2') or not article.get('_references'): - missing_fields = { - 'title': True if article.get('_title') else False, - 'subtitle': True if article.get('_analysis_first_line') else False, - 'analysis': True if article.get('_analysis') else False, - 'statement': True if article.get('_statement') else False, - 'statement_attribution': True if article.get('_statement_attribution') else False, - 'verdict1': True if article.get('_verdict1') else False, - 'verdict2': True if article.get('_verdict2') else False, - 'references': True if article.get('_references') else False, - } + def _filter_blocks(self, item, field, bfilter, remove): + """ + Function to filter the embed blocks for video and audio and also will regenerate the html in a more friendly + form using the AppleExporter class + :param item: The article + :param field: the field to operate on + :param bfilter: Filter function to determine if the block is to be kept + :param remove: list of keys to remove + :return: + """ + editor = Editor3Content(item, field, True) + exporter = AppleExporter(editor) + editor.html_exporter = exporter + blocks = [] + for block in editor.blocks: + if bfilter(block, remove): + blocks.append(block) + editor.set_blocks(blocks) + editor.update_item() + + def _not_embed(self, block, remove): + if block.type.lower() == "atomic": + bk = [e.key for e in block.entities if e.key in remove] + if bk: + return False + return True + + def _remove_embeds(self, article, remove_keys): + """ + Removes the nominated embeds from the draftjs state and regenerates the HTML. + :param article: + :param remove_keys + :return: + """ + to_remove = [k.lstrip("editor_") for k in remove_keys] + fields = get_content_state_fields(article) + for field in fields: + self._filter_blocks(article, field, self._not_embed, to_remove) + + for key in remove_keys: + article.get("associations", {}).pop(key, None) + if "refs" in article: + article["refs"] = [r for r in article.get("refs", []) if r["key"] != key] + + def _remove_unwanted_embeds(self, article): + """ + Removes all embeds that are not images/pictures + :param article: + :return: + """ + remove_keys = [] + + # can only handle pictures at the moment + for key, item in article.get("associations", {}).items(): + if key.startswith("editor_") and item.get("type") != 'picture': + remove_keys.append(key) + + self._remove_embeds(article, remove_keys) + + def format_dateline(self, located, current_timestamp): + """ + Formats dateline to "Location, Month Date Source -" + + :return: formatted dateline string + """ - logger.warning('Failed to parse title for item: {}. ' - 'missing fields: {}'.format(article.get('item_id'), missing_fields)) + dateline_location = "{city_code}" + dateline_location_format_fields = located.get("dateline", "city") + dateline_location_format_fields = dateline_location_format_fields.split(",") + if "country" in dateline_location_format_fields and "state" in dateline_location_format_fields: + dateline_location = "{city_code}, {state_code}, {country_code}" + elif "state" in dateline_location_format_fields: + dateline_location = "{city_code}, {state_code}" + elif "country" in dateline_location_format_fields: + dateline_location = "{city_code}, {country_code}" + dateline_location = dateline_location.format(**located) + + if located.get("tz") and located["tz"] != "UTC": + current_timestamp = datetime.fromtimestamp(current_timestamp.timestamp(), tz=timezone(located["tz"])) + else: + current_timestamp = utc_to_local(config.DEFAULT_TIMEZONE, current_timestamp) + if current_timestamp.month == 9: + formatted_date = "Sept {}".format(current_timestamp.strftime("%-d")) + elif 3 <= current_timestamp.month <= 7: + formatted_date = current_timestamp.strftime("%B %-d") + else: + formatted_date = current_timestamp.strftime("%b %-d") + + return "{location}, {mmmdd} at {hhmmpa}".format( + location=dateline_location.upper(), mmmdd=formatted_date, hhmmpa=current_timestamp.strftime('%I:%M%p') + ) - raise Exception('Cannot format the article for Apple News. ' - 'Failed to parse the item: {}.'.format(article.get('item_id'))) + def _format(self, article): + # Remove any video or audio embeds since for apple news they must be externally hosted + self._remove_unwanted_embeds(article) + + apple_news = {} self._set_article_document(apple_news, article) + + # Set the associations for the transmitter to be able to get the binaries + apple_news['associations'] = article.get('associations', {}) return apple_news def can_format(self, format_type, article): """Can format text article that are not preformatted""" - return format_type == self.format_type and is_fact_check(article) \ - and article.get(FORMAT) == FORMATS.HTML + return format_type == self.format_type and article.get(FORMAT) == FORMATS.HTML def _set_advertising_settings(self, apple_news): - """Function to set the adversiting settings""" - apple_news['advertisingSettings'] = { - 'frequency': 5, - 'layout': { - 'margin': { - 'bottom': 15, - 'top': 15 + """Function to set the advertising settings""" + apple_news['autoplacement'] = { + "advertisement": { + "enabled": True, + "bannerType": "any", + "distanceFromMedia": "10vh", + "frequency": 10, + "layout": { + "margin": 10 } } } @@ -100,7 +179,7 @@ def _is_featuremedia_exists(self, article): def _set_language(self, apple_news, article): """Set language""" - apple_news['language'] = article.get('language') or 'en' + apple_news['language'] = 'en-AU' if article.get('language') == 'en' else article.get('language', 'en-AU') def _set_document_style(self, apple_news): """Set document style""" @@ -111,9 +190,8 @@ def _set_article_document(self, apple_news, article): self._set_language(apple_news, article) self._set_metadata(apple_news, article) apple_news['identifier'] = article['item_id'] - apple_news['title'] = article.get('_title') + apple_news['title'] = article.get('headline') apple_news['version'] = self.APPLE_NEWS_VERSION - apple_news['subtitle'] = article.get('_analysis_first_line') self._set_layout(apple_news) self._set_advertising_settings(apple_news) self._set_component_layouts(apple_news) @@ -126,13 +204,19 @@ def _set_metadata(self, apple_news, article): 'dateCreated': self._format_datetime(article.get('firstcreated')), 'datePublished': self._format_datetime(article.get('firstpublished')), 'dateModified': self._format_datetime(article.get('versioncreated')), - 'excerpt': article.get('_title') + 'excerpt': get_text(article.get('abstract', ''), content='html').strip() } + if article.get('byline'): + apple_news['metadata']['authors'] = [article.get('byline')] if self._is_featuremedia_exists(article): - apple_news['metadata']['thumbnailURL'] = 'bundle://header.jpg' + apple_news['metadata']['thumbnailURL'] = 'bundle://featuremedia' - def _format_datetime(self, article_date, date_format='%Y-%m-%dT%H:%M:%S%z'): - return datetime.strftime(utc_to_local(config.DEFAULT_TIMEZONE, article_date), date_format) + def _format_datetime(self, article_date, date_format=None): + if date_format is None: + aware_dt = article_date.astimezone() + return aware_dt.isoformat(timespec='seconds') + else: + return datetime.strftime(utc_to_local(config.DEFAULT_TIMEZONE, article_date), date_format) def _set_layout(self, apple_news): """Set Layout""" @@ -146,71 +230,60 @@ def _set_layout(self, apple_news): def _set_component_layouts(self, apple_news): apple_news['componentLayouts'] = { "bodyLayout": { - "columnSpan": 6, + "columnSpan": 7, "columnStart": 0, "margin": { "bottom": 15, "top": 15 } }, - "claimTagLayout": { - "columnSpan": 7, - "columnStart": 0 - }, "fixed_image_header_container": { "columnSpan": 7, "columnStart": 0, "ignoreDocumentMargin": True, "minimumHeight": "45vh" }, - "fixed_image_header_section": { - "ignoreDocumentMargin": True, - "margin": { - "bottom": 0, - "top": 40 - } - }, - "header-top-spacer": { - "minimumHeight": 30 - }, - "statementAttributionLayout": { - "margin": { - "bottom": 10 - } - }, - "statementLayout": { - "contentInset": True, + "titleLayout": { + "horizontalContentAlignment": "center", + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 10, - "top": 10 + "bottom": 5, + "top": 5 } }, - "subHeaderLayout": { + "captionLayout": { "horizontalContentAlignment": "left", + "columnSpan": 7, + "columnStart": 0, "margin": { - "bottom": 10, - "top": 15 + "bottom": 5, + "top": 5 } }, - "titleLayout": { - "columnSpan": 7, - "columnStart": 0, + "BodyCaptionLayout": { + "horizontalContentAlignment": "left", + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 15, + "bottom": 5, "top": 5 } }, - "verdictContainerLayout": { - "contentInset": True, - "ignoreDocumentMargin": True, + "bylineLayout": { + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 15, + "bottom": 2, "top": 5 } }, - "verdictLayout": { + "dateLineLayout": { + "columnSpan": 5, + "columnStart": 1, "margin": { - "bottom": 20 + "bottom": 5, + "top": 2 } } } @@ -223,7 +296,7 @@ def _set_component_styles(self, apple_news): } apple_news['componentTextStyles'] = { "bodyStyle": { - "fontName": "Merriweather-Regular", + "fontName": "HelveticaNeue", "fontSize": 16, "lineHeight": 26, "linkStyle": { @@ -235,55 +308,32 @@ def _set_component_styles(self, apple_news): "textAlignment": "left", "textColor": "#000" }, - "claimTagStyle": { - "fontName": "Merriweather-Bold", + "bylineStyle": { + "fontName": "HelveticaNeue-Bold", "fontSize": 18, - "lineHeight": 17, - "textAlignment": "left", - "textColor": "#FFF", - "textShadow": { - "color": "#000", - "offset": { - "x": 1, - "y": 1 - }, - "opacity": 0.5, - "radius": 2 - } - }, - "statementAttributionStyle": { - "fontName": "Merriweather-Italic", - "fontSize": 14, - "hyphenation": False, - "lineHeight": 22, - "textAlignment": "right", + "lineHeight": 18, + "textAlignment": "center", "textColor": "#000" }, - "statementStyle": { - "fontName": "Merriweather-BoldItalic", + "dateLineStyle": { + "fontName": "HelveticaNeue-Bold", "fontSize": 18, - "hyphenation": False, - "lineHeight": 26, - "textColor": "#FFF" - }, - "subHeaderStyle": { - "fontName": "FiraSans-Bold", - "fontSize": 30, - "hyphenation": False, - "lineHeight": 40, - "textColor": "#063c7f" + "lineHeight": 18, + "textAlignment": "center", + "textColor": "#000" }, "titleStyle": { - "fontName": "Merriweather-Black", + "fontName": "HelveticaNeue-CondensedBlack", "fontSize": 40, "lineHeight": 50, - "textAlignment": "left", - "textColor": "#FFF" + "textAlignment": "center", + "textColor": "#000" }, - "verdictStyle": { - "fontName": "Merriweather-Regular", - "fontSize": 18, - "lineHeight": 26, + "captionStyle": { + "fontName": "HelveticaNeue-Italic", + "fontSize": 12, + "hyphenation": False, + "lineHeight": 15, "textAlignment": "left", "textColor": "#000" } @@ -291,400 +341,224 @@ def _set_component_styles(self, apple_news): def _set_component(self, apple_news, article): components = [] + components.extend(self._set_header_component(article)) + components.extend(self._set_story_component(article)) apple_news['components'] = components - components.append(self._set_header_component(article)) - components.extend(self._set_statement_component(article)) - components.append({ - 'layout': { - 'horizontalContentAlignment': 'right', - 'margin': { - 'bottom': 5 - }, - 'maximumContentWidth': 180 - }, - 'role': 'divider', - 'stroke': { - 'color': '#063c7f', - 'style': 'dashed', - 'width': 1 - } - }) - components.extend(self._set_verdict_component(article, '_verdict1')) - components.extend(self._set_analysis_component(article)) - components.extend(self._set_verdict_component(article, '_verdict2')) - components.extend(self._set_references_component(article)) - components.extend(self._set_revision_history_component(article)) def _set_header_component(self, article): - header = { + header = [{ 'behaviour': {'type': 'background_parallax'}, 'layout': 'fixed_image_header_container', 'role': 'container', 'style': { 'fill': { - 'URL': 'bundle://header.jpg', + 'URL': 'bundle://featuremedia', 'type': 'image' } - }, - 'components': [ - { - 'anchor': { - 'originAnchorPosition': 'bottom', - 'targetAnchorPosition': 'bottom' - }, - 'components': [ - { - "layout": "titleLayout", - "role": "title", - "text": article.get('_title'), - "textStyle": "titleStyle" - } - ], - 'layout': 'fixed_image_header_section', - 'role': 'section', - 'style': { - 'fill': { - 'angle': 180, - 'colorStops': [ - {'color': '#00000000'}, - {'color': '#063c7f'} - ], - 'type': 'linear_gradient' - } - } - } - ] + } + }, + { + "layout": "captionLayout", + "role": "caption", + "text": "{} - {}".format( + article.get('associations', {}).get('featuremedia', {}).get('description_text', ''), + article.get('associations', {}).get('featuremedia', {}).get('byline', '')), + "textStyle": 'captionStyle' } + ] if not self._is_featuremedia_exists(article): - header.pop('style', None) + return [] return header - def _set_statement_component(self, article): - """Set the statement component - - :param dict article: + def _add_pieces(self, body, pieces, role, embed_url): """ - if not article.get('_statement'): - return [] - - return [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Statement', - 'textStyle': 'subHeaderStyle' - }, - { - 'layout': 'statementLayout', - 'role': 'body', - 'style': { - 'backgroundColor': '#063c7f' - }, - 'text': article.get('_statement'), - 'textStyle': 'statementStyle' - }, - { - 'layout': 'statementAttributionLayout', - 'role': 'body', - 'text': article.get('_statement_attribution'), - 'textStyle': 'statementAttributionStyle' - } - ] - - def _set_analysis_component(self, article): - """Set the analysis component - - :param dict article: + Adds the content so far to the body content, then adds the embed, and clears the pieces + :param body: the body built so far + :param pieces: the pieces accumulated + :param role: + :param embed_url: + :return: """ - if not article.get('_analysis'): - return [] + body.extend([{ + 'format': 'html', + 'layout': 'bodyLayout', + 'role': 'body', + 'text': ''.join(pieces), + 'textStyle': 'bodyStyle' + }, { + "role": role, + "layout": "bodyLayout", + "URL": embed_url + }]) + pieces.clear() + return + + def generate_article_content(self, article): + + fragments = lxml_html.fragments_fromstring(article.get('body_html', '

')) + par_pieces = [] + body_content = [] + + for elem in fragments: + if elem.tag == 'figure': + key = elem.find('./img').attrib['id'] + body_content.extend([ + { + 'format': 'html', + 'layout': 'bodyLayout', + 'role': 'body', + 'text': ''.join(par_pieces), + 'textStyle': 'bodyStyle' + }, + { + 'role': 'figure', + 'URL': 'bundle://{}'.format(key), + 'identifier': key, + 'accessibilityCaption': elem.find('./img').attrib['alt'], + 'caption': elem.find('./figcaption').text, + 'layout': 'bodyLayout' + }, + { + "layout": "BodyCaptionLayout", + "role": "caption", + "text": elem.find('./figcaption').text, + "textStyle": 'captionStyle' + } + ]) + par_pieces.clear() + elif elem.tag == 'div' and 'embed-block' in elem.attrib.get('class', ''): + bq = elem.find('./blockquote') + if bq is not None: + if bq.attrib.get('class') == 'twitter-tweet': + tweet = bq.find('./a').attrib.get('href', '') + if 'twitter' in tweet: + self._add_pieces(body_content, par_pieces, "tweet", tweet) + elif bq.attrib.get('class') == 'instagram-media': + insta_link = bq.attrib.get('data-instgrm-permalink') + if insta_link: + self._add_pieces(body_content, par_pieces, "instagram", insta_link) + elif bq.attrib.get('class') == 'tiktok-embed': + tiktok = bq.attrib.get('cite') + if tiktok: + self._add_pieces(body_content, par_pieces, "tiktok", tiktok) + else: + iframe = elem.find("./iframe") + if iframe is not None: + src = iframe.attrib.get('src') + if src: + url = urlparse(src) + query = unquote(url.query) + if query.startswith('href='): + fburl = query[len('href='):] + self._add_pieces(body_content, par_pieces, 'facebook_post', fburl) + else: + par_pieces.append(render_fragment(elem)) + # Add what is left over + body_content.append({ + 'format': 'html', + 'layout': 'bodyLayout', + 'role': 'body', + 'text': ''.join(par_pieces), + 'textStyle': 'bodyStyle' + }) - return [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Analysis', - 'textStyle': 'subHeaderStyle' - }, - { + if article.get('body_footer', '') != '': + body_content.append({ 'format': 'html', 'layout': 'bodyLayout', 'role': 'body', - 'text': article.get('_analysis'), - 'textStyle': 'bodyStyle' - } - ] + 'text': article.get('body_footer', ''), + 'textStyle': 'bodyStyle'} + ) - def _set_verdict_component(self, article, field_name): - """Set the verdict component + return body_content - :param dict article: - """ - if not article.get(field_name): - return [] + def _set_story_component(self, article): - return [ + article_body = self.generate_article_content(article) + + story_component = [ { - 'components': [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Verdict', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'verdictLayout', - 'role': 'body', - 'text': article.get(field_name), - 'textStyle': 'verdictStyle' + "layout": "titleLayout", + "role": "title", + "text": article.get('headline'), + "textStyle": "titleStyle", + "format": "html" + }, + { + 'role': 'divider', + 'layout': { + 'columnStart': 2, + 'columnSpan': 3, + 'margin': { + 'top': 5, + 'bottom': 5 } - ], - 'layout': 'verdictContainerLayout', - 'role': 'container', - 'animation': { - 'type': 'move_in', - 'preferredStartingPosition': 'left' }, - 'style': { - 'backgroundColor': '#e7ebf1' + 'stroke': { + 'color': '#063c7f', + 'style': 'solid', + 'width': 1 } - } - ] - - def _set_references_component(self, article): - """Set the references component - - :param dict article: - """ - if not article.get('_references'): - return [] - - return [ - { - "layout": "subHeaderLayout", - "role": "heading", - "text": "The References", - "textStyle": "subHeaderStyle" }, { - "format": "html", - "layout": "bodyLayout", - "role": "body", - "text": article.get('_references'), - "textStyle": "bodyStyle" - } - ] - - def _set_revision_history_component(self, article): - """Set the revision history component - - :param dict article: - """ - if not article.get('_revision_history'): - return [] - - return [ - { - "layout": "subHeaderLayout", - "role": "heading", - "text": "Revision History", - "textStyle": "subHeaderStyle" + 'role': 'byline', + 'text': 'By {}'.format(article.get('byline')), + 'layout': 'bylineLayout', + 'textStyle': 'bylineStyle' }, { - "format": "html", - "layout": "bodyLayout", - "role": "body", - "text": article.get('_revision_history'), - "textStyle": "bodyStyle" + 'role': 'byline', + 'text': self.format_dateline(article.get('dateline', {}).get('located'), + get_date(article.get('versioncreated'))), + 'layout': 'dateLineLayout', + 'textStyle': 'dateLineStyle' } ] - - def _parse_content(self, article): - """Parse body_html and mapping to fields required for apple news format - - :param article: - """ - statement_regex = re.compile(r'^The Statement$', re.IGNORECASE) - analysis_regex = re.compile(r'^The Analysis$', re.IGNORECASE) - verdict_regex = re.compile(r'^The Verdict$', re.IGNORECASE) - references_regex = re.compile(r'^The References$', re.IGNORECASE) - abstract = get_text(article.get('abstract'), content='html').strip() - - article['_title'] = abstract - body_html = article.get('body_html') - article['_analysis_first_line'] = '' - article['_analysis'] = '' - article['_statement'] = '' - article['_statement_attribution'] = '' - article['_verdict1'] = '' - article['_verdict2'] = '' - article['_references'] = '' - article['_revision_history'] = '' - - if article.get(ITEM_STATE) == CONTENT_STATE.KILLED or article.get(ITEM_STATE) == CONTENT_STATE.RECALLED: - article['_title'] = 'This article has been removed.' - article['_analysis_first_line'] = 'This article has been removed.' - article['_analysis'] = 'This article has been removed.' - article['_statement'] = 'This article has been removed.' - article['_statement_attribution'] = 'This article has been removed.' - article['_verdict1'] = 'This article has been removed.' - article['_verdict2'] = 'This article has been removed.' - article['_references'] = 'This article has been removed.' - self._set_revision_history(article) - return - - parsed_content = parse_html(body_html, content='html') - statement_found = False - analysis_found = False - analysis_first_line = False - verdict1_found = False - verdict2_found = False - references_found = False - statement_elements = [] - - for top_level_tag in parsed_content.xpath('/div/child::*'): - tag_text = format_text_content(top_level_tag).strip() - if not tag_text: - continue - - if not verdict1_found: - if not statement_found: - match = statement_regex.search(tag_text) - if match: - statement_found = True - continue - else: - # statement found - match = verdict_regex.search(tag_text) - if match: - verdict1_found = True - if len(statement_elements) > 1: - statement_length = len(statement_elements) - 1 - for i in range(statement_length): - article['_statement'] += get_text( - to_string(statement_elements[i], remove_root_div=False), - content='html' - ).strip() - if statement_length > 1 and i != statement_length - 1: - article['_statement'] += '\r\n' - - article['_statement_attribution'] = get_text( - to_string(statement_elements[-1:][0], remove_root_div=False), - content='html' - ).strip() - elif len(statement_elements) == 1: - article['_statement'] = to_string( - statement_elements[0], - remove_root_div=False - ) - continue - - statement_elements.append(top_level_tag) - continue - - if verdict1_found and not analysis_found: - match = analysis_regex.search(tag_text) - if match: - analysis_found = True - else: - article['_verdict1'] += to_string(top_level_tag, remove_root_div=False) - continue - - if analysis_found and not verdict2_found: - if not analysis_first_line: - article['_analysis_first_line'] = tag_text - analysis_first_line = True - - match = verdict_regex.search(tag_text) - if match: - verdict2_found = True - else: - article['_analysis'] += to_string(top_level_tag, remove_root_div=False) - continue - - if verdict2_found and not references_found: - match = references_regex.search(tag_text) - if match: - references_found = True - else: - article['_verdict2'] += to_string(top_level_tag, remove_root_div=False) - continue - - if references_found: - tag_text = re.sub(r'^\d*\s*[.):]?', '', tag_text).strip() - - article['_references'] += '
  • {}
  • '.format( - self._format_url_to_anchor_tag(tag_text) - ) - - if len(article['_references']): - article['_references'] = '
      {}
    '.format(article['_references']) - - if not article.get('_statement') and article.get('_statement_attribution'): - # if statement is not as per the format - article['_statement'] = article.get('_statement_attribution') - article['_statement_attribution'] = '' - - self._set_revision_history(article) - - # append footer to the analysis section - if article.get('_analysis') and article.get('body_footer'): - article['_analysis'] += article.get('body_footer') - - def _format_url_to_anchor_tag(self, tag_text): - def replacement(match_object): - value = match_object.group(0) - if value: - return '{0}'.format(value) - return '' - - return re.sub(self.URL_REGEX, replacement, tag_text) - - def _set_revision_history(self, article): - """Get revision history of published article - - :param dict article: - """ - query = { - 'query': { - 'filtered': { - 'filter': { - 'bool': { - 'must': { - 'term': {'item_id': article.get('item_id')} - } - } - } - } - }, - 'sort': [ - {'versioncreated': {'order': 'asc'}} - ] - } - - req = ParsedRequest() - repos = 'published,archived' - req.args = {'source': json.dumps(query), 'repo': repos, 'aggregations': 0} - revisions = list(get_resource_service('search').get(req=req, lookup=None)) - revisions_tag = [] - - for rev in revisions: - local_date = utc_to_local( - config.DEFAULT_TIMEZONE, - rev.get('firstpublished') if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED - else rev.get('versioncreated') + story_component.extend(article_body) + return story_component + + +class AppleExporter(DraftJSHTMLExporter): + """ + Exporter class that manipulates the html to inject the required src for the images and + also to inject the figcaption + """ + + def render_media(self, props): + embed_key = next( + k for k, v in self.content_state["entityMap"].items() if v["data"].get("media") == props["media"] + ) + media_props = props["media"] + media_type = media_props.get("type", "picture") + + alt_text = media_props.get("alt_text") or "" + desc = "{} - {}".format(media_props.get("description_text"), media_props.get('byline')) + if media_type == "picture": + src = 'bundle:\\editor_{}'.format(embed_key) + + embed_type = "Image" + elt = DOM.create_element( + "img", + {"src": src, "alt": alt_text, "id": "editor_{}".format(embed_key)}, + props["children"], ) - date_string = datetime.strftime(local_date, '%b XXX, %Y %H:%M %Z').replace('XXX', str(local_date.day)) - if rev.get(ITEM_STATE) == CONTENT_STATE.PUBLISHED: - revisions_tag.append('
  • {} {}
  • '.format('First published', date_string)) - else: - revision_markup = '{} {}'.format('Revision published', date_string) - ednote = get_text(rev.get('ednote') or '', content='html').strip() - if rev.get(ITEM_STATE) == CONTENT_STATE.CORRECTED and ednote: - revision_markup += '
    {}'.format(ednote) - revisions_tag.append('
  • {}
  • '.format(revision_markup)) - - article['_revision_history'] = '' .format(''.join(revisions_tag)) if revisions_tag else '' + content = DOM.render(elt) + + if desc: + content += "
    {}
    ".format(desc) + + # is needed for the comments, because a root node is necessary + # it will be removed during rendering. + embed = DOM.parse_html( + dedent( + """\ + +
    {content}
    +
    """ + ).format(embed_type=embed_type, key=embed_key, content=content) + ) + + return embed diff --git a/server/aap/publish/formatters/aap_apple_news_formatter_tests.py b/server/aap/publish/formatters/aap_apple_news_formatter_tests.py index 126678a5e..16ecb10fa 100644 --- a/server/aap/publish/formatters/aap_apple_news_formatter_tests.py +++ b/server/aap/publish/formatters/aap_apple_news_formatter_tests.py @@ -12,25 +12,6 @@ from datetime import datetime from superdesk.tests import TestCase from .aap_apple_news_formatter import AAPAppleNewsFormatter -from unittest.mock import patch, MagicMock - - -def get_data(resource): - service_mock = MagicMock() - service_mock.get = MagicMock() - service_mock.get.return_value = [ - { - 'state': 'published', - 'firstpublished': datetime(year=2018, month=2, day=15, hour=12, minute=30, second=0, tzinfo=pytz.UTC), - 'item_id': '1' - }, - { - 'state': 'corrected', - 'versioncreated': datetime(year=2018, month=2, day=15, hour=13, minute=45, second=0, tzinfo=pytz.UTC), - 'item_id': '1' - } - ] - return service_mock class AAPAppleNewsFormatterTest(TestCase): @@ -43,13 +24,23 @@ def setUp(self): def _get_article(self): return { 'type': 'text', - 'genre': [{'qcode': 'Fact Check'}], + 'genre': [{'qcode': 'Article'}], 'format': 'HTML', 'item_id': '1', 'firstcreated': datetime(year=2018, month=2, day=15, hour=11, minute=30, second=0, tzinfo=pytz.UTC), 'firstpublished': datetime(year=2018, month=2, day=15, hour=12, minute=30, second=0, tzinfo=pytz.UTC), 'versioncreated': datetime(year=2018, month=2, day=15, hour=13, minute=45, second=0, tzinfo=pytz.UTC), 'abstract': 'This is abstract', + 'headline': 'Headline of the story', + 'byline': 'John Doe', + 'dateline': { + "source": "AAP", + "located": { + "city": "Sydney", + "dateline": "city", + "city_code": "Sydney", + } + }, 'body_html': '

    The Statement

    ' '

    This is statement first line

    ' '

    This is statement second line

    ' @@ -73,19 +64,8 @@ def _get_article(self): } - def test_can_format_fact_check(self): + def test_can_format_check(self): self.assertTrue( - self.formatter.can_format( - self.formatter.format_type, - { - 'type': 'text', - 'genre': [{'qcode': 'Fact Check'}], - 'format': 'HTML' - } - ) - ) - - self.assertFalse( self.formatter.can_format( self.formatter.format_type, { @@ -96,330 +76,413 @@ def test_can_format_fact_check(self): ) ) - def test_parse_statement(self): - article = self._get_article() - self.formatter._parse_content(article) - self.assertEqual(article.get('_statement'), 'This is statement first line') - self.assertEqual(article.get('_statement_attribution'), 'This is statement second line') - self.assertEqual( - article.get('_analysis'), - '

    This is analysis first line

    ' - '

    This is analysis second line

    ' - ) - self.assertEqual( - article.get('_verdict1'), - '

    This is verdict 1 first line

    ' - '

    This is verdict 1 second line

    ' - ) - - self.assertEqual( - article.get('_verdict2'), - '

    This is verdict 2 first line

    ' - '

    This is verdict 2 second line

    ' - ) - - self.assertEqual( - article.get('_references'), - '
    1. This is references http://test.com
    2. ' - '
    3. This is references second line
    ' - ) - self.assertEqual(article.get('_revision_history'), '') - - @patch('aap.publish.formatters.aap_apple_news_formatter.get_resource_service', get_data) - def test_revision_history(self): - article = self._get_article() - self.formatter._set_revision_history(article) - self.assertEqual( - article.get('_revision_history'), - '' - ) - - def test_format_article_raises_exception_if_abstract_missing(self): - article = self._get_article() - article['abstract'] = '' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) - - def test_format_article_raises_exception_if_statement_missing(self): - article = self._get_article() - article['body_html'] = '

    The Statement

    '\ - '

    This is statement first line

    ' \ - '

    ' \ - '

    The Verdict

    ' \ - '

    This is verdict first line

    ' \ - '

    This is verdict second line

    ' \ - '

    ' \ - '

    The Analysis

    '\ - '

    This is analysis first line

    '\ - '

    This is analysis second line

    '\ - '

    '\ - '

    The Verdict

    '\ - '

    This is verdict first line

    '\ - '

    This is verdict second line

    '\ - '

    '\ - '

    The References

    '\ - '

    1. This is references http://test.com

    '\ - '

    2. This is references second line

    '\ - '

    ' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) - - def test_format_article_raises_exception_if_analysis_missing(self): + def test_format_title(self): article = self._get_article() - article['body_html'] = '

    The Statement

    '\ - '

    This is statement first line

    '\ - '

    '\ - '

    The Verdict

    '\ - '

    This is verdict first line

    '\ - '

    This is verdict second line

    '\ - '

    '\ - '

    The References

    '\ - '

    1. This is references http://test.com

    '\ - '

    2. This is references second line

    '\ - '

    ' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) + apple_news = self.formatter._format(article) + self.assertEqual(apple_news.get('identifier'), '1') + self.assertEqual(apple_news.get('title'), 'Headline of the story') + self.assertEqual(apple_news.get('components'), [{"layout": "titleLayout", + "role": "title", "text": "Headline of the story", + "textStyle": "titleStyle", + "format": "html"}, + {"role": "divider", + "layout": {"columnStart": 2, "columnSpan": 3, + "margin": {"top": 5, "bottom": 5}}, + "stroke": {"color": "#063c7f", "style": "solid", "width": 1}}, + {"role": "byline", "text": "By John Doe", + "layout": "bylineLayout", + "textStyle": "bylineStyle"}, + {"role": "byline", "text": "SYDNEY, Feb 16 at 12:45AM", + "layout": "dateLineLayout", "textStyle": "dateLineStyle"}, + {"format": "html", "layout": "bodyLayout", "role": "body", + "text": "

    The Statement

    " + "

    This is statement first line

    " + "

    This is statement second line

    " + "


    The Verdict

    " + "

    This is verdict 1 first line

    " + "

    This is verdict 1 second line

    " + "


    The Analysis

    " + "

    This is analysis first line

    " + "

    This is analysis second line

    " + "


    The Verdict

    " + "

    This is verdict 2 first line

    " + "

    This is verdict 2 second line

    " + "


    The References

    " + "

    1. This is references http://test.com

    " + "

    2. This is references second line


    ", + "textStyle": "bodyStyle"}]) - def test_format_article_raises_exception_if_verdict_missing(self): + def test_format_article_with_embeds(self): article = self._get_article() - article['body_html'] = '

    The Statement

    '\ - '

    This is statement first line

    ' \ - '

    This is statement second line

    ' \ - '

    '\ - '

    The Analysis

    '\ - '

    This is analysis first line

    '\ - '

    This is analysis second line

    '\ - '

    '\ - '

    The References

    '\ - '

    1. This is references http://test.com

    '\ - '

    2. This is references second line

    '\ - '

    ' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) + article['associations'] = {'featuremedia': {'description_text': 'Protesters participate in a Halloween themed ' + 'Extinction Rebellion rally in Sydney, ' + 'Thursday, October 31, 2019.'}, + 'editor_0': {'type': 'video'}, + 'editor_1': {'type': 'picture'}} + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "f8mk1", + "text": "First paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "97qeo", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "bu6bt", + "text": "Second paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "66lpo", + "text": "Third paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "4sgtb", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 1 + } + ], + "data": {} + }, + { + "key": "9n4jj", + "text": "Fourth paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "1trdb", + "text": "Fifth paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "2jrhi", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 2 + } + ], + "data": {} + }, + { + "key": "d51og", + "text": "Sixth paragraph", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "MEDIA", + "mutability": "MUTABLE", + "data": { + } + }, + "1": { + "type": "MEDIA", + "mutability": "MUTABLE", + "data": { + "media": { + "headline": "POLESTAR ELECTRIC VEHICLE", + "alt_text": "Alt Text", + "description_text": "Description text or caption", + "source": "PR Handout Image", + "byline": "PR Handout Image/POLESTAR", + "type": "picture", + "format": "HTML", + } + } + }, + "2": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": "

    " + ""This is actually my first time to ever enter a competition" + "."

    Photographer Jialing Cai went diving in the dark to " + "capture her award-winning image of a female paper nautilus, a type " + "of octopus that can grow its own shell.

    Via " + "" + "@liz: " + "https://t.co/u1rGHr1heD " + "pic.twitter.com/SIBTwJfisP

    — " + "Australian Associated Press (AAP) (@AAPNewswire) " + "" + "November 16, 2023
    " + } + } + } + } + } + ] + }} + apple_news = self.formatter._format(article) + self.assertEqual(apple_news['components'][7]['URL'], 'bundle://editor_1') + self.assertEqual(apple_news['components'][0]['style']['fill']['URL'], 'bundle://featuremedia') + self.assertEqual(apple_news['components'][10]['URL'], 'https://twitter.com/AAPNewswire/status/1') - def test_format_article_raises_exception_if_references_missing(self): + def test_format_article_with_instagram(self): article = self._get_article() - article['body_html'] = '

    The Statement

    '\ - '

    This is statement first line

    ' \ - '

    This is statement second line

    ' \ - '

    '\ - '

    The Analysis

    '\ - '

    This is analysis first line

    '\ - '

    This is analysis second line

    '\ - '

    '\ - '

    The Verdict

    '\ - '

    This is verdict first line

    '\ - '

    This is verdict second line

    '\ - '

    ' - with self.assertRaises(Exception) as ex_context: - self.formatter._format(article) - self.assertIn('Cannot format the article for Apple News', ex_context.exception) + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "bkf9p", + "text": "instagram", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "ed90t", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "30a8e", + "text": "", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": "
    " + }, + "description": "Test Instagram post" + } + } + } + } + ] + } + } + apple_news = self.formatter._format(article) + self.assertEqual(apple_news['components'][5]['URL'], "https://www.instagram.com/reel/C") - def test_format_title(self): + def test_format_article_with_facebook(self): article = self._get_article() + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "tqgt", + "text": "Facebook post", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "b0nn5", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "1loq9", + "text": "Following text", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": "" + }, + "description": "Embed description" + } + } + } + } + ] + } + } apple_news = self.formatter._format(article) - self.assertEqual(apple_news.get('identifier'), '1') - self.assertEqual(apple_news.get('title'), 'This is abstract') - self.assertEqual(apple_news.get('subtitle'), 'This is analysis first line') - self.assertEqual(apple_news.get('components'), - [ - { - 'behaviour': { - 'type': 'background_parallax' - }, - 'components': [{ - 'anchor': { - 'originAnchorPosition': 'bottom', - 'targetAnchorPosition': 'bottom' - }, - 'components': [{ - 'layout': 'titleLayout', - 'role': 'title', - 'text': 'This is abstract', - 'textStyle': 'titleStyle' - }], - 'layout': 'fixed_image_header_section', - 'role': 'section', - 'style': { - 'fill': { - 'angle': 180, - 'colorStops': [ - {'color': '#00000000'}, - {'color': '#063c7f'} - ], - 'type': 'linear_gradient' - } - } - }], - 'layout': 'fixed_image_header_container', - 'role': 'container' - }, - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Statement', - 'textStyle': 'subHeaderStyle' - }, - { - 'layout': 'statementLayout', - 'role': 'body', - 'style': { - 'backgroundColor': '#063c7f' - }, - 'text': 'This is statement first line', - 'textStyle': 'statementStyle' - }, - { - 'layout': 'statementAttributionLayout', - 'role': 'body', - 'text': 'This is statement second line', - 'textStyle': 'statementAttributionStyle' - }, - { - 'layout': { - 'horizontalContentAlignment': 'right', - 'margin': { - 'bottom': 5 - }, - 'maximumContentWidth': 180 - }, - 'role': 'divider', - 'stroke': { - 'color': '#063c7f', - 'style': 'dashed', - 'width': 1 - } - }, - { - 'animation': { - 'preferredStartingPosition': 'left', - 'type': 'move_in' - }, - 'components': [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Verdict', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'verdictLayout', - 'role': 'body', - 'text': '

    This is verdict 1 first line

    ' - '

    This is verdict 1 second line

    ', - 'textStyle': 'verdictStyle' - } - ], - 'layout': 'verdictContainerLayout', - 'role': 'container', - 'style': { - 'backgroundColor': '#e7ebf1' - } - }, - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Analysis', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'bodyLayout', - 'role': 'body', - 'text': '

    This is analysis first line

    ' - '

    This is analysis second line

    ', - 'textStyle': 'bodyStyle' - }, - { - 'animation': { - 'preferredStartingPosition': 'left', - 'type': 'move_in' - }, - 'components': [ - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The Verdict', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'verdictLayout', - 'role': 'body', - 'text': '

    This is verdict 2 first line

    ' - '

    This is verdict 2 second line

    ', - 'textStyle': 'verdictStyle' - } - ], - 'layout': 'verdictContainerLayout', - 'role': 'container', - 'style': { - 'backgroundColor': '#e7ebf1' - } - }, - { - 'layout': 'subHeaderLayout', - 'role': 'heading', - 'text': 'The References', - 'textStyle': 'subHeaderStyle' - }, - { - 'format': 'html', - 'layout': 'bodyLayout', - 'role': 'body', - 'text': '
    1. This is references http://test.com' - '
    2. This is references second line
    ', - 'textStyle': 'bodyStyle' - }] - ) + self.assertEqual(apple_news['components'][5]['URL'], 'https://www.facebook.com/aapnewswire/posts/pfbid') - def test_format_killed_article(self): + def test_format_article_with_tik_tok(self): article = self._get_article() - article['state'] = 'killed' + article['fields_meta'] = { + "body_html": { + "draftjsState": [ + { + "blocks": [ + { + "key": "36ias", + "text": "Tcik Tock Test", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": { + "MULTIPLE_HIGHLIGHTS": {} + } + }, + { + "key": "cshfs", + "text": " ", + "type": "atomic", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [ + { + "offset": 0, + "length": 1, + "key": 0 + } + ], + "data": {} + }, + { + "key": "7n6o7", + "text": "Following text", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + }, + { + "key": "bsq3s", + "text": "", + "type": "unstyled", + "depth": 0, + "inlineStyleRanges": [], + "entityRanges": [], + "data": {} + } + ], + "entityMap": { + "0": { + "type": "EMBED", + "mutability": "MUTABLE", + "data": { + "data": { + "html": "
    " + }, + "description": "Tik Toc Test description" + } + } + } + } + ] + } + } apple_news = self.formatter._format(article) - self.assertEqual(apple_news.get('title'), 'This article has been removed.') - self.assertEqual(apple_news.get('subtitle'), 'This article has been removed.') - - def test_format_url_to_anchor_tag(self): - inputs = [ - 'this is http://test.com', - 'this is second line', - 'Australian politics live with Amy Remeikis, by Amy Remeikis.' - 'The Guardian.May 3, 2019: https://www.theguardian.com/australia-news/live/2019/' - 'may/03/federal-election-2019-liberals-to-dump-another-candidate-politics-live?' - 'page=with:block-5ccb88c18f086f179813a12b', - '7121.0 - Agricultural Commodities Australia 2017 - 18. Australian Bureau of Statistics:' - ' https://www.abs.gov.au/AUSSTATS/abs@.nsf/Lookup/7121.0Main+Features12017-18?OpenDocument', - 'Explanatory Notes.7121.0 - Agricultural Commodities, Australia, 2017 - 18. Australian' - ' Bureau of Statistics: https://www.abs.gov.au/AUSSTATS/abs@.nsf/Lookup/7121.0' - 'Explanatory%20Notes12017-18?OpenDocument' - ] - - outputs = [ - 'this is http://test.com', - 'this is second line', - 'Australian politics live with Amy Remeikis, by Amy Remeikis.' - 'The Guardian.May 3, 2019: https://www.theguardian.com/australia-news/live/2019/' - 'may/03/federal-election-2019-liberals-to-dump-another-candidate-politics-live?' - 'page=with:block-5ccb88c18f086f179813a12b', - '7121.0 - Agricultural Commodities Australia 2017 - 18. Australian Bureau of Statistics:' - ' ' - 'https://www.abs.gov.au/AUSSTATS/abs@.nsf/Lookup/7121.0Main+Features12017-18?OpenDocument', - 'Explanatory Notes.7121.0 - Agricultural Commodities, Australia, 2017 - 18. Australian' - ' Bureau of Statistics: https://www.abs.gov.au/AUSSTATS/abs@.nsf/Lookup/7121.0' - 'Explanatory%20Notes12017-18?OpenDocument' - - ] - - for i in range(len(inputs)): - text = self.formatter._format_url_to_anchor_tag(inputs[i]) - self.assertEqual(outputs[i], text) + self.assertEqual(apple_news['components'][5]['URL'], 'https://www.tiktok.com/@dic/video/7') diff --git a/server/aap/publish/transmitters/http_push_apple_news.py b/server/aap/publish/transmitters/http_push_apple_news.py index f3a38b863..3ab1f5b59 100644 --- a/server/aap/publish/transmitters/http_push_apple_news.py +++ b/server/aap/publish/transmitters/http_push_apple_news.py @@ -59,11 +59,22 @@ def _get_item(self, queue_item): _current_version=queue_item.get('item_version') ) + def _get_original_guid(self, item): + guid = item.get('rewrite_of', item.get('guid', item.get('item_id', None))) + for i in range(item.get('rewrite_sequence', 1)): + prev = get_resource_service('archive').find_one( + req=None, _id=guid + ) + if not prev or not prev.get('rewrite_of'): + break + guid = prev['rewrite_of'] + return guid + def _push_item(self, queue_item): data = json.loads(queue_item['formatted_item']) + associations = data.pop('associations', None) destination = queue_item.get('destination', {}) item = self._get_item(queue_item) - service = get_resource_service('subscriber_transmit_references') if not item: raise Exception('Could not find the item to publish.') @@ -71,8 +82,11 @@ def _push_item(self, queue_item): channel = self._get_channel(destination) current_date = datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ") method = 'POST' if item.get(ITEM_STATE) not in {CONTENT_STATE.RECALLED, CONTENT_STATE.KILLED} else 'DELETE' + + original_id = self._get_original_guid(item) + service = get_resource_service('subscriber_transmit_references') subscriber_reference = service.get_subscriber_reference( - item.get('item_id'), + original_id, queue_item.get('subscriber_id') ) metadata = {} @@ -98,9 +112,12 @@ def _push_item(self, queue_item): payload = json.dumps(data) parts.append(self._part('article.json', payload, len(payload), 'application/json')) - binary = self._get_media(self._get_header_image_rendition(destination), item) - if binary: - parts.append(self._part('header.jpg', binary.read(), binary.length, 'image/jpeg')) + for association, details in (associations or {}).items(): + binary = self._get_media(association, self._get_header_image_rendition(destination), item) + if binary: + parts.append( + self._part(association, binary.read(), binary.length, details.get('mimetype', 'image/jpeg'))) + body, content_type = encode_multipart_formdata(parts) canonical_request = self._get_canonical_request(method, url, current_date, content_type, body) else: @@ -125,7 +142,7 @@ def _push_item(self, queue_item): # response status_code is 204 for delete apple_article = json.loads(response.text) service.insert_update_reference( - item.get('item_id'), + original_id, queue_item.get('subscriber_id'), apple_article, apple_article.get('data').get('id') @@ -149,8 +166,8 @@ def _part(self, name, data, length, content_type): part.headers['Content-Type'] = content_type return part - def _get_media(self, rendition_name, item): - featuremedia = (item.get('associations') or {}).get('featuremedia') + def _get_media(self, name, rendition_name, item): + featuremedia = (item.get('associations') or {}).get(name) if not featuremedia: return None diff --git a/server/aap/publish/transmitters/http_push_apple_news_test.py b/server/aap/publish/transmitters/http_push_apple_news_test.py index 53ce52375..5cc3cd203 100644 --- a/server/aap/publish/transmitters/http_push_apple_news_test.py +++ b/server/aap/publish/transmitters/http_push_apple_news_test.py @@ -110,10 +110,8 @@ def test_transmit_new_item(self): with patch('aap.publish.transmitters.http_push_apple_news.get_resource_service', mocked_service): with HTTMock(self.new_item_response): self.http_push._push_item(queue_item) - find_one.assert_called_once() - find_one.assert_called_with(req=None, - item_id=queue_item.get('item_id'), - _current_version=queue_item.get('item_version')) + find_one.assert_called() + find_one.assert_called_with(req=None, _id='1') get_subscriber_reference.assert_called_once() get_subscriber_reference.assert_called_with('1', 'foo bar') @@ -140,10 +138,9 @@ def test_transmit_existing_item(self): with patch('aap.publish.transmitters.http_push_apple_news.get_resource_service', mocked_service): with HTTMock(self.existing_item_response): self.http_push._push_item(queue_item) - find_one.assert_called_once() + find_one.assert_called() find_one.assert_called_with(req=None, - item_id=queue_item.get('item_id'), - _current_version=queue_item.get('item_version')) + _id='1') get_subscriber_reference.assert_called_once() get_subscriber_reference.assert_called_with('1', 'foo bar') @@ -170,10 +167,8 @@ def test_transmit_delete_item(self): with patch('aap.publish.transmitters.http_push_apple_news.get_resource_service', mocked_service): with HTTMock(self.delete_item_response): self.http_push._push_item(queue_item) - find_one.assert_called_once() - find_one.assert_called_with(req=None, - item_id=queue_item.get('item_id'), - _current_version=queue_item.get('item_version')) + find_one.assert_called() + find_one.assert_called_with(req=None, _id='1') get_subscriber_reference.assert_called_once() get_subscriber_reference.assert_called_with('1', 'foo bar') From 4f5d7404e3403f89a2c858ef697264826e99ae92 Mon Sep 17 00:00:00 2001 From: marwoodandrew Date: Wed, 27 Dec 2023 17:01:26 +1100 Subject: [PATCH 2/2] handle archived kills --- .../formatters/aap_apple_news_formatter.py | 4 +- .../transmitters/http_push_apple_news.py | 14 +++-- server/aap/tests/io/fixtures/dc_response.xml | 62 ------------------- 3 files changed, 12 insertions(+), 68 deletions(-) delete mode 100644 server/aap/tests/io/fixtures/dc_response.xml diff --git a/server/aap/publish/formatters/aap_apple_news_formatter.py b/server/aap/publish/formatters/aap_apple_news_formatter.py index f9d8e846c..d54c02ecd 100644 --- a/server/aap/publish/formatters/aap_apple_news_formatter.py +++ b/server/aap/publish/formatters/aap_apple_news_formatter.py @@ -93,7 +93,7 @@ def _remove_embeds(self, article, remove_keys): for key in remove_keys: article.get("associations", {}).pop(key, None) - if "refs" in article: + if article.get("refs") is not None: article["refs"] = [r for r in article.get("refs", []) if r["key"] != key] def _remove_unwanted_embeds(self, article): @@ -105,7 +105,7 @@ def _remove_unwanted_embeds(self, article): remove_keys = [] # can only handle pictures at the moment - for key, item in article.get("associations", {}).items(): + for key, item in (article.get("associations") or {}).items(): if key.startswith("editor_") and item.get("type") != 'picture': remove_keys.append(key) diff --git a/server/aap/publish/transmitters/http_push_apple_news.py b/server/aap/publish/transmitters/http_push_apple_news.py index 3ab1f5b59..c13cb421c 100644 --- a/server/aap/publish/transmitters/http_push_apple_news.py +++ b/server/aap/publish/transmitters/http_push_apple_news.py @@ -60,11 +60,17 @@ def _get_item(self, queue_item): ) def _get_original_guid(self, item): + """ + Chase up the rewrite list until we get to the orignal in either the archive or legal archive (if the item has + expired from production + :param item: + :return: + """ guid = item.get('rewrite_of', item.get('guid', item.get('item_id', None))) - for i in range(item.get('rewrite_sequence', 1)): - prev = get_resource_service('archive').find_one( - req=None, _id=guid - ) + for _i in range(item.get('rewrite_sequence', 1)): + prev = get_resource_service('archive').find_one(req=None, _id=guid) + if not prev: + prev = get_resource_service('legal_archive').find_one(req=None, _id=guid) if not prev or not prev.get('rewrite_of'): break guid = prev['rewrite_of'] diff --git a/server/aap/tests/io/fixtures/dc_response.xml b/server/aap/tests/io/fixtures/dc_response.xml deleted file mode 100644 index 3edfbb709..000000000 --- a/server/aap/tests/io/fixtures/dc_response.xml +++ /dev/null @@ -1,62 +0,0 @@ - - - /archives/aapimage/search/?search[form][fulltext]={searchTerms} - AAP Image Pool - AAP Image Pool - archive - yes - - 584 - 1 - 1 - 500 - full - (ORIGINALTRANSMISSIONREFERENCE=5e17c05cb622e78f93f8eba7)&(MODDATE>20200108) - - - - - - - show_collections, sp_gateway - - - - 20200110 - 110918 - aapimage - aapfeed - 1 - 1 - 657x981 - p - 20200110 - ingest_no_news_value_priority.JPG - aapfeed - image - JPG - Testy McTest face - 5e17c05cb622e78f93f8eba7 - 20200110 - 110918 - Edited 10/01/2020 11:10:am (user1) - 5e17c05cb622e78f93f8eba7 Testy McTest face - 20200110 - 111027 - 1 - - - - - - \ No newline at end of file