From de8de887bddebc9d063a43c156059c657a97cc83 Mon Sep 17 00:00:00 2001 From: Petr Jasek Date: Thu, 8 Feb 2024 13:21:09 +0100 Subject: [PATCH] fix lint --- server/cp/ai/semaphore.py | 474 +++++++++--------- server/cp/output/formatter/jimi_2.py | 32 +- .../cp/output/formatter/ninjs_formatter_2.py | 215 +++++--- 3 files changed, 406 insertions(+), 315 deletions(-) diff --git a/server/cp/ai/semaphore.py b/server/cp/ai/semaphore.py index b7883dfa..81005fa6 100644 --- a/server/cp/ai/semaphore.py +++ b/server/cp/ai/semaphore.py @@ -10,8 +10,6 @@ import superdesk - - logger = logging.getLogger(__name__) session = requests.Session() @@ -20,81 +18,72 @@ class Semaphore(AIServiceBase): """Semaphore autotagging service - - Environment variables SEMAPHORE_BASE_URL, SEMAPHORE_ANALYZE_URL, SEMAPHORE_SEARCH_URL, SEMAPHORE_GET_PARENT_URL , SEMAPHORE_CREATE_TAG_URL , - SEMAPHORE_CREATE_TAG_TASK , SEMAPHORE_CREATE_TAG_QUERY, SEMAPHORE_API_KEY and INDEX_FILE_PATH must be set. + + Environment variables SEMAPHORE_BASE_URL, SEMAPHORE_ANALYZE_URL, SEMAPHORE_SEARCH_URL, SEMAPHORE_GET_PARENT_URL, + SEMAPHORE_CREATE_TAG_URL, SEMAPHORE_CREATE_TAG_TASK, SEMAPHORE_CREATE_TAG_QUERY and SEMAPHORE_API_KEY must be set. """ name = "semaphore" label = "Semaphore autotagging service" - - def __init__(self,data): + def __init__(self, data): # SEMAPHORE_BASE_URL OR TOKEN_ENDPOINT Goes Here - self.base_url = os.getenv('SEMAPHORE_BASE_URL') + self.base_url = os.getenv("SEMAPHORE_BASE_URL") + + # SEMAPHORE_ANALYZE_URL Goes Here + self.analyze_url = os.getenv("SEMAPHORE_ANALYZE_URL") - # SEMAPHORE_ANALYZE_URL Goes Here - self.analyze_url = os.getenv('SEMAPHORE_ANALYZE_URL') + # SEMAPHORE_API_KEY Goes Here + self.api_key = os.getenv("SEMAPHORE_API_KEY") - # SEMAPHORE_API_KEY Goes Here - self.api_key = os.getenv('SEMAPHORE_API_KEY') + # SEMAPHORE_SEARCH_URL Goes Here + self.search_url = os.getenv("SEMAPHORE_SEARCH_URL") - # SEMAPHORE_SEARCH_URL Goes Here - self.search_url = os.getenv('SEMAPHORE_SEARCH_URL') + # SEMAPHORE_GET_PARENT_URL Goes Here + self.get_parent_url = os.getenv("SEMAPHORE_GET_PARENT_URL") - # SEMAPHORE_GET_PARENT_URL Goes Here - self.get_parent_url = os.getenv('SEMAPHORE_GET_PARENT_URL') - # SEMAPHORE_CREATE_TAG_URL Goes Here - self.create_tag_url = os.getenv('SEMAPHORE_CREATE_TAG_URL') + self.create_tag_url = os.getenv("SEMAPHORE_CREATE_TAG_URL") - # SEMAPHORE_CREATE_TAG_TASK Goes Here - self.create_tag_task = os.getenv('SEMAPHORE_CREATE_TAG_TASK') + # SEMAPHORE_CREATE_TAG_TASK Goes Here + self.create_tag_task = os.getenv("SEMAPHORE_CREATE_TAG_TASK") # SEMAPHORE_CREATE_TAG_QUERY Goes Here - self.create_tag_query = os.getenv('SEMAPHORE_CREATE_TAG_QUERY') - + self.create_tag_query = os.getenv("SEMAPHORE_CREATE_TAG_QUERY") def convert_to_desired_format(input_data): result = { "tags": { - "subject": input_data['subject'], - "organisation": input_data['organisation'], - "person": input_data['person'], - "event": input_data['event'], - "place": input_data['place'], - "object": [] # Assuming no data for 'object' + "subject": input_data["subject"], + "organisation": input_data["organisation"], + "person": input_data["person"], + "event": input_data["event"], + "place": input_data["place"], + "object": [], # Assuming no data for 'object' }, - "broader": { - "subject": input_data['broader'] - } + "broader": {"subject": input_data["broader"]}, } return result - + def get_access_token(self): """Get access token for Semaphore.""" url = self.base_url - - payload = f'grant_type=apikey&key={self.api_key}' - headers = { - 'Content-Type': 'application/x-www-form-urlencoded' - } + payload = f"grant_type=apikey&key={self.api_key}" + headers = {"Content-Type": "application/x-www-form-urlencoded"} response = session.post(url, headers=headers, data=payload, timeout=TIMEOUT) response.raise_for_status() return response.json().get("access_token") - - def fetch_parent_info(self,qcode): - + def fetch_parent_info(self, qcode): headers = {"Authorization": f"Bearer {self.get_access_token()}"} try: - frank = f"?relationshipType=has%20broader" + frank = "?relationshipType=has%20broader" query = qcode - parent_url = self.get_parent_url+query+frank + parent_url = self.get_parent_url + query + frank response = session.get(parent_url, headers=headers) response.raise_for_status() @@ -102,42 +91,43 @@ def fetch_parent_info(self,qcode): path = root.find(".//PATH[@TYPE='Narrower Term']") parent_info = [] if path is not None: - for field in path.findall('FIELD'): - if field.find('CLASS').get('NAME') == 'Topic': - score = field.get('score', '0') - parent_info.append({ - "name": field.get('NAME'), - "qcode": field.get('ID'), - "relevance": score, - "parent": None # Set to None initially - }) + for field in path.findall("FIELD"): + if field.find("CLASS").get("NAME") == "Topic": + score = field.get("score", "0") + parent_info.append( + { + "name": field.get("NAME"), + "qcode": field.get("ID"), + "relevance": score, + "parent": None, # Set to None initially + } + ) return parent_info, parent_info[::-1] # return parent_info[::-1] # Reverse to get ancestors in order except Exception as e: logger.error(f"Error fetching parent info: {str(e)}") - return [] - + return [] + # Analyze2 changed name to analyze_parent_info def analyze_parent_info(self, html_content: dict) -> dict: try: if not self.base_url or not self.api_key: - logger.warning("Semaphore Search is not configured properly, can't analyze content") + logger.warning( + "Semaphore Search is not configured properly, can't analyze content" + ) return {} - - print(html_content['searchString']) - query = html_content['searchString'] - - new_url = self.search_url+query+".json" + + print(html_content["searchString"]) + query = html_content["searchString"] + + new_url = self.search_url + query + ".json" # Make a POST request using XML payload - headers = { - "Authorization": f"bearer {self.get_access_token()}" - } + headers = {"Authorization": f"bearer {self.get_access_token()}"} - try: response = session.get(new_url, headers=headers) - print('response is') + print("response is") print(response) response.raise_for_status() @@ -146,13 +136,11 @@ def analyze_parent_info(self, html_content: dict) -> dict: logger.error(f"An error occurred while making the request: {str(e)}") root = response.text - print('Root is') + print("Root is") print(root) print(type(root)) - - # def transform_xml_response(xml_data): def transform_xml_response(api_response): result = { @@ -161,12 +149,11 @@ def transform_xml_response(api_response): "person": [], "event": [], "place": [], - "broader": [] + "broader": [], } # Process each termHint item in the API response for item in api_response["termHints"]: - scheme_url = "http://cv.cp.org/" if "Organization" in item["classes"]: @@ -186,17 +173,17 @@ def transform_xml_response(api_response): category = "subject" scheme_url = "http://cv.iptc.org/newscodes/mediatopic/" - score = item.get('score', '100') + score = item.get("score", "100") entry = { "name": item["name"], "qcode": item["id"], "source": "Semaphore", - "creator":"Human", - "relevance" : score, + "creator": "Human", + "relevance": score, "altids": {"source_name": "source_id"}, "original_source": "original_source_value", "scheme": scheme_url, - "parent": None # Initial parent assignment + "parent": None, # Initial parent assignment } # Assign to correct category based on class @@ -210,94 +197,109 @@ def transform_xml_response(api_response): result["place"].append(entry) else: # Fetch parent info for each subject item - parent_info, reversed_parent_info = self.fetch_parent_info(item["id"]) + parent_info, reversed_parent_info = self.fetch_parent_info( + item["id"] + ) # Assign the immediate parent to the subject item if parent_info: - entry["parent"] = reversed_parent_info[0]["qcode"] # Immediate parent is the first in the list + entry["parent"] = reversed_parent_info[0][ + "qcode" + ] # Immediate parent is the first in the list entry["scheme"] = "http://cv.iptc.org/newscodes/mediatopic/" result["subject"].append(entry) # Process broader items using reversed_parent_info for i in range(len(reversed_parent_info)): - broader_entry = { "name": reversed_parent_info[i]["name"], "qcode": reversed_parent_info[i]["qcode"], - "parent": reversed_parent_info[i + 1]["qcode"] if i + 1 < len(reversed_parent_info) else None, - "creator":"Human", + "parent": reversed_parent_info[i + 1]["qcode"] + if i + 1 < len(reversed_parent_info) + else None, + "creator": "Human", "source": "Semaphore", - "relevance" : "100", + "relevance": "100", "altids": {"source_name": "source_id"}, "original_source": "original_source_value", - "scheme": "http://cv.iptc.org/newscodes/mediatopic/" + "scheme": "http://cv.iptc.org/newscodes/mediatopic/", } result["broader"].append(broader_entry) return result - - - def convert_to_desired_format(input_data): result = { "tags": { - "subject": [capitalize_name_if_parent_none(tag) for tag in input_data['subject']], - "organisation": [capitalize_name_if_parent_none(tag) for tag in input_data['organisation']], - "person": [capitalize_name_if_parent_none(tag) for tag in input_data['person']], - "event": [capitalize_name_if_parent_none(tag) for tag in input_data['event']], - "place": [capitalize_name_if_parent_none(tag) for tag in input_data['place']], - "object": [] # Assuming no data for 'object' + "subject": [ + capitalize_name_if_parent_none(tag) + for tag in input_data["subject"] + ], + "organisation": [ + capitalize_name_if_parent_none(tag) + for tag in input_data["organisation"] + ], + "person": [ + capitalize_name_if_parent_none(tag) + for tag in input_data["person"] + ], + "event": [ + capitalize_name_if_parent_none(tag) + for tag in input_data["event"] + ], + "place": [ + capitalize_name_if_parent_none(tag) + for tag in input_data["place"] + ], + "object": [], # Assuming no data for 'object' }, "broader": { - "subject": [capitalize_name_if_parent_none(tag) for tag in input_data['broader']] - } + "subject": [ + capitalize_name_if_parent_none(tag) + for tag in input_data["broader"] + ] + }, } return result - - - - root = json.loads(root) - json_response = transform_xml_response(root) + json_response = transform_xml_response(root) json_response = convert_to_desired_format(json_response) - print('Json Response is ') + print("Json Response is ") print(json_response) - return json_response - + except requests.exceptions.RequestException as e: traceback.print_exc() - logger.error(f"Semaphore Search request failed. We are in analyze RequestError exception: {str(e)}") - - - def create_tag_in_semaphore(self,html_content: str) -> dict: + logger.error( + f"Semaphore Search request failed. We are in analyze RequestError exception: {str(e)}" + ) + def create_tag_in_semaphore(self, html_content: str) -> dict: try: if not self.create_tag_url or not self.api_key: - logger.warning("Semaphore Create is not configured properly, can't analyze content") + logger.warning( + "Semaphore Create is not configured properly, can't analyze content" + ) return {} - + url = self.create_tag_url task = self.create_tag_task query_string = self.create_tag_query - - new_url = url+task+query_string - + new_url = url + task + query_string # Make a POST request using XML payload headers = { "Authorization": f"bearer {self.get_access_token()}", - "Content-Type": "application/ld+json" + "Content-Type": "application/ld+json", } manual_tags = extract_manual_tags(html_content["data"]) @@ -312,89 +314,86 @@ def create_tag_in_semaphore(self,html_content: str) -> dict: id_value = "http://cv.cp.org/4916d989-2227-4f2d-8632-525cd462ab9f" elif scheme == "organization": - id_value = "http://cv.cp.org/e2c332d3-05e0-4dcc-b358-9e4855e80e88" - + id_value = "http://cv.cp.org/e2c332d3-05e0-4dcc-b358-9e4855e80e88" + elif scheme == "places": - id_value = "http://cv.cp.org/c3b17bf6-7969-424d-92ae-966f4f707a95" + id_value = "http://cv.cp.org/c3b17bf6-7969-424d-92ae-966f4f707a95" - elif scheme =="person": + elif scheme == "person": id_value = "http://cv.cp.org/1630a532-329f-43fe-9606-b381330c35cf" - + elif scheme == "event": id_value = "http://cv.cp.org/3c493189-023f-4d14-a2f4-fc7b79735ffc" - - - - payload = json.dumps({ - "@type": [ - "skos:Concept" - ], - "rdfs:label": "ConceptNameForUriGeneration", - "skos:topConceptOf": { - "@id": id_value - }, - "skosxl:prefLabel": [ - { - "@type": [ - "skosxl:Label" - ], + payload = json.dumps( + { + "@type": ["skos:Concept"], + "rdfs:label": "ConceptNameForUriGeneration", + "skos:topConceptOf": {"@id": id_value}, + "skosxl:prefLabel": [ + { + "@type": ["skosxl:Label"], "skosxl:literalForm": [ - { - "@value": concept_name, - "@language": "en" - } - ] - } - ] - }) - + {"@value": concept_name, "@language": "en"} + ], + } + ], + } + ) + try: response = session.post(new_url, headers=headers, data=payload) - if response.status_code == 409: print("Tag already exists in KMM. Response is 409 . The Tag is") print(concept_name) else: response.raise_for_status() - print('Tag Got Created is ') + print("Tag Got Created is ") print(concept_name) - except HTTPError as http_err: # Handle specific HTTP errors here logger.error(f"HTTP error occurred: {http_err}") except Exception as e: traceback.print_exc() - logger.error(f"An error occurred while making the create tag request: {str(e)}") - - - + logger.error( + f"An error occurred while making the create tag request: {str(e)}" + ) except requests.exceptions.RequestException as e: traceback.print_exc() - logger.error(f"Semaphore Create Tag Failed failed. We are in analyze RequestError exception: {str(e)}") + logger.error( + f"Semaphore Create Tag Failed failed. We are in analyze RequestError exception: {str(e)}" + ) - def data_operation(self, verb: str, operation: str, name: Optional[str], data: dict) -> dict: + def data_operation( + self, verb: str, operation: str, name: Optional[str], data: dict + ) -> dict: if operation == "feedback": return self.analyze(data["item"]) if operation == "search": return self.search(data) - + def search(self, data) -> dict: try: - print('----------------------------------------------------------------------') - print('----------------------------------------------------------------------') - print('Running for Search') - + print( + "----------------------------------------------------------------------" + ) + print( + "----------------------------------------------------------------------" + ) + print("Running for Search") + self.output = self.analyze_parent_info(data) try: updated_output = replace_qcodes(self.output) return updated_output except Exception as e: - print(f"Error occurred in replace_qcodes while Analyzing Parent Info: {e}") + print( + f"Error occurred in replace_qcodes while Analyzing Parent Info: {e}" + ) return self.output except Exception as e: print(e) @@ -403,25 +402,23 @@ def search(self, data) -> dict: def analyze(self, html_content: dict[str, str], tags=None) -> dict: try: - if not self.base_url or not self.api_key: - logger.warning("Semaphore is not configured properly, can't analyze content") + logger.warning( + "Semaphore is not configured properly, can't analyze content" + ) return {} - + # Convert HTML to XML xml_payload = self.html_to_xml(html_content) - - payload = {'XML_INPUT': xml_payload} - + payload = {"XML_INPUT": xml_payload} + # Make a POST request using XML payload - headers = { - "Authorization": f"bearer {self.get_access_token()}" - } - + headers = {"Authorization": f"bearer {self.get_access_token()}"} + try: response = session.post(self.analyze_url, headers=headers, data=payload) - print('response is') + print("response is") print(response) response.raise_for_status() @@ -430,7 +427,6 @@ def analyze(self, html_content: dict[str, str], tags=None) -> dict: logger.error(f"An error occurred while making the request: {str(e)}") root = response.text - def transform_xml_response(xml_data): # Parse the XML data @@ -442,7 +438,7 @@ def transform_xml_response(xml_data): "organisation": [], "person": [], "event": [], - "place": [] + "place": [], } # Temporary storage for path labels and GUIDs @@ -458,10 +454,9 @@ def add_to_dict(group, tag_data): for element in root.iter(): if element.tag == "META": meta_name = element.get("name") - meta_value = element.get("value") - meta_score = element.get("score","0") - - meta_id = element.get("id") + meta_value = element.get("value") or "" + meta_score = element.get("score", "0") + meta_id = element.get("id") or "" # Process 'Media Topic_PATH_LABEL' and 'Media Topic_PATH_GUID' if meta_name == "Media Topic_PATH_LABEL": @@ -491,11 +486,10 @@ def add_to_dict(group, tag_data): "qcode": meta_id if meta_id else "", "creator": "Machine", "source": "Semaphore", - "relevance" : meta_score, - "altids": {"source_name": "source_id"}, - "altids": f'{{"{meta_value}": "{meta_id}"}}', + "relevance": meta_score, + "altids": {meta_value: meta_id}, "original_source": "original_source_value", - "scheme": scheme_url + "scheme": scheme_url, } add_to_dict(group, tag_data) @@ -513,18 +507,18 @@ def add_to_dict(group, tag_data): "parent": parent_qcode, "source": "Semaphore", "creator": "Machine", - "relevance" : score, + "relevance": score, "altids": {"source_name": "source_id"}, "original_source": "original_source_value", - "scheme": "http://cv.iptc.org/newscodes/mediatopic/" + "scheme": "http://cv.iptc.org/newscodes/mediatopic/", } add_to_dict("subject", tag_data) - parent_qcode = guid # Update the parent qcode for the next iteration + parent_qcode = ( + guid # Update the parent qcode for the next iteration + ) return response_dict - - - + json_response = transform_xml_response(root) json_response = capitalize_name_if_parent_none_for_analyze(json_response) @@ -532,76 +526,70 @@ def add_to_dict(group, tag_data): try: updated_output = replace_qcodes(json_response) return updated_output - + except Exception as e: print(f"Error occurred in replace_qcodes: {e}") return json_response - except requests.exceptions.RequestException as e: traceback.print_exc() - logger.error(f"Semaphore request failed. We are in analyze RequestError exception: {str(e)}") + logger.error( + f"Semaphore request failed. We are in analyze RequestError exception: {str(e)}" + ) except Exception as e: traceback.print_exc() logger.error(f"An error occurred. We are in analyze exception: {str(e)}") - - def html_to_xml(self, html_content: str) -> str: - + def html_to_xml(self, html_content: str) -> str: def clean_html_content(input_str): # Remove full HTML tags using regular expressions - your_string = input_str.replace('

', '') - your_string = your_string.replace('

', '') - your_string = your_string.replace('
', '') - your_string = your_string.replace(' ', '') - your_string = your_string.replace('&', '') - your_string = your_string.replace('<>', '') - - - - - return your_string - - + your_string = input_str.replace("

", "") + your_string = your_string.replace("

", "") + your_string = your_string.replace("
", "") + your_string = your_string.replace(" ", "") + your_string = your_string.replace("&", "") + your_string = your_string.replace("<>", "") + + return your_string + xml_template = """ - - - <?xml version="1.0" encoding="UTF-8"?> - <story> - <headline>{}</headline> - <headline_extended>{}</headline_extended> - <body_html>{}</body_html> - <slugline>{}</slugline> - </story> - - - - """ - - - - - body_html = html_content.get('body_html', '') - headline = html_content.get('headline', '') - headline_extended = html_content.get('abstract', '') - slugline = html_content.get('slugline', '') - - # Embed the 'body_html' into the XML template - xml_output = xml_template.format(headline,headline_extended,body_html,slugline) + + + <?xml version="1.0" encoding="UTF-8"?> + <story> + <headline>{}</headline> + <headline_extended>{}</headline_extended> + <body_html>{}</body_html> + <slugline>{}</slugline> + </story> + + + + """ + + body_html = html_content.get("body_html", "") + headline = html_content.get("headline", "") + headline_extended = html_content.get("abstract", "") + slugline = html_content.get("slugline", "") + + # Embed the 'body_html' into the XML template + xml_output = xml_template.format( + headline, headline_extended, body_html, slugline + ) xml_output = clean_html_content(xml_output) - + return xml_output - + def extract_manual_tags(data): manual_tags = [] if "tags" in data: # Loop through each tag type (like 'subject', 'person', etc.) - for category, tags in data['tags'].items(): + for category, tags in data["tags"].items(): # Loop through each tag in the tag type - + for tag in tags: # Check if the source is 'manual' if tag.get("source") == "manual": @@ -609,41 +597,45 @@ def extract_manual_tags(data): return manual_tags + def capitalize_name_if_parent_none(tag): # Check if 'parent' is None and capitalize the first letter of 'name' if so - if tag.get('parent') is None: - tag['name'] = tag['name'].title() + if tag.get("parent") is None: + tag["name"] = tag["name"].title() return tag + def capitalize_name_if_parent_none_for_analyze(response): - for category in ['subject', 'organisation', 'person', 'event', 'place']: + for category in ["subject", "organisation", "person", "event", "place"]: for item in response.get(category, []): - if item.get('parent') is None: - item['name'] = item['name'].title() + if item.get("parent") is None: + item["name"] = item["name"].title() return response def replace_qcodes(output_data): - cv = superdesk.get_resource_service("vocabularies").find_one(req=None, _id="subject_custom") + cv = superdesk.get_resource_service("vocabularies").find_one( + req=None, _id="subject_custom" + ) # Create a mapping from semaphore_id to qcode - semaphore_to_qcode = {item['semaphore_id']: item['qcode'] for item in cv['items']} + semaphore_to_qcode = {item["semaphore_id"]: item["qcode"] for item in cv["items"]} # Define a function to replace qcodes in a given list def replace_in_list(data_list): for item in data_list: - if item['qcode'] in semaphore_to_qcode: - item['qcode'] = semaphore_to_qcode[item['qcode']] - if item.get('parent') and item['parent'] in semaphore_to_qcode: - item['parent'] = semaphore_to_qcode[item['parent']] + if item["qcode"] in semaphore_to_qcode: + item["qcode"] = semaphore_to_qcode[item["qcode"]] + if item.get("parent") and item["parent"] in semaphore_to_qcode: + item["parent"] = semaphore_to_qcode[item["parent"]] # Iterate over different categories and apply the replacement - for category in ['subject']: + for category in ["subject"]: if category in output_data: replace_in_list(output_data[category]) return output_data + def init_app(app): Semaphore(app) - diff --git a/server/cp/output/formatter/jimi_2.py b/server/cp/output/formatter/jimi_2.py index 5f4d0fe2..a1dc429b 100644 --- a/server/cp/output/formatter/jimi_2.py +++ b/server/cp/output/formatter/jimi_2.py @@ -106,7 +106,6 @@ def is_french(item) -> bool: class JimiFormatter(Formatter): - ENCODING = "utf-8" type = "jimi_2" @@ -181,7 +180,9 @@ def _format_item(self, root, item, pub_seq_num, service, services) -> None: if root.find("PscCodes") is None: etree.SubElement(root, "PscCodes").text = "Online" elif service: - etree.SubElement(root, "Services").text = "Écrit" if is_french(item) else "Print" + etree.SubElement(root, "Services").text = ( + "Écrit" if is_french(item) else "Print" + ) etree.SubElement(root, "PscCodes").text = service else: self._format_subject_code(root, item, "PscCodes", cp.DESTINATIONS) @@ -255,9 +256,7 @@ def _format_item(self, root, item, pub_seq_num, service, services) -> None: item.get("abstract") ) etree.SubElement(content, "ContentText").text = self._format_html(content_html) - etree.SubElement(content, "Language").text = ( - "2" if is_french(item) else "1" - ) + etree.SubElement(content, "Language").text = "2" if is_french(item) else "1" if item["type"] == "text" and content_html: content.find("DirectoryText").text = format_maxlength( @@ -273,7 +272,7 @@ def _format_item(self, root, item, pub_seq_num, service, services) -> None: etree.SubElement(content, "Stocks").text = ",".join(item["keywords"]) # IndexCodes are set here - + self._format_category_index(content, item) self._format_genre(content, item) self._format_urgency(content, item.get("urgency"), item["language"]) @@ -302,7 +301,9 @@ def _format_item(self, root, item, pub_seq_num, service, services) -> None: def get_item_id(self, item): if item.get("family_id"): - ingest_item = superdesk.get_resource_service("ingest").find_one(req=None, _id=item["family_id"]) + ingest_item = superdesk.get_resource_service("ingest").find_one( + req=None, _id=item["family_id"] + ) if ingest_item and ingest_item.get("unique_id"): return ingest_item["unique_id"] return item["unique_id"] @@ -401,11 +402,12 @@ def _format_category_index(self, content, item): indexes = uniq(categories + self._get_indexes(item)) # Add code here to remove the small case letters from here - filtered_indexes = [' '.join(word for word in index.split() if not word[0].islower()) for index in indexes] + filtered_indexes = [ + " ".join(word for word in index.split() if not word[0].islower()) + for index in indexes + ] # Remove empty strings from the filtered list indexes = [index for index in filtered_indexes if index] - - if categories: etree.SubElement(content, "Category").text = ",".join(categories) @@ -448,11 +450,11 @@ def _get_indexes(self, item): SUBJECTS_ID_3 = "http://cv.iptc.org/newscodes/mediatopic/" - subject = [ s for s in item.get("subject", []) - if s.get("name") and s.get("scheme") in (None, SUBJECTS_ID, SUBJECTS_ID_2, SUBJECTS_ID_3) + if s.get("name") + and s.get("scheme") in (None, SUBJECTS_ID, SUBJECTS_ID_2, SUBJECTS_ID_3) ] return self._resolve_names(subject, item["language"], SUBJECTS_ID) @@ -682,7 +684,11 @@ def _format_content(self, item, is_broadcast): elem.tag = "em" # Remove whitespace and empty tags - if elem.tag in INLINE_ELEMENTS and elem.text is not None and not elem.text.strip(): + if ( + elem.tag in INLINE_ELEMENTS + and elem.text is not None + and not elem.text.strip() + ): elem.drop_tree() return sd_etree.to_string(tree, encoding="unicode", method="html") diff --git a/server/cp/output/formatter/ninjs_formatter_2.py b/server/cp/output/formatter/ninjs_formatter_2.py index ba0a84b8..6d0c2609 100644 --- a/server/cp/output/formatter/ninjs_formatter_2.py +++ b/server/cp/output/formatter/ninjs_formatter_2.py @@ -39,7 +39,13 @@ from eve.utils import config from superdesk.publish.formatters import Formatter from superdesk.errors import FormatterError -from superdesk.metadata.item import ITEM_TYPE, CONTENT_TYPE, EMBARGO, GUID_FIELD, ASSOCIATIONS +from superdesk.metadata.item import ( + ITEM_TYPE, + CONTENT_TYPE, + EMBARGO, + GUID_FIELD, + ASSOCIATIONS, +) from superdesk.metadata.packages import RESIDREF, GROUP_ID, GROUPS, ROOT_GROUP, REFS from superdesk.utils import json_serialize_datetime_objectId from superdesk.media.renditions import get_renditions_spec @@ -76,14 +82,20 @@ def get_locale_name(item, language): def format_cv_item(item, language): """Format item from controlled vocabulary for output.""" if item.get("scheme") == "subject": - return filter_empty_vals( - {"code": item.get("qcode"), "name": get_locale_name(item, language), "scheme": "http://cv.iptc.org/newscodes/mediatopic/"} - ) + { + "code": item.get("qcode"), + "name": get_locale_name(item, language), + "scheme": "http://cv.iptc.org/newscodes/mediatopic/", + } + ) else: - return filter_empty_vals( - {"code": item.get("qcode"), "name": get_locale_name(item, language), "scheme": item.get("scheme")} + { + "code": item.get("qcode"), + "name": get_locale_name(item, language), + "scheme": item.get("scheme"), + } ) @@ -152,10 +164,17 @@ def __init__(self): def format(self, article, subscriber, codes=None): try: - pub_seq_num = superdesk.get_resource_service("subscribers").generate_sequence_number(subscriber) + pub_seq_num = superdesk.get_resource_service( + "subscribers" + ).generate_sequence_number(subscriber) ninjs = self._transform_to_ninjs(article, subscriber) - return [(pub_seq_num, json.dumps(ninjs, default=json_serialize_datetime_objectId))] + return [ + ( + pub_seq_num, + json.dumps(ninjs, default=json_serialize_datetime_objectId), + ) + ] except Exception as ex: raise FormatterError.ninjsFormatterError(ex, subscriber) @@ -199,7 +218,7 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): # Updated the output for associations HERE if article.get("associations"): ninjs["associations"] = self._get_associations(article, subscriber) - + if article.get("embargoed"): ninjs["embargoed"] = article["embargoed"].isoformat() @@ -213,15 +232,24 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): # Merging Various Entities into Subjects for ninjs Response # --------------------------------------------------------- - # This section of the code is responsible for aggregating different entity types - # like 'organisation', 'place', 'event', and 'person' along with 'subject' into + # This section of the code is responsible for aggregating different entity types + # like 'organisation', 'place', 'event', and 'person' along with 'subject' into # a single list. - - - if article.get("subject") or article.get("organisation") or article.get("place") or article.get("event") or article.get("person"): - combined_subjects = (self._get_subject(article) + self._get_organisation(article) + - self._get_place(article) + self._get_event(article) + - self._get_person(article)) + + if ( + article.get("subject") + or article.get("organisation") + or article.get("place") + or article.get("event") + or article.get("person") + ): + combined_subjects = ( + self._get_subject(article) + + self._get_organisation(article) + + self._get_place(article) + + self._get_event(article) + + self._get_person(article) + ) ninjs["subject"] = combined_subjects if article.get("anpa_category"): @@ -240,9 +268,9 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): ninjs["description_text"] = text_utils.get_text(article["abstract"]) elif article.get("description_text"): ninjs["description_text"] = article["description_text"] - ninjs["description_html"] = article.get("description_html") or "

{}

".format( - article["description_text"] - ) + ninjs["description_html"] = article.get( + "description_html" + ) or "

{}

".format(article["description_text"]) elif "abstract" in article: # BC ninjs["description_text"] = ninjs["description_html"] = "" @@ -251,7 +279,12 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): { "name": c.get("name", ""), "rel": "Securities Identifier", - "symbols": [{"ticker": c.get("qcode", ""), "exchange": c.get("security_exchange", "")}], + "symbols": [ + { + "ticker": c.get("qcode", ""), + "exchange": c.get("security_exchange", ""), + } + ], } for c in article["company_codes"] ] @@ -261,8 +294,14 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): if article.get("rewrite_of"): ninjs["evolvedfrom"] = article["rewrite_of"] - if not ninjs.get("copyrightholder") and not ninjs.get("copyrightnotice") and not ninjs.get("usageterms"): - ninjs.update(superdesk.get_resource_service("vocabularies").get_rightsinfo(article)) + if ( + not ninjs.get("copyrightholder") + and not ninjs.get("copyrightnotice") + and not ninjs.get("usageterms") + ): + ninjs.update( + superdesk.get_resource_service("vocabularies").get_rightsinfo(article) + ) if article.get("genre"): ninjs["genre"] = self._get_genre(article) @@ -271,22 +310,30 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): ninjs["signal"] = self._format_signal_cwarn() if article.get("signal"): - ninjs.setdefault("signal", []).extend([self._format_signal(signal) for signal in article["signal"]]) + ninjs.setdefault("signal", []).extend( + [self._format_signal(signal) for signal in article["signal"]] + ) if article.get("attachments"): ninjs["attachments"] = self._format_attachments(article) - if ninjs["type"] == CONTENT_TYPE.TEXT and ("body_html" in ninjs or "body_text" in ninjs): + if ninjs["type"] == CONTENT_TYPE.TEXT and ( + "body_html" in ninjs or "body_text" in ninjs + ): if "body_html" in ninjs: body_html = ninjs["body_html"] word_count = text_utils.get_word_count(body_html) char_count = text_utils.get_char_count(body_html) - readtime = text_utils.get_reading_time(body_html, word_count, article.get("language")) + readtime = text_utils.get_reading_time( + body_html, word_count, article.get("language") + ) else: body_text = ninjs["body_text"] word_count = text_utils.get_text_word_count(body_text) char_count = len(body_text) - readtime = text_utils.get_reading_time(body_text, word_count, article.get("language")) + readtime = text_utils.get_reading_time( + body_text, word_count, article.get("language") + ) ninjs["charcount"] = char_count ninjs["wordcount"] = word_count ninjs["readtime"] = readtime @@ -295,8 +342,10 @@ def _transform_to_ninjs(self, article, subscriber, recursive=True): ninjs["authors"] = self._format_authors(article) if (article.get("schedule_settings") or {}).get("utc_publish_schedule"): - ninjs["publish_schedule"] = article["schedule_settings"]["utc_publish_schedule"] - + ninjs["publish_schedule"] = article["schedule_settings"][ + "utc_publish_schedule" + ] + # Added Code to create Original_id attribute if article.get("family_id"): ninjs["original_id"] = article["family_id"] @@ -330,19 +379,16 @@ def _get_type(self, article): return CONTENT_TYPE.TEXT return article[ITEM_TYPE] - - # Added an updated _get_associations method - def _get_associations(self, article, subscriber): + def _get_associations(self, article, subscriber): """Create associations dict for package groups, including only the guid.""" associations = {} - for key, value in article.get('associations', {}).items(): - if '_id' in value: - associations[key] = {'guid': value['_id']} + for key, value in article.get("associations", {}).items(): + if "_id" in value: + associations[key] = {"guid": value["_id"]} return associations - def _format_related(self, article, subscriber): """Format all associated items for simple items (not packages).""" @@ -353,7 +399,10 @@ def _format_related(self, article, subscriber): archive_service = superdesk.get_resource_service("archive") article_associations = OrderedDict( - sorted(article.get(ASSOCIATIONS, {}).items(), key=lambda itm: (itm[1] or {}).get("order", 1)) + sorted( + article.get(ASSOCIATIONS, {}).items(), + key=lambda itm: (itm[1] or {}).get("order", 1), + ) ) for key, item in article_associations.items(): @@ -369,7 +418,9 @@ def _format_related(self, article, subscriber): renditions = item.get("renditions") if renditions: for rendition in renditions.keys(): - if rendition != "original" and renditions.get(rendition, {}).get("poi"): + if rendition != "original" and renditions.get( + rendition, {} + ).get("poi"): renditions[rendition].pop("poi", None) associations[key] = item # all items should stay in associations @@ -381,15 +432,22 @@ def _format_related(self, article, subscriber): try: profile = article["profile"] except KeyError: - logger.warning("missing profile in article (guid: {guid})".format(guid=article.get("guid"))) + logger.warning( + "missing profile in article (guid: {guid})".format( + guid=article.get("guid") + ) + ) content_profile = {"schema": {}} else: - content_profile = superdesk.get_resource_service("content_types").find_one( - _id=profile, req=None - ) + content_profile = superdesk.get_resource_service( + "content_types" + ).find_one(_id=profile, req=None) field_id = match.group("field_id") schema = content_profile["schema"].get(field_id, {}) - if schema.get("type") == "media" or schema.get("type") == "related_content": + if ( + schema.get("type") == "media" + or schema.get("type") == "related_content" + ): # we want custom media fields in "extra_items", cf. SDESK-2955 version = match.group("version") media.setdefault(field_id, []).append((version, item)) @@ -415,38 +473,53 @@ def _get_genre(self, article): lang = article.get("language", "") return [format_cv_item(item, lang) for item in article["genre"]] - - - def _get_subject(self, article): """Get subject list for article.""" - return [format_cv_item(item, article.get("language", "")) for item in article.get("subject", [])] + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("subject", []) + ] # Updated Code here to fetch Organisations from Article def _get_organisation(self, article): - return [format_cv_item(item, article.get("language", "")) for item in article.get("organisation", [])] + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("organisation", []) + ] # Updated Code here to fetch Places from Article def _get_place(self, article): """Get place list for article.""" - return [format_cv_item(item, article.get("language", "")) for item in article.get("place", [])] + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("place", []) + ] # Updated Code here to fetch Events from Article def _get_event(self, article): """Get event list for article.""" - return [format_cv_item(item, article.get("language", "")) for item in article.get("event", [])] + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("event", []) + ] # Updated Code here to fetch Person from Article def _get_person(self, article): """Get person list for article.""" - return [format_cv_item(item, article.get("language", "")) for item in article.get("person", [])] - + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("person", []) + ] + def _get_service(self, article): """Get service list for article. It's using `anpa_category` to populate service field for now. """ - return [format_cv_item(item, article.get("language", "")) for item in article.get("anpa_category", [])] + return [ + format_cv_item(item, article.get("language", "")) + for item in article.get("anpa_category", []) + ] def _get_renditions(self, article): """Get renditions for article.""" @@ -459,7 +532,9 @@ def _get_renditions(self, article): ) # filter renditions and keep only the ones we want to publish actual_renditions = { - name: actual_renditions[name] for name in renditions_to_publish if name in actual_renditions + name: actual_renditions[name] + for name in renditions_to_publish + if name in actual_renditions } # format renditions to Ninjs renditions = {} @@ -492,7 +567,11 @@ def _format_place(self, article): def get_label(item): if locator_map: - locators = [loc for loc in locator_map.get("items", []) if loc["qcode"] == item.get("qcode")] + locators = [ + loc + for loc in locator_map.get("items", []) + if loc["qcode"] == item.get("qcode") + ] if locators and len(locators) == 1: return ( locators[0].get("state") @@ -540,7 +619,10 @@ def _format_geonames(self, place): if app.config.get("NINJS_PLACE_EXTENDED") and place.get("location"): geo["geometry_point"] = { "type": "Point", - "coordinates": [place["location"].get("lat"), place["location"].get("lon")], + "coordinates": [ + place["location"].get("lat"), + place["location"].get("lon"), + ], } return geo @@ -548,13 +630,17 @@ def _format_profile(self, profile): return superdesk.get_resource_service("content_types").get_output_name(profile) def _format_signal_cwarn(self): - return [{"name": "Content Warning", "code": "cwarn", "scheme": SCHEME_MAP["sig"]}] + return [ + {"name": "Content Warning", "code": "cwarn", "scheme": SCHEME_MAP["sig"]} + ] def _format_attachments(self, article): output = [] attachments_service = superdesk.get_resource_service("attachments") for attachment_ref in article["attachments"]: - attachment = attachments_service.find_one(req=None, _id=attachment_ref["attachment"]) + attachment = attachments_service.find_one( + req=None, _id=attachment_ref["attachment"] + ) href = get_attachment_public_url(attachment) if href: # If we get a href, the attachment is available for subscriber consumption @@ -580,7 +666,11 @@ def _format_authors(self, article): job_titles_voc["items"] = vocabularies_service.get_locale_vocabulary( job_titles_voc.get("items"), article.get("language") ) - job_titles_map = {v["qcode"]: v["name"] for v in job_titles_voc["items"]} if job_titles_voc is not None else {} + job_titles_map = ( + {v["qcode"]: v["name"] for v in job_titles_voc["items"]} + if job_titles_voc is not None + else {} + ) authors = [] for author in article["authors"]: @@ -621,7 +711,10 @@ def _format_authors(self, article): job_title_qcode = user.get("job_title") if job_title_qcode is not None: - author["jobtitle"] = {"qcode": job_title_qcode, "name": job_titles_map.get(job_title_qcode, "")} + author["jobtitle"] = { + "qcode": job_title_qcode, + "name": job_titles_map.get(job_title_qcode, ""), + } authors.append(author) return authors @@ -670,7 +763,7 @@ def __init__(self): self.format_type = "ninjs3" def _transform_to_ninjs(self, article, subscriber, recursive=True): - print('Using NinjsFormatter 2') + print("Using NinjsFormatter 2") ninjs = super()._transform_to_ninjs(article, subscriber, recursive) ninjs["version"] = str(article.get("correction_sequence", 1)) return ninjs