Merge pull request #78 from marwoodandrew/fix-html-headlines

fix(formatters) remove markup from headlines and bylines
superdesk · Aug 9, 2016 · 7069fdf
2 parents 2ab8813 + 0ac5b50
commit 7069fdf
Show file tree

Hide file tree

Showing 4 changed files with 20 additions and 7 deletions.
diff --git a/server/aap/publish/formatters/aap_ipnews_formatter_test.py b/server/aap/publish/formatters/aap_ipnews_formatter_test.py
@@ -94,8 +94,8 @@ def testIPNewsHtmlToText(self):
             '_id': '1',
             'source': 'AAP',
             'anpa_category': [{'qcode': 'a'}],
-            'headline': 'This is a test headline',
-            'byline': 'joe',
+            'headline': '<p>This is a test headline</p>',
+            'byline': '<div>joe</div>',
             'slugline': 'slugline',
             'subject': [{'qcode': '02011001'}],
             'anpa_take_key': 'take_key',
@@ -122,6 +122,8 @@ def testIPNewsHtmlToText(self):
         expected = '   The story body line 1\r\nLine 2\r\n   abcdefghi abcdefghi abcdefghi abcdefghi ' \
                    'abcdefghi abcdefghi abcdefghi abcdefghi\r\n\r\nMORE'
         self.assertEqual(item['article_text'], expected)
+        self.assertEqual(item['headline'], 'This is a test headline')
+        self.assertEqual(item['author'], 'joe')
 
     def testLastTake(self):
         article = {

diff --git a/server/aap/publish/formatters/aap_odbc_formatter.py b/server/aap/publish/formatters/aap_odbc_formatter.py
@@ -17,6 +17,7 @@
 from eve.utils import config
 import superdesk
 from .unicodetoascii import to_ascii
+from bs4 import BeautifulSoup
 
 
 class AAPODBCFormatter():
@@ -32,8 +33,10 @@ def get_odbc_item(self, article, subscriber, category, codes):
         pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
         odbc_item = dict(originator=article.get('source', None), sequence=pub_seq_num,
                          category=category.get('qcode').lower(),
-                         headline=to_ascii(article.get('headline', '')).replace('\'', '\'\'').replace('\xA0', ' '),
-                         author=(article.get('byline', '') or '').replace('\'', '\'\''),
+                         headline=to_ascii(BeautifulSoup(article.get('headline', ''), 'html.parser').text).replace
+                         ('\'', '\'\'').replace('\xA0', ' '),
+                         author=BeautifulSoup(article.get('byline', '') or '', 'html.parser').text.replace
+                         ('\'', '\'\''),
                          keyword=SluglineMapper().map(article=article,
                                                       category=category.get('qcode').upper(),
                                                       truncate=True).replace('\'', '\'\''),

diff --git a/server/aap/publish/formatters/anpa_formatter.py b/server/aap/publish/formatters/anpa_formatter.py
@@ -108,7 +108,8 @@ def format(self, article, subscriber, codes=None):
                     anpa.append(ednote.encode('ascii', 'replace'))
 
                 if BYLINE in formatted_article:
-                    anpa.append(formatted_article.get(BYLINE).encode('ascii', 'ignore'))
+                    anpa.append(BeautifulSoup(formatted_article.get(BYLINE), 'html.parser').text.encode
+                                ('ascii', 'ignore'))
                     anpa.append(b'\x0D\x0A')
 
                 if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
@@ -174,7 +175,7 @@ def format_text_content(self, tag):
     def _process_headline(self, anpa, article, category):
         # prepend the locator to the headline if required
         headline_prefix = LocatorMapper().map(article, category.decode('UTF-8').upper())
-        headline = to_ascii(article.get('headline', ''))
+        headline = to_ascii(BeautifulSoup(article.get('headline', ''), 'html.parser').text)
         if headline_prefix:
             headline = '{}:{}'.format(headline_prefix, headline)
 

diff --git a/server/aap/publish/formatters/anpa_formatter_test.py b/server/aap/publish/formatters/anpa_formatter_test.py
@@ -139,7 +139,7 @@ def testANPAWithBylineFormatter(self):
         subscriber = self.app.data.find('subscribers', None, None)[0]
         subscriber['name'] = 'not notes'
         byline_article = dict(self.article.copy())
-        byline_article['byline'] = 'Joe Blogs'
+        byline_article['byline'] = '<p>Joe Blogs</p>'
 
         f = AAPAnpaFormatter()
         seq, item = f.format(byline_article, subscriber)[0]
@@ -200,6 +200,13 @@ def test_process_headline_empty_sequence_short_headline(self):
         f._process_headline(anpa, article, b'a')
         self.assertEqual(anpa[0], b'12345678901234567890123456789012345678901234567890')
 
+    def test_headline_with_markup(self):
+        f = AAPAnpaFormatter()
+        article = {'headline': '<p>headline</p>'}
+        anpa = []
+        f._process_headline(anpa, article, b'a')
+        self.assertEqual(anpa[0], b'headline')
+
     def test_process_headline_empty_sequence_long_headline(self):
         f = AAPAnpaFormatter()
         article = {'headline': '1234567890' * 7}