Skip to content

Commit

Permalink
Merge pull request #78 from marwoodandrew/fix-html-headlines
Browse files Browse the repository at this point in the history
fix(formatters) remove markup from headlines and bylines
  • Loading branch information
marwoodandrew authored Aug 9, 2016
2 parents 2ab8813 + 0ac5b50 commit 7069fdf
Show file tree
Hide file tree
Showing 4 changed files with 20 additions and 7 deletions.
6 changes: 4 additions & 2 deletions server/aap/publish/formatters/aap_ipnews_formatter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ def testIPNewsHtmlToText(self):
'_id': '1',
'source': 'AAP',
'anpa_category': [{'qcode': 'a'}],
'headline': 'This is a test headline',
'byline': 'joe',
'headline': '<p>This is a test headline</p>',
'byline': '<div>joe</div>',
'slugline': 'slugline',
'subject': [{'qcode': '02011001'}],
'anpa_take_key': 'take_key',
Expand All @@ -122,6 +122,8 @@ def testIPNewsHtmlToText(self):
expected = ' The story body line 1\r\nLine 2\r\n abcdefghi abcdefghi abcdefghi abcdefghi ' \
'abcdefghi abcdefghi abcdefghi abcdefghi\r\n\r\nMORE'
self.assertEqual(item['article_text'], expected)
self.assertEqual(item['headline'], 'This is a test headline')
self.assertEqual(item['author'], 'joe')

def testLastTake(self):
article = {
Expand Down
7 changes: 5 additions & 2 deletions server/aap/publish/formatters/aap_odbc_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from eve.utils import config
import superdesk
from .unicodetoascii import to_ascii
from bs4 import BeautifulSoup


class AAPODBCFormatter():
Expand All @@ -32,8 +33,10 @@ def get_odbc_item(self, article, subscriber, category, codes):
pub_seq_num = superdesk.get_resource_service('subscribers').generate_sequence_number(subscriber)
odbc_item = dict(originator=article.get('source', None), sequence=pub_seq_num,
category=category.get('qcode').lower(),
headline=to_ascii(article.get('headline', '')).replace('\'', '\'\'').replace('\xA0', ' '),
author=(article.get('byline', '') or '').replace('\'', '\'\''),
headline=to_ascii(BeautifulSoup(article.get('headline', ''), 'html.parser').text).replace
('\'', '\'\'').replace('\xA0', ' '),
author=BeautifulSoup(article.get('byline', '') or '', 'html.parser').text.replace
('\'', '\'\''),
keyword=SluglineMapper().map(article=article,
category=category.get('qcode').upper(),
truncate=True).replace('\'', '\'\''),
Expand Down
5 changes: 3 additions & 2 deletions server/aap/publish/formatters/anpa_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ def format(self, article, subscriber, codes=None):
anpa.append(ednote.encode('ascii', 'replace'))

if BYLINE in formatted_article:
anpa.append(formatted_article.get(BYLINE).encode('ascii', 'ignore'))
anpa.append(BeautifulSoup(formatted_article.get(BYLINE), 'html.parser').text.encode
('ascii', 'ignore'))
anpa.append(b'\x0D\x0A')

if formatted_article.get(FORMAT) == FORMATS.PRESERVED:
Expand Down Expand Up @@ -174,7 +175,7 @@ def format_text_content(self, tag):
def _process_headline(self, anpa, article, category):
# prepend the locator to the headline if required
headline_prefix = LocatorMapper().map(article, category.decode('UTF-8').upper())
headline = to_ascii(article.get('headline', ''))
headline = to_ascii(BeautifulSoup(article.get('headline', ''), 'html.parser').text)
if headline_prefix:
headline = '{}:{}'.format(headline_prefix, headline)

Expand Down
9 changes: 8 additions & 1 deletion server/aap/publish/formatters/anpa_formatter_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def testANPAWithBylineFormatter(self):
subscriber = self.app.data.find('subscribers', None, None)[0]
subscriber['name'] = 'not notes'
byline_article = dict(self.article.copy())
byline_article['byline'] = 'Joe Blogs'
byline_article['byline'] = '<p>Joe Blogs</p>'

f = AAPAnpaFormatter()
seq, item = f.format(byline_article, subscriber)[0]
Expand Down Expand Up @@ -200,6 +200,13 @@ def test_process_headline_empty_sequence_short_headline(self):
f._process_headline(anpa, article, b'a')
self.assertEqual(anpa[0], b'12345678901234567890123456789012345678901234567890')

def test_headline_with_markup(self):
    """Check that HTML markup in the headline does not reach the ANPA output.

    Feeds a headline wrapped in a ``<p>`` element to ``_process_headline``
    and expects only the plain headline text, as bytes, to be appended to
    the output list.
    """
    formatter = AAPAnpaFormatter()
    output_lines = []
    story = {'headline': '<p>headline</p>'}
    formatter._process_headline(output_lines, story, b'a')
    # The first (and only) appended entry must be the tag-free headline.
    self.assertEqual(output_lines[0], b'headline')

def test_process_headline_empty_sequence_long_headline(self):
f = AAPAnpaFormatter()
article = {'headline': '1234567890' * 7}
Expand Down

0 comments on commit 7069fdf

Please sign in to comment.