Skip to content

Commit

Permalink
Merge pull request #737 from LeXofLeviafan/fix-keywords
Browse files (browse the repository at this point in the history)
[#734] fixed parsing of webpage keywords
  • Loading branch information
jarun authored Apr 30, 2024
2 parents 4c2bb26 + cfb43e3 commit 83576b4
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 4 deletions.
2 changes: 1 addition & 1 deletion buku
Original file line number Diff line number Diff line change
Expand Up @@ -4061,7 +4061,7 @@ def parse_decoded_page(page):
try:
if keywords:
keys = keywords.get('content').strip().replace('\n', ' ')
- keys = re.sub(r'\s{2,}', ' ', keys)
+ keys = re.sub(r'\s{2,}', ' ', re.sub(r'\s*,\s*', ',', keys))
if is_unusual_tag(keys):
if keys not in (title, desc):
LOGDBG('keywords to description: %s', keys)
Expand Down
7 changes: 4 additions & 3 deletions tests/test_buku.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,10 +940,11 @@ def test_get_data_from_page(charset, mode):
'charset': f'\n<meta charset="{charset}"/>',
'content': f'\n<meta http-equiv="content-type" content="text/html; charset={charset}"/>',
}.get(mode, '')
- body = f'<html>\n\n<head>{meta}\n<title>{title}</title>\n</head>\n<body></body>\n\n</html>\n'
+ keywords = '<meta name="keywords" content="foo, bar baz, quux"/>'
+ body = f'<html>\n\n<head>{meta}\n{keywords}\n<title>{title}</title>\n</head>\n<body></body>\n\n</html>\n'
resp = HTTPResponse(body.encode(charset), headers)
- parsed_title, desc, keywords = get_data_from_page(resp)
- assert parsed_title == title
+ parsed_title, desc, tags = get_data_from_page(resp)
+ assert (parsed_title, tags) == (title, "foo,bar baz,quux")


@pytest.mark.parametrize('tokens, valid, expected', [
Expand Down

0 comments on commit 83576b4

Please sign in to comment.