Skip to content

Commit

Permalink
Dealing with could_be_html false negatives
Browse files Browse the repository at this point in the history
Fix #183
  • Loading branch information
Yomguithereal committed Aug 17, 2023
1 parent 020a41a commit 8ab9889
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 5 deletions.
4 changes: 4 additions & 0 deletions test/could_be_html_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@
("https://www.lemonde.fr/img/figure.jpg", False),
("https://www.cosmopolitan.fr/inspirations-mode,2511387.asp1", True),
("https://www.cosmopolitan.fr/mode,2002.asp2", True),
+ (
+ "https://www.closermag.fr/people/photos.-les-vacances-des-bleus-adil-rami-assailli-par-les-fans-hugo-lloris-se-detend-en-famille-en-croisiere-851976",
+ True,
+ ),
]


Expand Down
8 changes: 4 additions & 4 deletions test/lru_trie_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,11 +92,11 @@ def test_punycode(self):
sentinel = object()

trie = LRUTrie()
- trie.set('http://françai.se', sentinel)
+ trie.set("http://françai.se", sentinel)

- assert trie.match('http://xn--franai-zua.se') is None
+ assert trie.match("http://xn--franai-zua.se") is None

trie = CanonicalizedLRUTrie()
- trie.set('http://françai.se', sentinel)
+ trie.set("http://françai.se", sentinel)

- assert trie.match('http://xn--franai-zua.se') is sentinel
+ assert trie.match("http://xn--franai-zua.se") is sentinel
2 changes: 1 addition & 1 deletion ural/could_be_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def could_be_html(url):

_, ext = splitext(path)

- if not ext:
+ if not ext or len(ext) > 16:
return True

return ext in HTML_LIKE_EXTENSIONS

0 comments on commit 8ab9889

Please sign in to comment.