Skip to content

Commit

Permalink
Use a single slug implementation
Browse files Browse the repository at this point in the history
Tidy up 'Google.is_page_candidate' method and remove 'Google.sluggify'
method which was a duplicate of 'slug'.

Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
  • Loading branch information
snejus committed Sep 19, 2024
1 parent 87f674f commit 3eb04ba
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 114 deletions.
36 changes: 8 additions & 28 deletions beetsplug/lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,12 @@
import itertools
import os.path
import re
import unicodedata
import urllib
from contextlib import contextmanager
from functools import cached_property
from html import unescape
from typing import Any, Iterator
from urllib.parse import urlparse

import requests
from typing_extensions import TypedDict
Expand Down Expand Up @@ -183,7 +183,7 @@ def generate_alternatives(string, patterns):
return itertools.product(artists, multi_titles)


def slug(text):
def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
Expand All @@ -193,10 +193,6 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
This somewhat duplicates the :func:`Google.slugify` function but
slugify is not as generic as this one, which can be reused
elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")

Expand Down Expand Up @@ -666,32 +662,17 @@ def is_lyrics(self, text, artist=None):
self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2

def slugify(self, text):
"""Normalize a string and remove non-alphanumeric characters."""
text = re.sub(r"[-'_\s]", "_", text)
text = re.sub(r"_+", "_", text).strip("_")
pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
text = re.sub(pat, r"\g<1>", text).strip()
try:
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
except UnicodeDecodeError:
self.debug("Failed to normalize '{}'", text)
return text

BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]

def is_page_candidate(self, url_link, url_title, title, artist):
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
title = self.slugify(title.lower())
artist = self.slugify(artist.lower())
sitename = re.search(
"//([^/]+)/.*", self.slugify(url_link.lower())
).group(1)
url_title = self.slugify(url_title.lower())
title = slug(title)
artist = re.escape(slug(artist))
sitename = urlparse(url_link).netloc
url_title = slug(url_title)

# Check if URL title contains song title (exact match)
if url_title.find(title) != -1:
Expand All @@ -700,14 +681,13 @@ def is_page_candidate(self, url_link, url_title, title, artist):
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
[by + "_" + artist for by in self.BY_TRANS]
[by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)

song_title = song_title.strip("_|")
song_title = song_title.strip("-|")
typo_ratio = 0.9
ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
return ratio >= typo_ratio
Expand Down
105 changes: 19 additions & 86 deletions test/plugins/test_lyrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,10 @@

"""Tests for the 'lyrics' plugin."""

import itertools
import os
import re
import unittest
from functools import partial
from http import HTTPStatus
from unittest.mock import patch

import pytest

Expand All @@ -43,11 +40,7 @@
)


class LyricsPluginTest(unittest.TestCase):
def setUp(self):
"""Set up configuration."""
lyrics.LyricsPlugin()

class TestLyricsUtils:
def test_search_artist(self):
item = Item(artist="Alice ft. Bob", title="song")
assert ("Alice ft. Bob", ["song"]) in lyrics.search_pairs(item)
Expand Down Expand Up @@ -159,6 +152,24 @@ def test_scrape_merge_paragraphs(self):
text = "one</p> <p class='myclass'>two</p><p>three"
assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"

@pytest.mark.parametrize(
"text, expected",
[
("test", "test"),
("Mørdag", "mordag"),
("l'été c'est fait pour jouer", "l-ete-c-est-fait-pour-jouer"),
("\xe7afe au lait (boisson)", "cafe-au-lait-boisson"),
("Multiple spaces -- and symbols! -- merged", "multiple-spaces-and-symbols-merged"), # noqa: E501
("\u200bno-width-space", "no-width-space"),
("El\u002dp", "el-p"),
("\u200bblackbear", "blackbear"),
("\u200d", ""),
("\u2010", ""),
],
) # fmt: skip
def test_slug(self, text, expected):
assert lyrics.slug(text) == expected


@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
Expand Down Expand Up @@ -333,10 +344,6 @@ def test_is_page_candidate(
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)

def test_slugify(self, backend):
text = "http://site.com/\xe7afe-au_lait(boisson)"
assert backend.slugify(text) == "http://site.com/cafe_au_lait"


class TestGeniusLyrics(LyricsPluginBackendMixin):
@pytest.fixture(scope="class")
Expand All @@ -358,52 +365,6 @@ def test_scrape_genius_lyrics(

assert len(result.splitlines()) == expected_line_count

@patch.object(lyrics.Genius, "scrape_lyrics")
@patch.object(lyrics.Backend, "fetch_text", return_value=True)
def test_json(self, mock_fetch_text, mock_scrape, backend):
"""Ensure we're finding artist matches"""
with patch.object(
lyrics.Genius,
"_search",
return_value={
"response": {
"hits": [
{
"result": {
"primary_artist": {
"name": "\u200bblackbear",
},
"url": "blackbear_url",
}
},
{
"result": {
"primary_artist": {"name": "El\u002dp"},
"url": "El-p_url",
}
},
]
}
},
) as mock_json:
# genius uses zero-width-spaces (\u200B) for lowercase
# artists so we make sure we can match those
assert backend.fetch("blackbear", "Idfc") is not None
mock_fetch_text.assert_called_once_with("blackbear_url")
mock_scrape.assert_called_once_with(True)

# genius uses the hyphen minus (\u002D) as their dash
assert backend.fetch("El-p", "Idfc") is not None
mock_fetch_text.assert_called_with("El-p_url")
mock_scrape.assert_called_with(True)

# test no matching artist
assert backend.fetch("doesntexist", "none") is None

# test invalid json
mock_json.return_value = {"response": {"hits": []}}
assert backend.fetch("blackbear", "Idfc") is None


class TestTekstowoLyrics(LyricsPluginBackendMixin):
@pytest.fixture(scope="class")
Expand Down Expand Up @@ -519,31 +480,3 @@ def test_pick_lyrics_match(self, fetch_lyrics, expected_lyrics):
)
def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics


class SlugTests(unittest.TestCase):
def test_slug(self):
# plain ascii passthrough
text = "test"
assert lyrics.slug(text) == "test"

# german unicode and capitals
text = "Mørdag"
assert lyrics.slug(text) == "mordag"

# more accents and quotes
text = "l'été c'est fait pour jouer"
assert lyrics.slug(text) == "l-ete-c-est-fait-pour-jouer"

# accents, parens and spaces
text = "\xe7afe au lait (boisson)"
assert lyrics.slug(text) == "cafe-au-lait-boisson"
text = "Multiple spaces -- and symbols! -- merged"
assert lyrics.slug(text) == "multiple-spaces-and-symbols-merged"
text = "\u200bno-width-space"
assert lyrics.slug(text) == "no-width-space"

# variations of dashes should get standardized
dashes = ["\u200d", "\u2010"]
for dash1, dash2 in itertools.combinations(dashes, 2):
assert lyrics.slug(dash1) == lyrics.slug(dash2)

0 comments on commit 3eb04ba

Please sign in to comment.