From 3eb04bab767f101613fb0e6b305f412bbc48d32c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?= <snejus@protonmail.com>
Date: Fri, 6 Sep 2024 12:11:01 +0100
Subject: [PATCH] Use a single slug implementation

Tidy up 'Google.is_page_candidate' method and remove 'Google.sluggify'
method which was a duplicate of 'slug'.

Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
---
 beetsplug/lyrics.py         |  36 +++----------
 test/plugins/test_lyrics.py | 105 +++++++-----------------------------
 2 files changed, 27 insertions(+), 114 deletions(-)

diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index c662110e90..88e0616feb 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -22,12 +22,12 @@
 import itertools
 import os.path
 import re
-import unicodedata
 import urllib
 from contextlib import contextmanager
 from functools import cached_property
 from html import unescape
 from typing import Any, Iterator
+from urllib.parse import urlparse
 
 import requests
 from typing_extensions import TypedDict
@@ -183,7 +183,7 @@ def generate_alternatives(string, patterns):
     return itertools.product(artists, multi_titles)
 
 
-def slug(text):
+def slug(text: str) -> str:
     """Make a URL-safe, human-readable version of the given text
 
     This will do the following:
@@ -193,10 +193,6 @@ def slug(text):
     3. strip whitespace
     4. replace other non-word characters with dashes
     5. strip extra dashes
-
-    This somewhat duplicates the :func:`Google.slugify` function but
-    slugify is not as generic as this one, which can be reused
-    elsewhere.
     """
     return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
 
@@ -666,19 +662,6 @@ def is_lyrics(self, text, artist=None):
             self.debug("Bad triggers detected: {}", bad_triggers_occ)
         return len(bad_triggers_occ) < 2
 
-    def slugify(self, text):
-        """Normalize a string and remove non-alphanumeric characters."""
-        text = re.sub(r"[-'_\s]", "_", text)
-        text = re.sub(r"_+", "_", text).strip("_")
-        pat = r"([^,\(]*)\((.*?)\)"  # Remove content within parentheses
-        text = re.sub(pat, r"\g<1>", text).strip()
-        try:
-            text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
-            text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
-        except UnicodeDecodeError:
-            self.debug("Failed to normalize '{}'", text)
-        return text
-
     BY_TRANS = ["by", "par", "de", "von"]
     LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
 
@@ -686,12 +669,10 @@ def is_page_candidate(self, url_link, url_title, title, artist):
         """Return True if the URL title makes it a good candidate to be a
         page that contains lyrics of title by artist.
         """
-        title = self.slugify(title.lower())
-        artist = self.slugify(artist.lower())
-        sitename = re.search(
-            "//([^/]+)/.*", self.slugify(url_link.lower())
-        ).group(1)
-        url_title = self.slugify(url_title.lower())
+        title = slug(title)
+        artist = re.escape(slug(artist))
+        sitename = urlparse(url_link).netloc
+        url_title = slug(url_title)
 
         # Check if URL title contains song title (exact match)
         if url_title.find(title) != -1:
@@ -700,14 +681,13 @@ def is_page_candidate(self, url_link, url_title, title, artist):
         # or try extracting song title from URL title and check if
         # they are close enough
         tokens = (
-            [by + "_" + artist for by in self.BY_TRANS]
+            [by + "-" + artist for by in self.BY_TRANS]
             + [artist, sitename, sitename.replace("www.", "")]
             + self.LYRICS_TRANS
         )
-        tokens = [re.escape(t) for t in tokens]
         song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
 
-        song_title = song_title.strip("_|")
+        song_title = song_title.strip("-|")
         typo_ratio = 0.9
         ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
         return ratio >= typo_ratio
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 0fc345e352..a93f271deb 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -14,13 +14,10 @@
 
 """Tests for the 'lyrics' plugin."""
 
-import itertools
 import os
 import re
-import unittest
 from functools import partial
 from http import HTTPStatus
-from unittest.mock import patch
 
 import pytest
 
@@ -43,11 +40,7 @@
 )
 
 
-class LyricsPluginTest(unittest.TestCase):
-    def setUp(self):
-        """Set up configuration."""
-        lyrics.LyricsPlugin()
-
+class TestLyricsUtils:
     def test_search_artist(self):
         item = Item(artist="Alice ft. Bob", title="song")
         assert ("Alice ft. Bob", ["song"]) in lyrics.search_pairs(item)
@@ -159,6 +152,24 @@ def test_scrape_merge_paragraphs(self):
         text = "one</p>   <p class='myclass'>two</p><p>three"
         assert lyrics._scrape_merge_paragraphs(text) == "one\ntwo\nthree"
 
+    @pytest.mark.parametrize(
+        "text, expected",
+        [
+            ("test", "test"),
+            ("Mørdag", "mordag"),
+            ("l'été c'est fait pour jouer", "l-ete-c-est-fait-pour-jouer"),
+            ("\xe7afe au lait (boisson)", "cafe-au-lait-boisson"),
+            ("Multiple  spaces -- and symbols! -- merged", "multiple-spaces-and-symbols-merged"),  # noqa: E501
+            ("\u200bno-width-space", "no-width-space"),
+            ("El\u002dp", "el-p"),
+            ("\u200bblackbear", "blackbear"),
+            ("\u200d", ""),
+            ("\u2010", ""),
+        ],
+    )  # fmt: skip
+    def test_slug(self, text, expected):
+        assert lyrics.slug(text) == expected
+
 
 @pytest.fixture(scope="module")
 def lyrics_root_dir(pytestconfig: pytest.Config):
@@ -333,10 +344,6 @@ def test_is_page_candidate(
     def test_bad_lyrics(self, backend, lyrics):
         assert not backend.is_lyrics(lyrics)
 
-    def test_slugify(self, backend):
-        text = "http://site.com/\xe7afe-au_lait(boisson)"
-        assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-
 
 class TestGeniusLyrics(LyricsPluginBackendMixin):
     @pytest.fixture(scope="class")
@@ -358,52 +365,6 @@ def test_scrape_genius_lyrics(
 
         assert len(result.splitlines()) == expected_line_count
 
-    @patch.object(lyrics.Genius, "scrape_lyrics")
-    @patch.object(lyrics.Backend, "fetch_text", return_value=True)
-    def test_json(self, mock_fetch_text, mock_scrape, backend):
-        """Ensure we're finding artist matches"""
-        with patch.object(
-            lyrics.Genius,
-            "_search",
-            return_value={
-                "response": {
-                    "hits": [
-                        {
-                            "result": {
-                                "primary_artist": {
-                                    "name": "\u200bblackbear",
-                                },
-                                "url": "blackbear_url",
-                            }
-                        },
-                        {
-                            "result": {
-                                "primary_artist": {"name": "El\u002dp"},
-                                "url": "El-p_url",
-                            }
-                        },
-                    ]
-                }
-            },
-        ) as mock_json:
-            # genius uses zero-width-spaces (\u200B) for lowercase
-            # artists so we make sure we can match those
-            assert backend.fetch("blackbear", "Idfc") is not None
-            mock_fetch_text.assert_called_once_with("blackbear_url")
-            mock_scrape.assert_called_once_with(True)
-
-            # genius uses the hyphen minus (\u002D) as their dash
-            assert backend.fetch("El-p", "Idfc") is not None
-            mock_fetch_text.assert_called_with("El-p_url")
-            mock_scrape.assert_called_with(True)
-
-            # test no matching artist
-            assert backend.fetch("doesntexist", "none") is None
-
-            # test invalid json
-            mock_json.return_value = {"response": {"hits": []}}
-            assert backend.fetch("blackbear", "Idfc") is None
-
 
 class TestTekstowoLyrics(LyricsPluginBackendMixin):
     @pytest.fixture(scope="class")
@@ -519,31 +480,3 @@ def test_pick_lyrics_match(self, fetch_lyrics, expected_lyrics):
     )
     def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
         assert fetch_lyrics() == expected_lyrics
-
-
-class SlugTests(unittest.TestCase):
-    def test_slug(self):
-        # plain ascii passthrough
-        text = "test"
-        assert lyrics.slug(text) == "test"
-
-        # german unicode and capitals
-        text = "Mørdag"
-        assert lyrics.slug(text) == "mordag"
-
-        # more accents and quotes
-        text = "l'été c'est fait pour jouer"
-        assert lyrics.slug(text) == "l-ete-c-est-fait-pour-jouer"
-
-        # accents, parens and spaces
-        text = "\xe7afe au lait (boisson)"
-        assert lyrics.slug(text) == "cafe-au-lait-boisson"
-        text = "Multiple  spaces -- and symbols! -- merged"
-        assert lyrics.slug(text) == "multiple-spaces-and-symbols-merged"
-        text = "\u200bno-width-space"
-        assert lyrics.slug(text) == "no-width-space"
-
-        # variations of dashes should get standardized
-        dashes = ["\u200d", "\u2010"]
-        for dash1, dash2 in itertools.combinations(dashes, 2):
-            assert lyrics.slug(dash1) == lyrics.slug(dash2)