From 70db4eea536a609a36324ecc2f64a6f28ee4f657 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 12 Oct 2024 02:20:37 +0100
Subject: [PATCH 01/23] Apply dist_thresh to Genius and Google backends
This commit introduces a distance threshold mechanism for the Genius and
Google backends.
- Create a new `SearchBackend` base class with a method `check_match`
that performs checking.
- Start using the undocumented `dist_thresh` configuration option for good,
and mention it in the docs. This controls the maximum allowable
distance for matching artist and title names.
These changes aim to improve the accuracy of lyrics matching, especially
when there are slight variations in artist or title names, see #4791.
---
beetsplug/lyrics.py | 119 +++++++++++++++++++++---------------
docs/changelog.rst | 7 +++
docs/plugins/lyrics.rst | 6 ++
test/plugins/test_lyrics.py | 40 +++++++++++-
4 files changed, 121 insertions(+), 51 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index e6ab217c5b..10105d8c7e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,10 +16,10 @@
from __future__ import annotations
-import difflib
import errno
import itertools
import json
+import math
import os.path
import re
import struct
@@ -30,7 +30,7 @@
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode
+from urllib.parse import quote, urlencode, urlparse
import requests
from typing_extensions import TypedDict
@@ -38,6 +38,7 @@
import beets
from beets import plugins, ui
+from beets.autotag.hooks import string_dist
if TYPE_CHECKING:
from beets.importer import ImportTask
@@ -485,15 +486,47 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
return lyrics
-class Genius(Backend):
+class SearchBackend(Backend):
+ REQUIRES_BS = True
+
+ @cached_property
+ def dist_thresh(self) -> float:
+ return self.config["dist_thresh"].get(float)
+
+ def check_match(
+ self, target_artist: str, target_title: str, artist: str, title: str
+ ) -> bool:
+        """Check if the given artist and title are a 'good enough' match."""
+ max_dist = max(
+ string_dist(target_artist, artist),
+ string_dist(target_title, title),
+ )
+
+ if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
+ return True
+
+ if math.isclose(max_dist, self.dist_thresh, abs_tol=0.4):
+ # log out the candidate that did not make it but was close.
+ # This may show a matching candidate with some noise in the name
+ self._log.debug(
+ "({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
+ artist,
+ title,
+ target_artist,
+ target_title,
+ max_dist,
+ )
+
+ return False
+
+
+class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
Simply adapted from
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
- REQUIRES_BS = True
-
base_url = "https://api.genius.com"
def __init__(self, config, log):
@@ -516,19 +549,15 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
self._log.debug("Genius API request returned invalid JSON")
return None
- # find a matching artist in the json
+ check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
- hit_artist = hit["result"]["primary_artist"]["name"]
-
- if slug(hit_artist) == slug(artist):
- html = self.fetch_url(hit["result"]["url"])
+ result = hit["result"]
+ if check(result["primary_artist"]["name"], result["title"]):
+ html = self.fetch_url(result["url"])
if not html:
return None
return self._scrape_lyrics_from_html(html)
- self._log.debug(
- "Genius failed to find a matching artist for '{0}'", artist
- )
return None
def _search(self, artist, title):
@@ -724,10 +753,9 @@ def is_text_notcode(text):
return None
-class Google(Backend):
+class Google(SearchBackend):
"""Fetch lyrics from Google search results."""
- REQUIRES_BS = True
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
def is_lyrics(self, text, artist=None):
@@ -775,21 +803,20 @@ def slugify(self, text):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(self, url_link, url_title, title, artist):
+ def is_page_candidate(
+ self, artist: str, title: str, url_link: str, url_title: str
+ ) -> bool:
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title = self.slugify(title.lower())
- artist = self.slugify(artist.lower())
- sitename = re.search(
- "//([^/]+)/.*", self.slugify(url_link.lower())
- ).group(1)
- url_title = self.slugify(url_title.lower())
-
- # Check if URL title contains song title (exact match)
- if url_title.find(title) != -1:
+ title_slug = self.slugify(title.lower())
+ url_title_slug = self.slugify(url_title.lower())
+ if title_slug in url_title_slug:
return True
+ artist = self.slugify(artist.lower())
+ sitename = urlparse(url_link).netloc
+
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
@@ -798,12 +825,9 @@ def is_page_candidate(self, url_link, url_title, title, artist):
+ self.LYRICS_TRANS
)
tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title)
+ song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
- song_title = song_title.strip("_|")
- typo_ratio = 0.9
- ratio = difflib.SequenceMatcher(None, song_title, title).ratio()
- return ratio >= typo_ratio
+ return self.check_match(artist, title_slug, artist, song_title)
def fetch(self, artist: str, title: str, *_) -> str | None:
params = {
@@ -825,24 +849,21 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
self._log.debug("google backend error: {0}", reason)
return None
- if "items" in data.keys():
- for item in data["items"]:
- url_link = item["link"]
- url_title = item.get("title", "")
- if not self.is_page_candidate(
- url_link, url_title, title, artist
- ):
- continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ check_candidate = partial(self.is_page_candidate, artist, title)
+ for item in data.get("items", []):
+ url_link = item["link"]
+ if not check_candidate(url_link, item.get("title", "")):
+ continue
+ html = self.fetch_url(url_link)
+ if not html:
+ continue
+ lyrics = scrape_lyrics_from_html(html)
+ if not lyrics:
+ continue
+
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
@@ -866,6 +887,7 @@ def __init__(self):
"bing_client_secret": None,
"bing_lang_from": [],
"bing_lang_to": None,
+ "dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
"genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
@@ -877,7 +899,6 @@ def __init__(self):
# Musixmatch is disabled by default as they are currently blocking
# requests with the beets user agent.
"sources": [s for s in self.SOURCES if s != "musixmatch"],
- "dist_thresh": 0.1,
}
)
self.config["bing_client_secret"].redact = True
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 48b91c44c8..737631971f 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -11,6 +11,10 @@ New features:
* :doc:`/plugins/substitute`: Allow the replacement string to use capture groups
from the match. It is thus possible to create more general rules, applying to
many different artists at once.
+* :doc:`/plugins/lyrics`: Add new configuration option ``dist_thresh`` to
+ control the maximum allowed distance between the lyrics search result and the
+ tagged item's artist and title. This is useful for preventing false positives
+ when fetching lyrics.
Bug fixes:
@@ -28,6 +32,9 @@ Bug fixes:
``lrclib`` over other sources since it returns reliable results quicker than
others.
:bug:`5102`
+* :doc:`/plugins/lyrics`: Fix the issue with ``genius`` backend not being able
+ to match lyrics when there is a slight variation in the artist name.
+ :bug:`4791`
For packagers:
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d1f434d70f..d080b1f940 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -42,6 +42,12 @@ configuration file. The available options are:
Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
Default: None.
+- **dist_thresh**: The maximum distance between the artist and title
+ combination of the music file and lyrics candidate to consider them a match.
+ Lower values will make the plugin more strict, higher values will make it
+ more lenient. This does not apply to the ``lrclib`` backend as it matches
+ durations.
+ Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
Default: None.
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 0dee427ec3..0d625f8b6d 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -161,6 +161,42 @@ def test_slug(self, text, expected):
assert lyrics.slug(text) == expected
+class TestSearchBackend:
+ @pytest.fixture
+ def backend(self, dist_thresh):
+ plugin = lyrics.LyricsPlugin()
+ plugin.config.set({"dist_thresh": dist_thresh})
+ return lyrics.SearchBackend(plugin.config, plugin._log)
+
+ @pytest.mark.parametrize(
+ "dist_thresh, target_artist, artist, should_match",
+ [
+ (0.11, "Target Artist", "Target Artist", True),
+ (0.11, "Target Artist", "Target Artis", True),
+ (0.11, "Target Artist", "Target Arti", False),
+ (0.11, "Psychonaut", "Psychonaut (BEL)", True),
+ (0.11, "beets song", "beats song", True),
+ (0.10, "beets song", "beats song", False),
+ (
+ 0.11,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ False,
+ ),
+ (
+ 0.12,
+ "Lucid Dreams (Forget Me)",
+ "Lucid Dreams (Remix) ft. Lil Uzi Vert",
+ True,
+ ),
+ ],
+ )
+ def test_check_match(self, backend, target_artist, artist, should_match):
+ assert (
+ backend.check_match(target_artist, "", artist, "") == should_match
+ )
+
+
@pytest.fixture(scope="module")
def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
@@ -275,10 +311,10 @@ def test_is_page_candidate(
self, backend, lyrics_html, url_title, artist, should_be_candidate
):
result = backend.is_page_candidate(
+ artist,
+ self.TITLE,
"http://www.example.com/lyrics/beetssong",
url_title,
- self.TITLE,
- artist,
)
assert bool(result) == should_be_candidate
From 5af8f0dd27e034d2f866fb96766fbe082ff205ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Tue, 27 Aug 2024 13:43:31 +0100
Subject: [PATCH 02/23] Make lyrics plugin documentation slightly more clear
---
beetsplug/lyrics.py | 6 +-
docs/plugins/lyrics.rst | 130 ++++++++++++++++++++--------------------
2 files changed, 70 insertions(+), 66 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 10105d8c7e..8c9c951033 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -890,8 +890,10 @@ def __init__(self):
"dist_thresh": 0.11,
"google_API_key": None,
"google_engine_ID": "009217259823014548361:lndtuqkycfu",
- "genius_api_key": "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
- "76V-uFL5jks5dNvcGCdarqFjDhP9c",
+ "genius_api_key": (
+ "Ryq93pUGm8bM6eUWwD_M3NOFFDAtp2yEE7W"
+ "76V-uFL5jks5dNvcGCdarqFjDhP9c"
+ ),
"fallback": None,
"force": False,
"local": False,
diff --git a/docs/plugins/lyrics.rst b/docs/plugins/lyrics.rst
index d080b1f940..f034cf47a1 100644
--- a/docs/plugins/lyrics.rst
+++ b/docs/plugins/lyrics.rst
@@ -2,25 +2,27 @@ Lyrics Plugin
=============
The ``lyrics`` plugin fetches and stores song lyrics from databases on the Web.
-Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_, `LRCLIB`_
-and, optionally, the Google custom search API.
+Namely, the current version of the plugin uses `Genius.com`_, `Tekstowo.pl`_,
+`LRCLIB`_ and, optionally, the Google Custom Search API.
.. _Genius.com: https://genius.com/
.. _Tekstowo.pl: https://www.tekstowo.pl/
.. _LRCLIB: https://lrclib.net/
-Fetch Lyrics During Import
---------------------------
+Install
+-------
-To automatically fetch lyrics for songs you import, first enable it in your
-configuration (see :ref:`using-plugins`). Then, install ``beets`` with
-``lyrics`` extra
+Firstly, enable the ``lyrics`` plugin in your configuration (see
+:ref:`using-plugins`). Then, install ``beets`` with ``lyrics`` extra
.. code-block:: bash
pip install "beets[lyrics]"
+Fetch Lyrics During Import
+--------------------------
+
When importing new files, beets will now fetch lyrics for files that don't
already have them. The lyrics will be stored in the beets database. If the
``import.write`` config option is on, then the lyrics will also be written to
@@ -29,52 +31,52 @@ the files' tags.
Configuration
-------------
-To configure the plugin, make a ``lyrics:`` section in your
-configuration file. The available options are:
+To configure the plugin, make a ``lyrics:`` section in your configuration file.
+Default configuration:
+
+.. code-block:: yaml
+
+ lyrics:
+ auto: yes
+ bing_client_secret: null
+ bing_lang_from: []
+ bing_lang_to: null
+ dist_thresh: 0.11
+ fallback: null
+ force: no
+ google_API_key: null
+ google_engine_ID: 009217259823014548361:lndtuqkycfu
+ sources: [lrclib, google, genius, tekstowo]
+ synced: no
+
+The available options are:
- **auto**: Fetch lyrics automatically during import.
- Default: ``yes``.
- **bing_client_secret**: Your Bing Translation application password
- (to :ref:`lyrics-translation`)
+ (see :ref:`lyrics-translation`)
- **bing_lang_from**: By default all lyrics with a language other than
``bing_lang_to`` are translated. Use a list of lang codes to restrict the set
of source languages to translate.
- Default: ``[]``
- **bing_lang_to**: Language to translate lyrics into.
- Default: None.
- **dist_thresh**: The maximum distance between the artist and title
combination of the music file and lyrics candidate to consider them a match.
Lower values will make the plugin more strict, higher values will make it
more lenient. This does not apply to the ``lrclib`` backend as it matches
durations.
- Default: ``0.11``.
- **fallback**: By default, the file will be left unchanged when no lyrics are
found. Use the empty string ``''`` to reset the lyrics in such a case.
- Default: None.
- **force**: By default, beets won't fetch lyrics if the files already have
ones. To instead always fetch lyrics, set the ``force`` option to ``yes``.
- Default: ``no``.
- **google_API_key**: Your Google API key (to enable the Google Custom Search
backend).
- Default: None.
- **google_engine_ID**: The custom search engine to use.
Default: The `beets custom search engine`_, which gathers an updated list of
sources known to be scrapeable.
- **sources**: List of sources to search for lyrics. An asterisk ``*`` expands
- to all available sources.
- Default: ``lrclib google genius tekstowo``, i.e., all the available sources. The
- ``google`` source will be automatically deactivated if no ``google_API_key``
- is setup.
- The ``google``, ``genius``, and ``tekstowo`` sources will only be enabled if
- BeautifulSoup is installed.
-- **synced**: Prefer synced lyrics over plain lyrics if a source offers them. Currently `lrclib` is the only source that provides them. Default: `no`.
-
-Here's an example of ``config.yaml``::
-
- lyrics:
- fallback: ''
- google_API_key: AZERTYUIOPQSDFGHJKLMWXCVBN1234567890_ab
- google_engine_ID: 009217259823014548361:lndtuqkycfu
+ to all available sources. The ``google`` source will be automatically
+  deactivated if no ``google_API_key`` is set up.
+- **synced**: Prefer synced lyrics over plain lyrics if a source offers them.
+ Currently ``lrclib`` is the only source that provides them.
.. _beets custom search engine: https://www.google.com:443/cse/publicurl?cx=009217259823014548361:lndtuqkycfu
@@ -89,74 +91,74 @@ by that band, and ``beet lyrics`` will get lyrics for my entire library. The
lyrics will be added to the beets database and, if ``import.write`` is on,
embedded into files' metadata.
-The ``-p`` option to the ``lyrics`` command makes it print lyrics out to the
-console so you can view the fetched (or previously-stored) lyrics.
+The ``-p, --print`` option to the ``lyrics`` command makes it print lyrics out
+to the console so you can view the fetched (or previously-stored) lyrics.
-The ``-f`` option forces the command to fetch lyrics, even for tracks that
-already have lyrics. Inversely, the ``-l`` option restricts operations
-to lyrics that are locally available, which show lyrics faster without using
-the network at all.
+The ``-f, --force`` option forces the command to fetch lyrics, even for tracks
+that already have lyrics.
+
+Inversely, the ``-l, --local`` option restricts operations to lyrics that are
+locally available, which show lyrics faster without using the network at all.
Rendering Lyrics into Other Formats
-----------------------------------
-The ``-r directory`` option renders all lyrics as `reStructuredText`_ (ReST)
-documents in ``directory`` (by default, the current directory). That
-directory, in turn, can be parsed by tools like `Sphinx`_ to generate HTML,
-ePUB, or PDF documents.
+The ``-r directory, --write-rest directory`` option renders all lyrics as
+`reStructuredText`_ (ReST) documents in ``directory`` (by default, the current
+directory). That directory, in turn, can be parsed by tools like `Sphinx`_ to
+generate HTML, ePUB, or PDF documents.
-A minimal ``conf.py`` and ``index.rst`` files are created the first time the
+Minimal ``conf.py`` and ``index.rst`` files are created the first time the
command is run. They are not overwritten on subsequent runs, so you can safely
modify these files to customize the output.
-.. _Sphinx: https://www.sphinx-doc.org/
-.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+Sphinx supports various `builders`_, see a few suggestions:
-Sphinx supports various `builders
-`_, but here are a
-few suggestions.
- * Build an HTML version::
+.. admonition:: Build an HTML version
- sphinx-build -b html . _build/html
+ ::
- * Build an ePUB3 formatted file, usable on ebook readers::
+ sphinx-build -b html . _build/html
- sphinx-build -b epub3 . _build/epub
+.. admonition:: Build an ePUB3 formatted file, usable on ebook readers
- * Build a PDF file, which incidentally also builds a LaTeX file::
+ ::
- sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+ sphinx-build -b epub3 . _build/epub
-.. _activate-google-custom-search:
+.. admonition:: Build a PDF file, which incidentally also builds a LaTeX file
+
+ ::
+
+ sphinx-build -b latex %s _build/latex && make -C _build/latex all-pdf
+
+
+.. _Sphinx: https://www.sphinx-doc.org/
+.. _reStructuredText: http://docutils.sourceforge.net/rst.html
+.. _builders: https://www.sphinx-doc.org/en/stable/builders.html
Activate Google Custom Search
------------------------------
You need to `register for a Google API key`_. Set the ``google_API_key``
configuration option to your key.
+
Then add ``google`` to the list of sources in your configuration (or use
default list, which includes it as long as you have an API key).
If you use default ``google_engine_ID``, we recommend limiting the sources to
``google`` as the other sources are already included in the Google results.
-.. _register for a Google API key: https://console.developers.google.com/
-
Optionally, you can `define a custom search engine`_. Get your search engine's
token and use it for your ``google_engine_ID`` configuration option. By
default, beets use a list of sources known to be scrapeable.
-.. _define a custom search engine: https://www.google.com/cse/all
-
Note that the Google custom search API is limited to 100 queries per day.
After that, the lyrics plugin will fall back on other declared data sources.
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
-Activate Genius and Tekstowo.pl Lyrics
---------------------------------------
+.. _register for a Google API key: https://console.developers.google.com/
+.. _define a custom search engine: https://www.google.com/cse/all
-These backends are enabled by default.
.. _lyrics-translation:
@@ -167,6 +169,6 @@ You need to register for a Microsoft Azure Marketplace free account and
to the `Microsoft Translator API`_. Follow the four steps process, specifically
at step 3 enter ``beets`` as *Client ID* and copy/paste the generated
*Client secret* into your ``bing_client_secret`` configuration, alongside
-``bing_lang_to`` target `language code`.
+``bing_lang_to`` target ``language code``.
.. _Microsoft Translator API: https://docs.microsoft.com/en-us/azure/cognitive-services/translator/translator-how-to-signup
From ad53e8dc3966231d8e6d4abdae99ebefa6f7c067 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 4 Sep 2024 04:15:46 +0100
Subject: [PATCH 03/23] Centralize requests setup with requests.Session
Improve requests performance with requests.Session which uses connection
pooling for repeated requests to the same host.
Additionally, this centralizes request configuration, making sure that
we use the same timeout and provide beets user agent for all requests.
---
beetsplug/lyrics.py | 68 +++++++++++++++++----------------------------
setup.cfg | 4 +--
2 files changed, 28 insertions(+), 44 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 8c9c951033..6db5d3c6db 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -16,6 +16,7 @@
from __future__ import annotations
+import atexit
import errno
import itertools
import json
@@ -24,13 +25,12 @@
import re
import struct
import unicodedata
-import warnings
from contextlib import suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
-from urllib.parse import quote, urlencode, urlparse
+from urllib.parse import quote, urlparse
import requests
from typing_extensions import TypedDict
@@ -106,6 +106,22 @@ class NotFoundError(requests.exceptions.HTTPError):
pass
+class TimeoutSession(requests.Session):
+ def request(self, *args, **kwargs):
+ kwargs.setdefault("timeout", 10)
+ return super().request(*args, **kwargs)
+
+
+r_session = TimeoutSession()
+r_session.headers.update({"User-Agent": USER_AGENT})
+
+
+@atexit.register
+def close_session():
+ """Close the requests session on shut down."""
+ r_session.close()
+
+
# Utilities.
@@ -246,21 +262,7 @@ def fetch_url(self, url, **kwargs):
is unreachable.
"""
try:
- # Disable the InsecureRequestWarning that comes from using
- # `verify=false`.
- # https://github.com/kennethreitz/requests/issues/2214
- # We're not overly worried about the NSA MITMing our lyrics scraper
- with warnings.catch_warnings():
- warnings.simplefilter("ignore")
- r = requests.get(
- url,
- verify=False,
- headers={
- "User-Agent": USER_AGENT,
- },
- timeout=10,
- **kwargs,
- )
+ r = r_session.get(url)
except requests.RequestException as exc:
self._log.debug("lyrics request failed: {0}", exc)
return
@@ -368,9 +370,7 @@ def warn(self, message: str, *args) -> None:
def fetch_json(self, *args, **kwargs):
"""Wrap the request method to raise an exception on HTTP errors."""
- kwargs.setdefault("timeout", 10)
- kwargs.setdefault("headers", {"User-Agent": USER_AGENT})
- r = requests.get(*args, **kwargs)
+ r = r_session.get(*args, **kwargs)
if r.status_code == HTTPStatus.NOT_FOUND:
raise NotFoundError("HTTP Error: Not Found", response=r)
r.raise_for_status()
@@ -532,10 +532,7 @@ class Genius(SearchBackend):
def __init__(self, config, log):
super().__init__(config, log)
self.api_key = config["genius_api_key"].as_str()
- self.headers = {
- "Authorization": "Bearer %s" % self.api_key,
- "User-Agent": USER_AGENT,
- }
+ self.headers = {"Authorization": f"Bearer {self.api_key}"}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -570,18 +567,13 @@ def _search(self, artist, title):
search_url = self.base_url + "/search"
data = {"q": title + " " + artist.lower()}
try:
- response = requests.get(
- search_url,
- params=data,
- headers=self.headers,
- timeout=10,
- )
+ r = r_session.get(search_url, params=data, headers=self.headers)
except requests.RequestException as exc:
self._log.debug("Genius API request failed: {0}", exc)
return None
try:
- return response.json()
+ return r.json()
except ValueError:
return None
@@ -976,13 +968,7 @@ def get_bing_access_token(self):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = json.loads(
- requests.post(
- oauth_url,
- data=urlencode(params),
- timeout=10,
- ).content
- )
+ oauth_token = r_session.post(oauth_url, params=params).json()
if "access_token" in oauth_token:
return "Bearer " + oauth_token["access_token"]
else:
@@ -1199,10 +1185,8 @@ def append_translation(self, text, to_lang):
"https://api.microsofttranslator.com/v2/Http.svc/"
"Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
)
- r = requests.get(
- url,
- headers={"Authorization ": self.bing_auth_token},
- timeout=10,
+ r = r_session.get(
+ url, headers={"Authorization": self.bing_auth_token}
)
if r.status_code != 200:
self._log.debug(
diff --git a/setup.cfg b/setup.cfg
index 15ca23f658..8e3d7e3b82 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -21,8 +21,8 @@ omit = beets/test/*
precision = 2
skip_empty = true
show_missing = true
-exclude_lines =
- pragma: no cover
+exclude_also =
+ @atexit.register
if TYPE_CHECKING
if typing.TYPE_CHECKING
raise AssertionError
From 16eada1de8934fab9bea31d5a6dad1c476301fad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:14:15 +0100
Subject: [PATCH 04/23] Centralise request error handling
---
beetsplug/lyrics.py | 228 +++++++++++++++---------------------
test/plugins/test_lyrics.py | 62 ++++++----
2 files changed, 136 insertions(+), 154 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 6db5d3c6db..5c0cbcc90e 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -19,13 +19,12 @@
import atexit
import errno
import itertools
-import json
import math
import os.path
import re
import struct
import unicodedata
-from contextlib import suppress
+from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
from http import HTTPStatus
@@ -108,8 +107,14 @@ class NotFoundError(requests.exceptions.HTTPError):
class TimeoutSession(requests.Session):
def request(self, *args, **kwargs):
+ """Wrap the request method to raise an exception on HTTP errors."""
kwargs.setdefault("timeout", 10)
- return super().request(*args, **kwargs)
+ r = super().request(*args, **kwargs)
+ if r.status_code == HTTPStatus.NOT_FOUND:
+ raise NotFoundError("HTTP Error: Not Found", response=r)
+ r.raise_for_status()
+
+ return r
r_session = TimeoutSession()
@@ -250,28 +255,36 @@ def try_parse_html(html, **kwargs):
return None
-class Backend:
+class RequestHandler:
+ _log: beets.logging.Logger
+
+ def fetch_text(self, url: str, **kwargs) -> str:
+ """Return text / HTML data from the given URL."""
+ self._log.debug("Fetching HTML from {}", url)
+ return r_session.get(url, **kwargs).text
+
+ def fetch_json(self, url: str, **kwargs):
+ """Return JSON data from the given URL."""
+ self._log.debug("Fetching JSON from {}", url)
+ return r_session.get(url, **kwargs).json()
+
+ @contextmanager
+ def handle_request(self) -> Iterator[None]:
+ try:
+ yield
+ except requests.JSONDecodeError:
+ self._log.warning("Could not decode response JSON data")
+ except requests.RequestException as exc:
+ self._log.warning("Request error: {}", exc)
+
+
+class Backend(RequestHandler):
REQUIRES_BS = False
def __init__(self, config, log):
self._log = log
self.config = config
- def fetch_url(self, url, **kwargs):
- """Retrieve the content at a given URL, or return None if the source
- is unreachable.
- """
- try:
- r = r_session.get(url)
- except requests.RequestException as exc:
- self._log.debug("lyrics request failed: {0}", exc)
- return
- if r.status_code == requests.codes.ok:
- return r.text
- else:
- self._log.debug("failed to fetch: {0} ({1})", url, r.status_code)
- return None
-
def fetch(
self, artist: str, title: str, album: str, length: int
) -> str | None:
@@ -368,15 +381,6 @@ def warn(self, message: str, *args) -> None:
"""Log a warning message with the class name."""
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
- def fetch_json(self, *args, **kwargs):
- """Wrap the request method to raise an exception on HTTP errors."""
- r = r_session.get(*args, **kwargs)
- if r.status_code == HTTPStatus.NOT_FOUND:
- raise NotFoundError("HTTP Error: Not Found", response=r)
- r.raise_for_status()
-
- return r.json()
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -414,13 +418,7 @@ def fetch(
filter(None, map(pick, (map(make, x) for x in fetch())))
).get_text(self.config["synced"])
except StopIteration:
- pass
- except requests.JSONDecodeError:
- self.warn("Could not decode response JSON data")
- except requests.RequestException as exc:
- self.warn("Request error: {}", exc)
-
- return None
+ return None
class DirectBackend(Backend):
@@ -460,9 +458,7 @@ def encode(cls, text: str) -> str:
def fetch(self, artist: str, title: str, *_) -> str | None:
url = self.build_url(artist, title)
- html = self.fetch_url(url)
- if not html:
- return None
+ html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
self._log.warning(
"we are blocked at MusixMatch: url %s failed" % url
@@ -528,6 +524,7 @@ class Genius(SearchBackend):
"""
base_url = "https://api.genius.com"
+ search_url = f"{base_url}/search"
def __init__(self, config, log):
super().__init__(config, log)
@@ -550,10 +547,9 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- html = self.fetch_url(result["url"])
- if not html:
- return None
- return self._scrape_lyrics_from_html(html)
+ return self._scrape_lyrics_from_html(
+ self.fetch_text(result["url"])
+ )
return None
@@ -564,29 +560,20 @@ def _search(self, artist, title):
:returns: json response
"""
- search_url = self.base_url + "/search"
- data = {"q": title + " " + artist.lower()}
- try:
- r = r_session.get(search_url, params=data, headers=self.headers)
- except requests.RequestException as exc:
- self._log.debug("Genius API request failed: {0}", exc)
- return None
-
- try:
- return r.json()
- except ValueError:
- return None
+ return self.fetch_json(
+ self.search_url,
+ params={"q": f"{title} {artist.lower()}"},
+ headers=self.headers,
+ )
def replace_br(self, lyrics_div):
for br in lyrics_div.find_all("br"):
br.replace_with("\n")
- def _scrape_lyrics_from_html(self, html):
+ def _scrape_lyrics_from_html(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
soup = try_parse_html(html)
- if not soup:
- return
# Remove script tags that they put in the middle of the lyrics.
[h.extract() for h in soup("script")]
@@ -657,10 +644,12 @@ def encode(cls, text: str) -> str:
return cls.non_alpha_to_underscore(unidecode(text.lower()))
def fetch(self, artist: str, title: str, *_) -> str | None:
- if html := self.fetch_url(self.build_url(artist, title)):
- return self.extract_lyrics(html)
-
- return None
+ # We are expecting to receive a 404 since we are guessing the URL.
+ # Thus suppress the error so that it does not end up in the logs.
+ with suppress(NotFoundError):
+ return self.extract_lyrics(
+ self.fetch_text(self.build_url(artist, title))
+ )
def extract_lyrics(self, html: str) -> str | None:
html = _scrape_strip_cruft(html)
@@ -714,7 +703,7 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html):
+def scrape_lyrics_from_html(html: str) -> str | None:
"""Scrape lyrics from a URL. If no lyrics can be found, return None
instead.
"""
@@ -734,8 +723,6 @@ def is_text_notcode(text):
# extract all long text blocks that are not code
soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
- if not soup:
- return None
# Get the longest text element (if any).
strings = sorted(soup.stripped_strings, key=len, reverse=True)
@@ -828,39 +815,26 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
"q": f"{artist} {title}",
}
- data = self.fetch_url(self.SEARCH_URL, params=params)
- if not data:
- self._log.debug("google backend returned no data")
- return None
- try:
- data = json.loads(data)
- except ValueError as exc:
- self._log.debug("google backend returned malformed JSON: {}", exc)
- if "error" in data:
- reason = data["error"]["errors"][0]["reason"]
- self._log.debug("google backend error: {0}", reason)
- return None
-
check_candidate = partial(self.is_page_candidate, artist, title)
- for item in data.get("items", []):
+ for item in self.fetch_json(self.SEARCH_URL, params=params).get(
+ "items", []
+ ):
url_link = item["link"]
if not check_candidate(url_link, item.get("title", "")):
continue
- html = self.fetch_url(url_link)
- if not html:
- continue
- lyrics = scrape_lyrics_from_html(html)
- if not lyrics:
- continue
+ with self.handle_request():
+ lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ if not lyrics:
+ continue
- if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
- return lyrics
+ if self.is_lyrics(lyrics, artist):
+ self._log.debug("got lyrics from {0}", item["displayLink"])
+ return lyrics
return None
-class LyricsPlugin(plugins.BeetsPlugin):
+class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
SOURCE_BACKENDS = {
"google": Google,
@@ -931,7 +905,6 @@ def __init__(self):
self.config["bing_lang_from"] = [
x.lower() for x in self.config["bing_lang_from"].as_str_seq()
]
- self.bing_auth_token = None
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
self._log.warning(
@@ -959,7 +932,8 @@ def sanitize_bs_sources(self, sources):
return enabled_sources
- def get_bing_access_token(self):
+ @cached_property
+ def bing_access_token(self) -> str | None:
params = {
"client_id": "beets",
"client_secret": self.config["bing_client_secret"],
@@ -968,14 +942,11 @@ def get_bing_access_token(self):
}
oauth_url = "https://datamarket.accesscontrol.windows.net/v2/OAuth2-13"
- oauth_token = r_session.post(oauth_url, params=params).json()
- if "access_token" in oauth_token:
- return "Bearer " + oauth_token["access_token"]
- else:
- self._log.warning(
- "Could not get Bing Translate API access token."
- ' Check your "bing_client_secret" password'
- )
+ with self.handle_request():
+ r = r_session.post(oauth_url, params=params)
+ return r.json()["access_token"]
+
+ return None
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
@@ -1164,44 +1135,39 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None:
None if no lyrics were found.
"""
for backend in self.backends:
- lyrics = backend.fetch(artist, title, *args)
- if lyrics:
- self._log.debug(
- "got lyrics from backend: {0}", backend.__class__.__name__
- )
- return _scrape_strip_cruft(lyrics, True)
+ with backend.handle_request():
+ if lyrics := backend.fetch(artist, title, *args):
+ self._log.debug(
+ "got lyrics from backend: {0}",
+ backend.__class__.__name__,
+ )
+ return _scrape_strip_cruft(lyrics, True)
return None
def append_translation(self, text, to_lang):
from xml.etree import ElementTree
- if not self.bing_auth_token:
- self.bing_auth_token = self.get_bing_access_token()
- if self.bing_auth_token:
- # Extract unique lines to limit API request size per song
- text_lines = set(text.split("\n"))
- url = (
- "https://api.microsofttranslator.com/v2/Http.svc/"
- "Translate?text=%s&to=%s" % ("|".join(text_lines), to_lang)
+ if not (token := self.bing_access_token):
+ self._log.warning(
+ "Could not get Bing Translate API access token. "
+ "Check your 'bing_client_secret' password."
)
- r = r_session.get(
- url, headers={"Authorization": self.bing_auth_token}
+ return text
+
+ # Extract unique lines to limit API request size per song
+ lines = text.split("\n")
+ unique_lines = set(lines)
+ url = "https://api.microsofttranslator.com/v2/Http.svc/Translate"
+ with self.handle_request():
+ text = self.fetch_text(
+ url,
+ headers={"Authorization": f"Bearer {token}"},
+ params={"text": "|".join(unique_lines), "to": to_lang},
)
- if r.status_code != 200:
- self._log.debug(
- "translation API error {}: {}", r.status_code, r.text
- )
- if "token has expired" in r.text:
- self.bing_auth_token = None
- return self.append_translation(text, to_lang)
- return text
- lines_translated = ElementTree.fromstring(
- r.text.encode("utf-8")
- ).text
- # Use a translation mapping dict to build resulting lyrics
- translations = dict(zip(text_lines, lines_translated.split("|")))
- result = ""
- for line in text.split("\n"):
- result += "{} / {}\n".format(line, translations[line])
- return result
+ if translated := ElementTree.fromstring(text.encode("utf-8")).text:
+ # Use a translation mapping dict to build resulting lyrics
+ translations = dict(zip(unique_lines, translated.split("|")))
+ return "".join(f"{ln} / {translations[ln]}\n" for ln in lines)
+
+ return text
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 0d625f8b6d..73b661c0b2 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -202,7 +202,7 @@ def lyrics_root_dir(pytestconfig: pytest.Config):
return pytestconfig.rootpath / "test" / "rsrc" / "lyrics"
-class LyricsBackendTest(PluginMixin):
+class LyricsPluginMixin(PluginMixin):
plugin = "lyrics"
@pytest.fixture
@@ -218,6 +218,42 @@ def lyrics_plugin(self, backend_name, plugin_config):
return lyrics.LyricsPlugin()
+
+class TestLyricsPlugin(LyricsPluginMixin):
+ @pytest.fixture
+ def backend_name(self):
+ """Return lyrics configuration to test."""
+ return "lrclib"
+
+ @pytest.mark.parametrize(
+ "request_kwargs, expected_log_match",
+ [
+ (
+ {"status_code": HTTPStatus.BAD_GATEWAY},
+ r"lyrics: Request error: 502",
+ ),
+ ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ],
+ )
+ def test_error_handling(
+ self,
+ requests_mock,
+ lyrics_plugin,
+ caplog,
+ request_kwargs,
+ expected_log_match,
+ ):
+ """Errors are logged with the plugin name."""
+ requests_mock.get(lyrics.LRCLib.GET_URL, **request_kwargs)
+
+ assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
+ assert caplog.messages
+ last_log = caplog.messages[-1]
+ assert last_log
+ assert re.search(expected_log_match, last_log, re.I)
+
+
+class LyricsBackendTest(LyricsPluginMixin):
@pytest.fixture
def backend(self, lyrics_plugin):
"""Return a lyrics backend instance."""
@@ -399,13 +435,9 @@ def backend_name(self):
return "lrclib"
@pytest.fixture
- def request_kwargs(self, response_data):
- return {"json": response_data}
-
- @pytest.fixture
- def fetch_lyrics(self, backend, requests_mock, request_kwargs):
+ def fetch_lyrics(self, backend, requests_mock, response_data):
requests_mock.get(backend.GET_URL, status_code=HTTPStatus.NOT_FOUND)
- requests_mock.get(backend.SEARCH_URL, **request_kwargs)
+ requests_mock.get(backend.SEARCH_URL, json=response_data)
return partial(backend.fetch, "la", "la", "la", self.ITEM_DURATION)
@@ -463,19 +495,3 @@ def test_synced_config_option(self, fetch_lyrics, expected_lyrics):
@pytest.mark.parametrize("plugin_config", [{"synced": True}])
def test_fetch_lyrics(self, fetch_lyrics, expected_lyrics):
assert fetch_lyrics() == expected_lyrics
-
- @pytest.mark.parametrize(
- "request_kwargs, expected_log_match",
- [
- (
- {"status_code": HTTPStatus.BAD_GATEWAY},
- r"LRCLib: Request error: 502",
- ),
- ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
- ],
- )
- def test_error(self, caplog, fetch_lyrics, expected_log_match):
- assert fetch_lyrics() is None
- assert caplog.messages
- assert (last_log := caplog.messages[-1])
- assert re.search(expected_log_match, last_log, re.I)
From 9a2602591a7512c2fb93695f509adccb9b6b30cf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 07:35:24 +0100
Subject: [PATCH 05/23] Include class name in the log messages
---
beetsplug/lyrics.py | 77 +++++++++++++++++--------------------
test/plugins/test_lyrics.py | 6 +--
2 files changed, 39 insertions(+), 44 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 5c0cbcc90e..8bc1461984 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -258,14 +258,26 @@ def try_parse_html(html, **kwargs):
class RequestHandler:
_log: beets.logging.Logger
+ def debug(self, message: str, *args) -> None:
+ """Log a debug message with the class name."""
+ self._log.debug(f"{self.__class__.__name__}: {message}", *args)
+
+ def info(self, message: str, *args) -> None:
+ """Log an info message with the class name."""
+ self._log.info(f"{self.__class__.__name__}: {message}", *args)
+
+ def warn(self, message: str, *args) -> None:
+ """Log warning with the class name."""
+ self._log.warning(f"{self.__class__.__name__}: {message}", *args)
+
def fetch_text(self, url: str, **kwargs) -> str:
"""Return text / HTML data from the given URL."""
- self._log.debug("Fetching HTML from {}", url)
+ self.debug("Fetching HTML from {}", url)
return r_session.get(url, **kwargs).text
def fetch_json(self, url: str, **kwargs):
"""Return JSON data from the given URL."""
- self._log.debug("Fetching JSON from {}", url)
+ self.debug("Fetching JSON from {}", url)
return r_session.get(url, **kwargs).json()
@contextmanager
@@ -273,9 +285,9 @@ def handle_request(self) -> Iterator[None]:
try:
yield
except requests.JSONDecodeError:
- self._log.warning("Could not decode response JSON data")
+ self.warn("Could not decode response JSON data")
except requests.RequestException as exc:
- self._log.warning("Request error: {}", exc)
+ self.warn("Request error: {}", exc)
class Backend(RequestHandler):
@@ -377,10 +389,6 @@ class LRCLib(Backend):
GET_URL = f"{BASE_URL}/get"
SEARCH_URL = f"{BASE_URL}/search"
- def warn(self, message: str, *args) -> None:
- """Log a warning message with the class name."""
- self._log.warning(f"{self.__class__.__name__}: {message}", *args)
-
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
) -> Iterator[list[LRCLibItem]]:
@@ -460,9 +468,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
html = self.fetch_text(url)
if "We detected that your IP is blocked" in html:
- self._log.warning(
- "we are blocked at MusixMatch: url %s failed" % url
- )
+ self.warn("Failed: Blocked IP address")
return None
html_parts = html.split('
str | None:
then attempt to scrape that url for the lyrics.
"""
json = self._search(artist, title)
- if not json:
- self._log.debug("Genius API request returned invalid JSON")
- return None
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
@@ -585,7 +588,7 @@ def _scrape_lyrics_from_html(self, html: str) -> str | None:
lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
if not lyrics_divs:
- self._log.debug("Received unusual song page html")
+ self.debug("Received unusual song page html")
return self._try_extracting_lyrics_from_non_data_lyrics_container(
soup
)
@@ -608,10 +611,10 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
class_=re.compile("LyricsPlaceholder__Message"),
string="This song is an instrumental",
):
- self._log.debug("Detected instrumental")
+ self.debug("Detected instrumental")
return INSTRUMENTAL_LYRICS
else:
- self._log.debug("Couldn't scrape page using known layouts")
+ self.debug("Couldn't scrape page using known layouts")
return None
lyrics_div = verse_div.parent
@@ -744,7 +747,7 @@ def is_lyrics(self, text, artist=None):
bad_triggers_occ = []
nb_lines = text.count("\n")
if nb_lines <= 1:
- self._log.debug("Ignoring too short lyrics '{0}'", text)
+ self.debug("Ignoring too short lyrics '{}'", text)
return False
elif nb_lines < 5:
bad_triggers_occ.append("too_short")
@@ -763,7 +766,7 @@ def is_lyrics(self, text, artist=None):
)
if bad_triggers_occ:
- self._log.debug("Bad triggers detected: {0}", bad_triggers_occ)
+ self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
def slugify(self, text):
@@ -776,7 +779,7 @@ def slugify(self, text):
text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
except UnicodeDecodeError:
- self._log.exception("Failing to normalize '{0}'", text)
+ self.debug("Failed to normalize '{}'", text)
return text
BY_TRANS = ["by", "par", "de", "von"]
@@ -828,7 +831,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
continue
if self.is_lyrics(lyrics, artist):
- self._log.debug("got lyrics from {0}", item["displayLink"])
+ self.debug("Got lyrics from {}", item["displayLink"])
return lyrics
return None
@@ -897,9 +900,7 @@ def __init__(self):
# configuration includes `google`. This way, the source
# is silent by default but can be enabled just by
# setting an API key.
- self._log.debug(
- "Disabling google source: " "no API key configured."
- )
+ self.debug("Disabling google source: " "no API key configured.")
sources.remove("google")
self.config["bing_lang_from"] = [
@@ -907,15 +908,14 @@ def __init__(self):
]
if not HAS_LANGDETECT and self.config["bing_client_secret"].get():
- self._log.warning(
- "To use bing translations, you need to "
- "install the langdetect module. See the "
- "documentation for further details."
+ self.warn(
+ "To use bing translations, you need to install the langdetect "
+ "module. See the documentation for further details."
)
self.backends = [
- self.SOURCE_BACKENDS[source](self.config, self._log)
- for source in sources
+ self.SOURCE_BACKENDS[s](self.config, self._log.getChild(s))
+ for s in sources
]
def sanitize_bs_sources(self, sources):
@@ -946,8 +946,6 @@ def bing_access_token(self) -> str | None:
r = r_session.post(oauth_url, params=params)
return r.json()["access_token"]
- return None
-
def commands(self):
cmd = ui.Subcommand("lyrics", help="fetch song lyrics")
cmd.parser.add_option(
@@ -1092,7 +1090,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self._log.info("lyrics already present: {0}", item)
+ self.info("Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1108,7 +1106,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self._log.info("fetched lyrics: {0}", item)
+ self.info("Lyrics found: {}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1119,7 +1117,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics, self.config["bing_lang_to"]
)
else:
- self._log.info("lyrics not found: {0}", item)
+ self.info("Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
@@ -1134,13 +1132,10 @@ def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
None if no lyrics were found.
"""
+ self.info("Fetching lyrics for {} - {}", artist, title)
for backend in self.backends:
with backend.handle_request():
if lyrics := backend.fetch(artist, title, *args):
- self._log.debug(
- "got lyrics from backend: {0}",
- backend.__class__.__name__,
- )
return _scrape_strip_cruft(lyrics, True)
return None
@@ -1149,7 +1144,7 @@ def append_translation(self, text, to_lang):
from xml.etree import ElementTree
if not (token := self.bing_access_token):
- self._log.warning(
+ self.warn(
"Could not get Bing Translate API access token. "
"Check your 'bing_client_secret' password."
)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 73b661c0b2..6a3ee310a0 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -230,9 +230,9 @@ def backend_name(self):
[
(
{"status_code": HTTPStatus.BAD_GATEWAY},
- r"lyrics: Request error: 502",
+ r"LRCLib: Request error: 502",
),
- ({"text": "invalid"}, r"lyrics: Could not decode.*JSON"),
+ ({"text": "invalid"}, r"LRCLib: Could not decode.*JSON"),
],
)
def test_error_handling(
@@ -243,7 +243,7 @@ def test_error_handling(
request_kwargs,
expected_log_match,
):
- """Errors are logged with the plugin name."""
+ """Errors are logged with the backend name."""
requests_mock.get(lyrics.LRCLib.GET_URL, **request_kwargs)
assert lyrics_plugin.get_lyrics("", "", "", 0.0) is None
From 53b5b19d75cfc297798dd8d9472b33e55e0eab72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 11 Sep 2024 07:51:43 +0100
Subject: [PATCH 06/23] Leave a single chef in the kitchen
---
beetsplug/lyrics.py | 79 ++++++++++---------------------------
test/plugins/test_lyrics.py | 6 +--
2 files changed, 23 insertions(+), 62 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 8bc1461984..4ea0e13d12 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -44,8 +44,7 @@
from beets.library import Item
try:
- import bs4
- from bs4 import SoupStrainer
+ from bs4 import BeautifulSoup
HAS_BEAUTIFUL_SOUP = True
except ImportError:
@@ -244,17 +243,6 @@ def slug(text):
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
-if HAS_BEAUTIFUL_SOUP:
-
- def try_parse_html(html, **kwargs):
- return bs4.BeautifulSoup(html, "html.parser", **kwargs)
-
-else:
-
- def try_parse_html(html, **kwargs):
- return None
-
-
class RequestHandler:
_log: beets.logging.Logger
@@ -550,9 +538,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
for hit in json["response"]["hits"]:
result = hit["result"]
if check(result["primary_artist"]["name"], result["title"]):
- return self._scrape_lyrics_from_html(
- self.fetch_text(result["url"])
- )
+ return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
return None
@@ -569,17 +555,9 @@ def _search(self, artist, title):
headers=self.headers,
)
- def replace_br(self, lyrics_div):
- for br in lyrics_div.find_all("br"):
- br.replace_with("\n")
-
- def _scrape_lyrics_from_html(self, html: str) -> str | None:
+ def scrape_lyrics(self, html: str) -> str | None:
"""Scrape lyrics from a given genius.com html"""
-
- soup = try_parse_html(html)
-
- # Remove script tags that they put in the middle of the lyrics.
- [h.extract() for h in soup("script")]
+ soup = get_soup(html)
# Most of the time, the page contains a div with class="lyrics" where
# all of the lyrics can be found already correctly formatted
@@ -594,7 +572,6 @@ def _scrape_lyrics_from_html(self, html: str) -> str | None:
)
lyrics = ""
for lyrics_div in lyrics_divs:
- self.replace_br(lyrics_div)
lyrics += lyrics_div.get_text() + "\n\n"
while lyrics[-1] == "\n":
lyrics = lyrics[:-1]
@@ -618,7 +595,6 @@ def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
return None
lyrics_div = verse_div.parent
- self.replace_br(lyrics_div)
ads = lyrics_div.find_all(
"div", class_=re.compile("InreadAd__Container")
@@ -650,15 +626,12 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.extract_lyrics(
+ return self.scrape_lyrics(
self.fetch_text(self.build_url(artist, title))
)
- def extract_lyrics(self, html: str) -> str | None:
- html = _scrape_strip_cruft(html)
- html = _scrape_merge_paragraphs(html)
-
- soup = try_parse_html(html)
+ def scrape_lyrics(self, html: str) -> str | None:
+ soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
return lyrics_div.get_text()
@@ -706,33 +679,11 @@ def _scrape_merge_paragraphs(html):
return re.sub(r"
\s*
", "\n", html)
-def scrape_lyrics_from_html(html: str) -> str | None:
- """Scrape lyrics from a URL. If no lyrics can be found, return None
- instead.
- """
-
- def is_text_notcode(text):
- if not text:
- return False
- length = len(text)
- return (
- length > 20
- and text.count(" ") > length / 25
- and (text.find("{") == -1 or text.find(";") == -1)
- )
-
+def get_soup(html: str) -> BeautifulSoup:
html = _scrape_strip_cruft(html)
html = _scrape_merge_paragraphs(html)
- # extract all long text blocks that are not code
- soup = try_parse_html(html, parse_only=SoupStrainer(string=is_text_notcode))
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- else:
- return None
+ return BeautifulSoup(html, "html.parser")
class Google(SearchBackend):
@@ -740,6 +691,16 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
+ @staticmethod
+ def scrape_lyrics(html: str) -> str | None:
+ soup = get_soup(html)
+
+ # Get the longest text element (if any).
+ strings = sorted(soup.stripped_strings, key=len, reverse=True)
+ if strings:
+ return strings[0]
+ return None
+
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -826,7 +787,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
if not check_candidate(url_link, item.get("title", "")):
continue
with self.handle_request():
- lyrics = scrape_lyrics_from_html(self.fetch_text(url_link))
+ lyrics = self.scrape_lyrics(self.fetch_text(url_link))
if not lyrics:
continue
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 6a3ee310a0..564b877ded 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -328,7 +328,7 @@ def file_name(self):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = lyrics.scrape_lyrics_from_html(lyrics_html).lower()
+ result = backend.scrape_lyrics(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
@@ -390,7 +390,7 @@ def backend_name(self):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend._scrape_lyrics_from_html(lyrics_html) or ""
+ result = backend.scrape_lyrics(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -411,7 +411,7 @@ def backend_name(self):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.extract_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From aa3a1c5e99e9900f6e87b5b5a939275c37cf6674 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sat, 19 Oct 2024 03:30:41 +0100
Subject: [PATCH 07/23] Do not try to strip cruft from the parsed lyrics text.
Having removed it I found that only the Genius lyrics changed: it had an
extra new line. Thus I defined a function 'collapse_newlines' which now
gets called for the Genius lyrics.
---
beetsplug/lyrics.py | 24 +++++++++++-------------
test/plugins/test_lyrics.py | 5 ++---
2 files changed, 13 insertions(+), 16 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 4ea0e13d12..218cf7b020 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,9 +57,6 @@
except ImportError:
HAS_LANGDETECT = False
-DIV_RE = re.compile(r"<(/?)div>?", re.I)
-COMMENT_RE = re.compile(r"", re.S)
-TAG_RE = re.compile(r"<[^>]*>")
BREAK_RE = re.compile(r"\n?\s* ]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -537,8 +534,11 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
check = partial(self.check_match, artist, title)
for hit in json["response"]["hits"]:
result = hit["result"]
- if check(result["primary_artist"]["name"], result["title"]):
- return self.scrape_lyrics(self.fetch_text(hit["result"]["url"]))
+ url = hit["result"]["url"]
+ if check(result["primary_artist"]["name"], result["title"]) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
return None
@@ -653,7 +653,10 @@ def remove_credits(text):
return text
-def _scrape_strip_cruft(html, plain_text_out=False):
+collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
+
+
+def _scrape_strip_cruft(html: str) -> str:
"""Clean up HTML"""
html = unescape(html)
@@ -665,13 +668,8 @@ def _scrape_strip_cruft(html, plain_text_out=False):
html = re.sub("
two
three"
From d3aeed2afe4c0bd76483ff94a13be07556a20c2e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 6 Sep 2024 12:11:01 +0100
Subject: [PATCH 08/23] Use a single slug implementation
Tidy up 'Google.is_page_candidate' method and remove 'Google.slugify'
method which was a duplicate of 'slug'.
Since 'GeniusFetchTest' only tested whether the artist name is cleaned
up (the rest of the functionality is patched), remove it and move its
test cases to the 'test_slug' test.
---
beetsplug/lyrics.py | 33 ++++++++-------------------------
test/plugins/test_lyrics.py | 4 ----
2 files changed, 8 insertions(+), 29 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 218cf7b020..d8c92e6940 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -23,7 +23,6 @@
import os.path
import re
import struct
-import unicodedata
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
@@ -222,7 +221,7 @@ def generate_alternatives(string, patterns):
return itertools.product(artists, multi_titles)
-def slug(text):
+def slug(text: str) -> str:
"""Make a URL-safe, human-readable version of the given text
This will do the following:
@@ -232,10 +231,6 @@ def slug(text):
3. strip whitespace
4. replace other non-word characters with dashes
5. strip extra dashes
-
- This somewhat duplicates the :func:`Google.slugify` function but
- slugify is not as generic as this one, which can be reused
- elsewhere.
"""
return re.sub(r"\W+", "-", unidecode(text).lower().strip()).strip("-")
@@ -728,19 +723,6 @@ def is_lyrics(self, text, artist=None):
self.debug("Bad triggers detected: {}", bad_triggers_occ)
return len(bad_triggers_occ) < 2
- def slugify(self, text):
- """Normalize a string and remove non-alphanumeric characters."""
- text = re.sub(r"[-'_\s]", "_", text)
- text = re.sub(r"_+", "_", text).strip("_")
- pat = r"([^,\(]*)\((.*?)\)" # Remove content within parentheses
- text = re.sub(pat, r"\g<1>", text).strip()
- try:
- text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore")
- text = str(re.sub(r"[-\s]+", " ", text.decode("utf-8")))
- except UnicodeDecodeError:
- self.debug("Failed to normalize '{}'", text)
- return text
-
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
@@ -750,23 +732,24 @@ def is_page_candidate(
"""Return True if the URL title makes it a good candidate to be a
page that contains lyrics of title by artist.
"""
- title_slug = self.slugify(title.lower())
- url_title_slug = self.slugify(url_title.lower())
+ title_slug = slug(title)
+ url_title_slug = slug(url_title)
if title_slug in url_title_slug:
return True
- artist = self.slugify(artist.lower())
+ artist = slug(artist)
sitename = urlparse(url_link).netloc
# or try extracting song title from URL title and check if
# they are close enough
tokens = (
- [by + "_" + artist for by in self.BY_TRANS]
+ [by + "-" + artist for by in self.BY_TRANS]
+ [artist, sitename, sitename.replace("www.", "")]
+ self.LYRICS_TRANS
)
- tokens = [re.escape(t) for t in tokens]
- song_title = re.sub("(%s)" % "|".join(tokens), "", url_title_slug)
+ song_title = re.sub(
+ "(%s)" % "|".join(tokens), "", url_title_slug
+ ).strip("-")
return self.check_match(artist, title_slug, artist, song_title)
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 8ee895d4d7..4d3be149fb 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -370,10 +370,6 @@ def test_is_page_candidate(
def test_bad_lyrics(self, backend, lyrics):
assert not backend.is_lyrics(lyrics)
- def test_slugify(self, backend):
- text = "http://site.com/\xe7afe-au_lait(boisson)"
- assert backend.slugify(text) == "http://site.com/cafe_au_lait"
-
class TestGeniusLyrics(LyricsBackendTest):
@pytest.fixture(scope="class")
From f7df3fbea3d6cba30d06c22ecc42ab6ad480435e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Thu, 19 Sep 2024 20:00:44 +0100
Subject: [PATCH 09/23] lyrics: Add symbols for better visual feedback in the
logs
---
beetsplug/lyrics.py | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index d8c92e6940..057cc4b581 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1032,7 +1032,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
"""
# Skip if the item already has lyrics.
if not force and item.lyrics:
- self.info("Lyrics already present: {}", item)
+ self.info("🔵 Lyrics already present: {}", item)
return
lyrics_matches = []
@@ -1048,7 +1048,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics = "\n\n---\n\n".join(filter(None, lyrics_matches))
if lyrics:
- self.info("Lyrics found: {}", item)
+ self.info("🟢 Found lyrics: {0}", item)
if HAS_LANGDETECT and self.config["bing_client_secret"].get():
lang_from = langdetect.detect(lyrics)
if self.config["bing_lang_to"].get() != lang_from and (
@@ -1059,7 +1059,7 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
lyrics, self.config["bing_lang_to"]
)
else:
- self.info("Lyrics not found: {}", item)
+ self.info("🔴 Lyrics not found: {}", item)
fallback = self.config["fallback"].get()
if fallback:
lyrics = fallback
From 1b9aa3baf8c675fcdf25ced82189b029f7c28fda Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Fri, 27 Sep 2024 22:57:20 +0100
Subject: [PATCH 10/23] lyrics: Do not write item unless lyrics have changed
---
beetsplug/lyrics.py | 16 +++++++---------
1 file changed, 7 insertions(+), 9 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 057cc4b581..7c8a575021 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -1060,15 +1060,13 @@ def fetch_item_lyrics(self, item: Item, write: bool, force: bool) -> None:
)
else:
self.info("🔴 Lyrics not found: {}", item)
- fallback = self.config["fallback"].get()
- if fallback:
- lyrics = fallback
- else:
- return
- item.lyrics = lyrics
- if write:
- item.try_write()
- item.store()
+ lyrics = self.config["fallback"].get()
+
+ if lyrics not in {None, item.lyrics}:
+ item.lyrics = lyrics
+ if write:
+ item.try_write()
+ item.store()
def get_lyrics(self, artist: str, title: str, *args) -> str | None:
"""Fetch lyrics, trying each source in turn. Return a string or
From 4a33cc3ab6b10ca701acb646de6321978d49c546 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 10:33:01 +0100
Subject: [PATCH 11/23] Replace custom unescape implementation with html.unescape
---
beetsplug/lyrics.py | 23 +----------------------
1 file changed, 1 insertion(+), 22 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 7c8a575021..3908edcc64 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -22,10 +22,10 @@
import math
import os.path
import re
-import struct
from contextlib import contextmanager, suppress
from dataclasses import dataclass
from functools import cached_property, partial, total_ordering
+from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
from urllib.parse import quote, urlparse
@@ -125,27 +125,6 @@ def close_session():
# Utilities.
-def unichar(i):
- try:
- return chr(i)
- except ValueError:
- return struct.pack("i", i).decode("utf-32")
-
-
-def unescape(text):
-    """Resolve &#xx; HTML entities (and some others)."""
- if isinstance(text, bytes):
- text = text.decode("utf-8", "ignore")
-    out = text.replace("&nbsp;", " ")
-
- def replchar(m):
- num = m.group(1)
- return unichar(int(num))
-
-    out = re.sub("&#(\\d+);", replchar, out)
- return out
-
-
def extract_text_between(html, start_marker, end_marker):
try:
_, html = html.split(start_marker, 1)
From cf2aeded32547a2e4ed8c548b08a001e6858c8bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Mon, 7 Oct 2024 18:24:22 +0100
Subject: [PATCH 12/23] Remove extract_text_between
---
beetsplug/lyrics.py | 11 +----------
1 file changed, 1 insertion(+), 10 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 3908edcc64..522df516b6 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -125,15 +125,6 @@ def close_session():
# Utilities.
-def extract_text_between(html, start_marker, end_marker):
- try:
- _, html = html.split(start_marker, 1)
- html, _ = html.split(end_marker, 1)
- except ValueError:
- return ""
- return html
-
-
def search_pairs(item):
"""Yield a pairs of artists and titles to search for.
@@ -433,7 +424,7 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# Sometimes lyrics come in 2 or more parts
lyrics_parts = []
for html_part in html_parts:
-            lyrics_parts.append(extract_text_between(html_part, ">", "</p>"))
+            lyrics_parts.append(re.sub(r"^[^>]+>|</p>.*", "", html_part))
lyrics = "\n".join(lyrics_parts)
lyrics = lyrics.strip(',"').replace("\\n", "\n")
# another odd case: sometimes only that string remains, for
From 9ce662bbf1d12f8c9c070b4a09d7910728823ad4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Wed, 9 Oct 2024 12:12:09 +0100
Subject: [PATCH 13/23] Genius: refactor and simplify
---
beetsplug/_typing.py | 86 +++++++++++++++++++++++++
beetsplug/lyrics.py | 123 ++++++++++--------------------------
test/plugins/test_lyrics.py | 2 +-
3 files changed, 121 insertions(+), 90 deletions(-)
create mode 100644 beetsplug/_typing.py
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
new file mode 100644
index 0000000000..f673b85bd5
--- /dev/null
+++ b/beetsplug/_typing.py
@@ -0,0 +1,86 @@
+from __future__ import annotations
+
+from typing import Any
+
+from typing_extensions import TypeAlias, TypedDict
+
+JSONDict: TypeAlias = "dict[str, Any]"
+
+
+class LRCLibAPI:
+ class Item(TypedDict):
+ """Lyrics data item returned by the LRCLib API."""
+
+ id: int
+ name: str
+ trackName: str
+ artistName: str
+ albumName: str
+ duration: float | None
+ instrumental: bool
+ plainLyrics: str
+ syncedLyrics: str | None
+
+
+class GeniusAPI:
+ """Genius API data types.
+
+ This documents *only* the fields that are used in the plugin.
+ :attr:`SearchResult` is an exception, since I thought some of the other
+ fields might be useful in the future.
+ """
+
+ class DateComponents(TypedDict):
+ year: int
+ month: int
+ day: int
+
+ class Artist(TypedDict):
+ api_path: str
+ header_image_url: str
+ id: int
+ image_url: str
+ is_meme_verified: bool
+ is_verified: bool
+ name: str
+ url: str
+
+ class Stats(TypedDict):
+ unreviewed_annotations: int
+ hot: bool
+
+ class SearchResult(TypedDict):
+ annotation_count: int
+ api_path: str
+ artist_names: str
+ full_title: str
+ header_image_thumbnail_url: str
+ header_image_url: str
+ id: int
+ lyrics_owner_id: int
+ lyrics_state: str
+ path: str
+ primary_artist_names: str
+ pyongs_count: int | None
+ relationships_index_url: str
+ release_date_components: GeniusAPI.DateComponents
+ release_date_for_display: str
+ release_date_with_abbreviated_month_for_display: str
+ song_art_image_thumbnail_url: str
+ song_art_image_url: str
+ stats: GeniusAPI.Stats
+ title: str
+ title_with_featured: str
+ url: str
+ featured_artists: list[GeniusAPI.Artist]
+ primary_artist: GeniusAPI.Artist
+ primary_artists: list[GeniusAPI.Artist]
+
+ class SearchHit(TypedDict):
+ result: GeniusAPI.SearchResult
+
+ class SearchResponse(TypedDict):
+ hits: list[GeniusAPI.SearchHit]
+
+ class Search(TypedDict):
+ response: GeniusAPI.SearchResponse
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 522df516b6..2431a89969 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -31,7 +31,6 @@
from urllib.parse import quote, urlparse
import requests
-from typing_extensions import TypedDict
from unidecode import unidecode
import beets
@@ -42,6 +41,8 @@
from beets.importer import ImportTask
from beets.library import Item
+ from ._typing import GeniusAPI, LRCLibAPI
+
try:
from bs4 import BeautifulSoup
@@ -253,20 +254,6 @@ def fetch(
raise NotImplementedError
-class LRCLibItem(TypedDict):
- """Lyrics data item returned by the LRCLib API."""
-
- id: int
- name: str
- trackName: str
- artistName: str
- albumName: str
- duration: float | None
- instrumental: bool
- plainLyrics: str
- syncedLyrics: str | None
-
-
@dataclass
@total_ordering
class LRCLyrics:
@@ -284,7 +271,9 @@ def __le__(self, other: LRCLyrics) -> bool:
return self.dist < other.dist
@classmethod
- def make(cls, candidate: LRCLibItem, target_duration: float) -> LRCLyrics:
+ def make(
+ cls, candidate: LRCLibAPI.Item, target_duration: float
+ ) -> LRCLyrics:
return cls(
target_duration,
candidate["duration"] or 0.0,
@@ -341,7 +330,7 @@ class LRCLib(Backend):
def fetch_candidates(
self, artist: str, title: str, album: str, length: int
- ) -> Iterator[list[LRCLibItem]]:
+ ) -> Iterator[list[LRCLibAPI.Item]]:
"""Yield lyrics candidates for the given song data.
Firstly, attempt to GET lyrics directly, and then search the API if
@@ -479,13 +468,15 @@ class Genius(SearchBackend):
bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
"""
+    LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
+    remove_backslash = partial(re.sub, r"\\(?=[^\\])", "")
+
+    @cached_property
+    def headers(self) -> dict[str, str]:
+ return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics from genius.com
@@ -494,85 +485,39 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
we first query the api for a url matching our artist & title,
then attempt to scrape that url for the lyrics.
"""
- json = self._search(artist, title)
- check = partial(self.check_match, artist, title)
- for hit in json["response"]["hits"]:
- result = hit["result"]
- url = hit["result"]["url"]
- if check(result["primary_artist"]["name"], result["title"]) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
-
- return None
-
- def _search(self, artist, title):
- """Searches the genius api for a given artist and title
-
- https://docs.genius.com/#search-h2
-
- :returns: json response
- """
- return self.fetch_json(
+ data = self.fetch_json(
self.search_url,
- params={"q": f"{title} {artist.lower()}"},
+ params={"q": f"{artist} {title}".lower()},
headers=self.headers,
)
+ if (url := self.find_lyrics_url(data, artist, title)) and (
+ lyrics := self.scrape_lyrics(self.fetch_text(url))
+ ):
+ return collapse_newlines(lyrics)
- def scrape_lyrics(self, html: str) -> str | None:
- """Scrape lyrics from a given genius.com html"""
- soup = get_soup(html)
-
- # Most of the time, the page contains a div with class="lyrics" where
- # all of the lyrics can be found already correctly formatted
- # Sometimes, though, it packages the lyrics into separate divs, most
- # likely for easier ad placement
+ return None
- lyrics_divs = soup.find_all("div", {"data-lyrics-container": True})
- if not lyrics_divs:
- self.debug("Received unusual song page html")
- return self._try_extracting_lyrics_from_non_data_lyrics_container(
- soup
- )
- lyrics = ""
- for lyrics_div in lyrics_divs:
- lyrics += lyrics_div.get_text() + "\n\n"
- while lyrics[-1] == "\n":
- lyrics = lyrics[:-1]
- return lyrics
+ def find_lyrics_url(
+ self, data: GeniusAPI.Search, artist: str, title: str
+ ) -> str | None:
+ """Find URL to the lyrics of the given artist and title.
- def _try_extracting_lyrics_from_non_data_lyrics_container(self, soup):
- """Extract lyrics from a div without attribute data-lyrics-container
- This is the second most common layout on genius.com
+ https://docs.genius.com/#search-h2.
"""
- verse_div = soup.find("div", class_=re.compile("Lyrics__Container"))
- if not verse_div:
- if soup.find(
- "div",
- class_=re.compile("LyricsPlaceholder__Message"),
- string="This song is an instrumental",
- ):
- self.debug("Detected instrumental")
- return INSTRUMENTAL_LYRICS
- else:
- self.debug("Couldn't scrape page using known layouts")
- return None
+ check = partial(self.check_match, artist, title)
+ for result in (hit["result"] for hit in data["response"]["hits"]):
+ if check(result["artist_names"], result["title"]):
+ return result["url"]
- lyrics_div = verse_div.parent
+ return None
- ads = lyrics_div.find_all(
- "div", class_=re.compile("InreadAd__Container")
- )
- for ad in ads:
- ad.replace_with("\n")
+ def scrape_lyrics(self, html: str) -> str | None:
+ if m := self.LYRICS_IN_JSON_RE.search(html):
+ html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ return get_soup(html_text).get_text().strip()
- footers = lyrics_div.find_all(
- "div", class_=re.compile("Lyrics__Footer")
- )
- for footer in footers:
- footer.replace_with("")
- return lyrics_div.get_text()
+ return None
class Tekstowo(DirectBackend):
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 4d3be149fb..1c961230e4 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -379,7 +379,7 @@ def backend_name(self):
@pytest.mark.parametrize(
"file_name, expected_line_count",
[
- ("geniuscom/2pacalleyezonmelyrics", 134),
+ ("geniuscom/2pacalleyezonmelyrics", 131),
("geniuscom/Ttngchinchillalyrics", 29),
("geniuscom/sample", 0), # see https://github.com/beetbox/beets/issues/3535
],
From 61e414245d385763a6622d86e499bbc9c8a9334f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 17:04:58 +0100
Subject: [PATCH 14/23] Unite Genius, Tekstowo and Google backends under the
same interface
---
beetsplug/lyrics.py | 166 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 39 ++++-----
2 files changed, 105 insertions(+), 100 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 2431a89969..e8ded949ab 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -27,7 +27,7 @@
from functools import cached_property, partial, total_ordering
from html import unescape
from http import HTTPStatus
-from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator
+from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
from urllib.parse import quote, urlparse
import requests
@@ -427,6 +427,12 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
return lyrics
+class SearchResult(NamedTuple):
+ artist: str
+ title: str
+ url: str
+
+
class SearchBackend(Backend):
REQUIRES_BS = True
@@ -435,12 +441,12 @@ def dist_thresh(self) -> float:
return self.config["dist_thresh"].get(float)
def check_match(
- self, target_artist: str, target_title: str, artist: str, title: str
+ self, target_artist: str, target_title: str, result: SearchResult
) -> bool:
- """Check if the given artist and title are 'good enough' match."""
+ """Check if the given search result is a 'good enough' match."""
max_dist = max(
- string_dist(target_artist, artist),
- string_dist(target_title, title),
+ string_dist(target_artist, result.artist),
+ string_dist(target_title, result.title),
)
if (max_dist := round(max_dist, 2)) <= self.dist_thresh:
@@ -451,8 +457,8 @@ def check_match(
# This may show a matching candidate with some noise in the name
self.debug(
"({}, {}) does not match ({}, {}) but dist was close: {:.2f}",
- artist,
- title,
+ result.artist,
+ result.title,
target_artist,
target_title,
max_dist,
@@ -460,61 +466,62 @@ def check_match(
return False
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ """Search for the given query and yield search results."""
+ raise NotImplementedError
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ check_match = partial(self.check_match, artist, title)
+ for candidate in self.search(artist, title):
+ if check_match(candidate):
+ yield candidate
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ """Fetch lyrics for the given artist and title."""
+ for result in self.get_results(artist, title):
+ if lyrics := self.scrape(self.fetch_text(result.url)):
+ return lyrics
+
+ return None
+
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ """Scrape the lyrics from the given HTML."""
+ raise NotImplementedError
+
class Genius(SearchBackend):
"""Fetch lyrics from Genius via genius-api.
- Simply adapted from
- bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping/
+ Because genius doesn't allow accessing lyrics via the api, we first query
+ the api for a url matching our artist & title, then scrape the HTML text
+ for the JSON data containing the lyrics.
+
+ Adapted from
+ bigishdata.com/2016/09/27/getting-song-lyrics-from-geniuss-api-scraping
"""
+ SEARCH_URL = "https://api.genius.com/search"
    LYRICS_IN_JSON_RE = re.compile(r'(?<=.\\"html\\":\\").*?(?=(?<!\\)\\")')
    remove_backslash = partial(re.sub, r"\\(?=[^\\])", "")

    @cached_property
    def headers(self) -> dict[str, str]:
return {"Authorization": f'Bearer {self.config["genius_api_key"]}'}
- def fetch(self, artist: str, title: str, *_) -> str | None:
- """Fetch lyrics from genius.com
-
- Because genius doesn't allow accessing lyrics via the api,
- we first query the api for a url matching our artist & title,
- then attempt to scrape that url for the lyrics.
- """
-
- data = self.fetch_json(
- self.search_url,
- params={"q": f"{artist} {title}".lower()},
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
+ search_data: GeniusAPI.Search = self.fetch_json(
+ self.SEARCH_URL,
+ params={"q": f"{artist} {title}"},
headers=self.headers,
)
- if (url := self.find_lyrics_url(data, artist, title)) and (
- lyrics := self.scrape_lyrics(self.fetch_text(url))
- ):
- return collapse_newlines(lyrics)
-
- return None
-
- def find_lyrics_url(
- self, data: GeniusAPI.Search, artist: str, title: str
- ) -> str | None:
- """Find URL to the lyrics of the given artist and title.
-
- https://docs.genius.com/#search-h2.
- """
- check = partial(self.check_match, artist, title)
- for result in (hit["result"] for hit in data["response"]["hits"]):
- if check(result["artist_names"], result["title"]):
- return result["url"]
-
- return None
+ for r in (hit["result"] for hit in search_data["response"]["hits"]):
+ yield SearchResult(r["artist_names"], r["title"], r["url"])
- def scrape_lyrics(self, html: str) -> str | None:
- if m := self.LYRICS_IN_JSON_RE.search(html):
- html_text = self.remove_backslash(m[0]).replace(r"\n", "\n")
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ if m := cls.LYRICS_IN_JSON_RE.search(html):
+ html_text = cls.remove_backslash(m[0]).replace(r"\n", "\n")
return get_soup(html_text).get_text().strip()
return None
@@ -536,11 +543,12 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
# We are expecting to receive a 404 since we are guessing the URL.
# Thus suppress the error so that it does not end up in the logs.
with suppress(NotFoundError):
- return self.scrape_lyrics(
- self.fetch_text(self.build_url(artist, title))
- )
+ return self.scrape(self.fetch_text(self.build_url(artist, title)))
- def scrape_lyrics(self, html: str) -> str | None:
+ return None
+
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
soup = get_soup(html)
if lyrics_div := soup.select_one("div.song-text > div.inner-text"):
@@ -599,16 +607,6 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- @staticmethod
- def scrape_lyrics(html: str) -> str | None:
- soup = get_soup(html)
-
- # Get the longest text element (if any).
- strings = sorted(soup.stripped_strings, key=len, reverse=True)
- if strings:
- return strings[0]
- return None
-
def is_lyrics(self, text, artist=None):
"""Determine whether the text seems to be valid lyrics."""
if not text:
@@ -641,17 +639,11 @@ def is_lyrics(self, text, artist=None):
BY_TRANS = ["by", "par", "de", "von"]
LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
- def is_page_candidate(
- self, artist: str, title: str, url_link: str, url_title: str
- ) -> bool:
- """Return True if the URL title makes it a good candidate to be a
- page that contains lyrics of title by artist.
- """
- title_slug = slug(title)
+ def make_search_result(
+ self, artist: str, url_link: str, url_title: str
+ ) -> SearchResult:
+ """Parse artist and title from the URL title and return a search result."""
url_title_slug = slug(url_title)
- if title_slug in url_title_slug:
- return True
-
artist = slug(artist)
sitename = urlparse(url_link).netloc
@@ -666,33 +658,45 @@ def is_page_candidate(
"(%s)" % "|".join(tokens), "", url_title_slug
).strip("-")
- return self.check_match(artist, title_slug, artist, song_title)
+ return SearchResult(artist, song_title, url_link)
- def fetch(self, artist: str, title: str, *_) -> str | None:
+ def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
}
- check_candidate = partial(self.is_page_candidate, artist, title)
- for item in self.fetch_json(self.SEARCH_URL, params=params).get(
- "items", []
- ):
- url_link = item["link"]
- if not check_candidate(url_link, item.get("title", "")):
- continue
+ data = self.fetch_json(self.SEARCH_URL, params=params)
+ for item in data.get("items", []):
+ yield self.make_search_result(artist, item["link"], item["title"])
+
+ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
+ return super().get_results(artist, slug(title))
+
+ def fetch(self, artist: str, title: str, *_) -> str | None:
+ for result in self.get_results(artist, title):
with self.handle_request():
- lyrics = self.scrape_lyrics(self.fetch_text(url_link))
+ lyrics = self.scrape(self.fetch_text(result.url))
if not lyrics:
continue
if self.is_lyrics(lyrics, artist):
- self.debug("Got lyrics from {}", item["displayLink"])
+ self.debug(
+ "Got lyrics from {}", urlparse(result.url).netloc
+ )
return lyrics
return None
+ @classmethod
+ def scrape(cls, html: str) -> str | None:
+ # Get the longest text element (if any).
+ if strings := sorted(get_soup(html).stripped_strings, key=len):
+ return strings[-1]
+
+ return None
+
class LyricsPlugin(RequestHandler, plugins.BeetsPlugin):
SOURCES = ["lrclib", "google", "musixmatch", "genius", "tekstowo"]
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 1c961230e4..4a23998a8b 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -191,9 +191,9 @@ def backend(self, dist_thresh):
],
)
def test_check_match(self, backend, target_artist, artist, should_match):
- assert (
- backend.check_match(target_artist, "", artist, "") == should_match
- )
+ result = lyrics.SearchResult(artist, "", "")
+
+ assert backend.check_match(target_artist, "", result) == should_match
@pytest.fixture(scope="module")
@@ -327,31 +327,32 @@ def file_name(self):
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
- result = backend.scrape_lyrics(lyrics_html).lower()
+ result = backend.scrape(lyrics_html).lower()
assert result
assert backend.is_lyrics(result)
assert PHRASE_BY_TITLE[self.TITLE] in result
@pytest.mark.parametrize(
- "url_title, artist, should_be_candidate",
+ "url_title, artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", True),
- ("example.com | Beats song by John doe", "John Doe", True),
- ("example.com | seets bong lyrics by John doe", "John Doe", False),
- ("foo", "Sun O)))", False),
+ ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
+ ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ (
+ "example.com | seets bong lyrics by John doe",
+ "John Doe",
+ "seets-bong",
+ ),
+ ("foo", "Sun O)))", "foo"),
],
)
- def test_is_page_candidate(
- self, backend, lyrics_html, url_title, artist, should_be_candidate
+ def test_make_search_result(
+ self, backend, url_title, artist, expected_title
):
- result = backend.is_page_candidate(
- artist,
- self.TITLE,
- "http://www.example.com/lyrics/beetssong",
- url_title,
+ result = backend.make_search_result(
+ artist, "https://example.com", url_title
)
- assert bool(result) == should_be_candidate
+ assert result.title == expected_title
@pytest.mark.parametrize(
"lyrics",
@@ -385,7 +386,7 @@ def backend_name(self):
],
) # fmt: skip
def test_scrape(self, backend, lyrics_html, expected_line_count):
- result = backend.scrape_lyrics(lyrics_html) or ""
+ result = backend.scrape(lyrics_html) or ""
assert len(result.splitlines()) == expected_line_count
@@ -406,7 +407,7 @@ def backend_name(self):
],
)
def test_scrape(self, backend, lyrics_html, expecting_lyrics):
- assert bool(backend.scrape_lyrics(lyrics_html)) == expecting_lyrics
+ assert bool(backend.scrape(lyrics_html)) == expecting_lyrics
LYRICS_DURATION = 950
From 99f3c67d36d2d4c28d82724c7a974fd8f149801f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 16:36:41 +0100
Subject: [PATCH 15/23] Google: Refactor and improve
* Type the response data that Google Custom Search API return.
* Exclude some 'letras.mus.br' pages that do not contain lyrics.
* Exclude results from Musixmatch as we cannot access their pages.
* Improve parsing of the URL title:
- Handle long URL titles that get truncated (end with ellipsis) for
long searches
- Remove domains starting with 'www'
- Parse the title AND the artist. Previously this would only parse the
title, and fetch lyrics even when the artist did not match.
* Remove now redundant credits cleanup and checks for valid lyrics.
---
beetsplug/_typing.py | 31 ++++++-
beetsplug/lyrics.py | 156 ++++++++++++++++++------------------
test/plugins/test_lyrics.py | 84 ++++++++-----------
3 files changed, 141 insertions(+), 130 deletions(-)
diff --git a/beetsplug/_typing.py b/beetsplug/_typing.py
index f673b85bd5..1aa288cbcb 100644
--- a/beetsplug/_typing.py
+++ b/beetsplug/_typing.py
@@ -2,7 +2,7 @@
from typing import Any
-from typing_extensions import TypeAlias, TypedDict
+from typing_extensions import NotRequired, TypeAlias, TypedDict
JSONDict: TypeAlias = "dict[str, Any]"
@@ -84,3 +84,32 @@ class SearchResponse(TypedDict):
class Search(TypedDict):
response: GeniusAPI.SearchResponse
+
+
+class GoogleCustomSearchAPI:
+ class Response(TypedDict):
+ """Search response from the Google Custom Search API.
+
+ If the search returns no results, the :attr:`items` field is not found.
+ """
+
+ items: NotRequired[list[GoogleCustomSearchAPI.Item]]
+
+ class Item(TypedDict):
+ """A Google Custom Search API result item.
+
+ :attr:`title` field is shown to the user in the search interface, thus
+ it gets truncated with an ellipsis for longer queries. For most
+ results, the full title is available as ``og:title`` metatag found
+ under the :attr:`pagemap` field. Note neither this metatag nor the
+ ``pagemap`` field is guaranteed to be present in the data.
+ """
+
+ title: str
+ link: str
+ pagemap: NotRequired[GoogleCustomSearchAPI.Pagemap]
+
+ class Pagemap(TypedDict):
+ """Pagemap data with a single meta tags dict in a list."""
+
+ metatags: list[JSONDict]
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index e8ded949ab..35b724a2ac 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -28,7 +28,7 @@
from html import unescape
from http import HTTPStatus
from typing import TYPE_CHECKING, ClassVar, Iterable, Iterator, NamedTuple
-from urllib.parse import quote, urlparse
+from urllib.parse import quote
import requests
from unidecode import unidecode
@@ -41,7 +41,7 @@
from beets.importer import ImportTask
from beets.library import Item
- from ._typing import GeniusAPI, LRCLibAPI
+ from ._typing import GeniusAPI, GoogleCustomSearchAPI, LRCLibAPI
try:
from bs4 import BeautifulSoup
@@ -479,7 +479,9 @@ def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
def fetch(self, artist: str, title: str, *_) -> str | None:
"""Fetch lyrics for the given artist and title."""
for result in self.get_results(artist, title):
- if lyrics := self.scrape(self.fetch_text(result.url)):
+ if (html := self.fetch_text(result.url)) and (
+ lyrics := self.scrape(html)
+ ):
return lyrics
return None
@@ -557,20 +559,6 @@ def scrape(cls, html: str) -> str | None:
return None
-def remove_credits(text):
- """Remove first/last line of text if it contains the word 'lyrics'
- eg 'Lyrics by songsdatabase.com'
- """
- textlines = text.split("\n")
- credits = None
- for i in (0, -1):
- if textlines and "lyrics" in textlines[i].lower():
- credits = textlines.pop(i)
- if credits:
- text = "\n".join(textlines)
- return text
-
-
collapse_newlines = partial(re.compile(r"\n{3,}").sub, r"\n\n")
@@ -607,87 +595,97 @@ class Google(SearchBackend):
SEARCH_URL = "https://www.googleapis.com/customsearch/v1"
- def is_lyrics(self, text, artist=None):
- """Determine whether the text seems to be valid lyrics."""
- if not text:
- return False
- bad_triggers_occ = []
- nb_lines = text.count("\n")
- if nb_lines <= 1:
- self.debug("Ignoring too short lyrics '{}'", text)
- return False
- elif nb_lines < 5:
- bad_triggers_occ.append("too_short")
- else:
- # Lyrics look legit, remove credits to avoid being penalized
- # further down
- text = remove_credits(text)
+ #: Exclude some letras.mus.br pages which do not contain lyrics.
+ EXCLUDE_PAGES = [
+ "significado.html",
+ "traduccion.html",
+ "traducao.html",
+ "significados.html",
+ ]
- bad_triggers = ["lyrics", "copyright", "property", "links"]
- if artist:
- bad_triggers += [artist]
+ #: Regular expression to match noise in the URL title.
+ URL_TITLE_NOISE_RE = re.compile(
+ r"""
+\b
+(
+ paroles(\ et\ traduction|\ de\ chanson)?
+ | letras?(\ de)?
+ | liedtexte
+ | original\ song\ full\ text\.
+ | official
+ | 20[12]\d\ version
+ | (absolute\ |az)?lyrics(\ complete)?
+ | www\S+
+ | \S+\.(com|net|mus\.br)
+)
+([^\w.]|$)
+""",
+ re.IGNORECASE | re.VERBOSE,
+ )
+ #: Split cleaned up URL title into artist and title parts.
+ URL_TITLE_PARTS_RE = re.compile(r" +(?:[ :|-]+|par|by) +")
- for item in bad_triggers:
- bad_triggers_occ += [item] * len(
- re.findall(r"\W%s\W" % item, text, re.I)
- )
+ def fetch_text(self, *args, **kwargs) -> str:
+ """Handle an error so that we can continue with the next URL."""
+ with self.handle_request():
+ return super().fetch_text(*args, **kwargs)
- if bad_triggers_occ:
- self.debug("Bad triggers detected: {}", bad_triggers_occ)
- return len(bad_triggers_occ) < 2
+ @staticmethod
+ def get_part_dist(artist: str, title: str, part: str) -> float:
+ """Return the distance between the given part and the artist and title.
- BY_TRANS = ["by", "par", "de", "von"]
- LYRICS_TRANS = ["lyrics", "paroles", "letras", "liedtexte"]
+ A number between -1 and 1 is returned, where -1 means the part is
+ closer to the artist and 1 means it is closer to the title.
+ """
+ return string_dist(artist, part) - string_dist(title, part)
+ @classmethod
def make_search_result(
- self, artist: str, url_link: str, url_title: str
+ cls, artist: str, title: str, item: GoogleCustomSearchAPI.Item
) -> SearchResult:
"""Parse artist and title from the URL title and return a search result."""
- url_title_slug = slug(url_title)
- artist = slug(artist)
- sitename = urlparse(url_link).netloc
-
- # or try extracting song title from URL title and check if
- # they are close enough
- tokens = (
- [by + "-" + artist for by in self.BY_TRANS]
- + [artist, sitename, sitename.replace("www.", "")]
- + self.LYRICS_TRANS
+ url_title = (
+ # get full title from metatags if available
+ item.get("pagemap", {}).get("metatags", [{}])[0].get("og:title")
+            # default to the display title
+ or item["title"]
)
- song_title = re.sub(
- "(%s)" % "|".join(tokens), "", url_title_slug
- ).strip("-")
+ clean_title = cls.URL_TITLE_NOISE_RE.sub("", url_title).strip(" .-|")
+ # split it into parts which may be part of the artist or the title
+ # `dict.fromkeys` removes duplicates keeping the order
+ parts = list(dict.fromkeys(cls.URL_TITLE_PARTS_RE.split(clean_title)))
+
+ if len(parts) == 1:
+ part = parts[0]
+ if m := re.search(rf"(?i)\W*({re.escape(title)})\W*", part):
+ # artist and title may not have a separator
+ result_title = m[1]
+ result_artist = part.replace(m[0], "")
+ else:
+ # assume that this is the title
+ result_artist, result_title = "", parts[0]
+ else:
+ # sort parts by their similarity to the artist
+ parts.sort(key=lambda p: cls.get_part_dist(artist, title, p))
+ result_artist, result_title = parts[0], " ".join(parts[1:])
- return SearchResult(artist, song_title, url_link)
+ return SearchResult(result_artist, result_title, item["link"])
def search(self, artist: str, title: str) -> Iterable[SearchResult]:
params = {
"key": self.config["google_API_key"].as_str(),
"cx": self.config["google_engine_ID"].as_str(),
"q": f"{artist} {title}",
+ "siteSearch": "www.musixmatch.com",
+ "siteSearchFilter": "e",
+ "excludeTerms": ", ".join(self.EXCLUDE_PAGES),
}
- data = self.fetch_json(self.SEARCH_URL, params=params)
+ data: GoogleCustomSearchAPI.Response = self.fetch_json(
+ self.SEARCH_URL, params=params
+ )
for item in data.get("items", []):
- yield self.make_search_result(artist, item["link"], item["title"])
-
- def get_results(self, artist: str, title: str) -> Iterable[SearchResult]:
- return super().get_results(artist, slug(title))
-
- def fetch(self, artist: str, title: str, *_) -> str | None:
- for result in self.get_results(artist, title):
- with self.handle_request():
- lyrics = self.scrape(self.fetch_text(result.url))
- if not lyrics:
- continue
-
- if self.is_lyrics(lyrics, artist):
- self.debug(
- "Got lyrics from {}", urlparse(result.url).netloc
- )
- return lyrics
-
- return None
+ yield self.make_search_result(artist, title, item)
@classmethod
def scrape(cls, html: str) -> str | None:
diff --git a/test/plugins/test_lyrics.py b/test/plugins/test_lyrics.py
index 4a23998a8b..c7a90683a5 100644
--- a/test/plugins/test_lyrics.py
+++ b/test/plugins/test_lyrics.py
@@ -101,24 +101,6 @@ def test_search_pairs_titles(self, title, expected_extra_titles):
assert list(actual_titles) == [title, *expected_extra_titles]
- @pytest.mark.parametrize(
- "initial_lyrics, expected",
- [
- ("Verse\nLyrics credit in the last line", "Verse"),
- ("Lyrics credit in the first line\nVerse", "Verse"),
- (
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- """Verse
- Lyrics mentioned somewhere in the middle
- Verse""",
- ),
- ],
- )
- def test_remove_credits(self, initial_lyrics, expected):
- assert lyrics.remove_credits(initial_lyrics) == expected
-
@pytest.mark.parametrize(
"initial_text, expected",
[
@@ -311,8 +293,6 @@ def test_backend_source(self, lyrics_plugin, lyrics_page: LyricsPage):
class TestGoogleLyrics(LyricsBackendTest):
"""Test scraping heuristics on a fake html page."""
- TITLE = "Beets song"
-
@pytest.fixture(scope="class")
def backend_name(self):
return "google"
@@ -325,51 +305,55 @@ def plugin_config(self):
def file_name(self):
return "examplecom/beetssong"
+ @pytest.fixture
+ def search_item(self, url_title, url):
+ return {"title": url_title, "link": url}
+
def test_mocked_source_ok(self, backend, lyrics_html):
"""Test that lyrics of the mocked page are correctly scraped"""
result = backend.scrape(lyrics_html).lower()
assert result
- assert backend.is_lyrics(result)
- assert PHRASE_BY_TITLE[self.TITLE] in result
+ assert PHRASE_BY_TITLE["Beets song"] in result
@pytest.mark.parametrize(
- "url_title, artist, expected_title",
+ "url_title, expected_artist, expected_title",
[
- ("John Doe - beets song Lyrics", "John Doe", "beets-song"),
- ("example.com | Beats song by John doe", "John Doe", "beats-song"),
+ ("Artist - beets song Lyrics", "Artist", "beets song"),
+ ("www.azlyrics.com | Beats song by Artist", "Artist", "Beats song"),
+ ("lyric.com | seets bong lyrics by Artist", "Artist", "seets bong"),
+ ("foo", "", "foo"),
+ ("Artist - Beets Song lyrics | AZLyrics", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets Song", "Artist", "Beets Song"),
+ ("Letra de Artist - Beets ...", "Artist", "Beets"),
+ ("Artist Beets Song", "Artist", "Beets Song"),
+ ("BeetsSong - Artist", "Artist", "BeetsSong"),
+ ("Artist - BeetsSong", "Artist", "BeetsSong"),
+ ("Beets Song", "", "Beets Song"),
+ ("Beets Song Artist", "Artist", "Beets Song"),
(
- "example.com | seets bong lyrics by John doe",
- "John Doe",
- "seets-bong",
+ "BeetsSong (feat. Other & Another) - Artist",
+ "Artist",
+ "BeetsSong (feat. Other & Another)",
+ ),
+ (
+ (
+ "Beets song lyrics by Artist - original song full text. "
+ "Official Beets song lyrics, 2024 version | LyricsMode.com"
+ ),
+ "Artist",
+ "Beets song",
),
- ("foo", "Sun O)))", "foo"),
],
)
+ @pytest.mark.parametrize("url", ["http://doesntmatter.com"])
def test_make_search_result(
- self, backend, url_title, artist, expected_title
+ self, backend, search_item, expected_artist, expected_title
):
- result = backend.make_search_result(
- artist, "https://example.com", url_title
- )
- assert result.title == expected_title
+ result = backend.make_search_result("Artist", "Beets song", search_item)
- @pytest.mark.parametrize(
- "lyrics",
- [
- "LyricsMania.com - Copyright (c) 2013 - All Rights Reserved",
- """All material found on this site is property\n
- of mywickedsongtext brand""",
- """
-Lyricsmania staff is working hard for you to add $TITLE lyrics as soon
-as they'll be released by $ARTIST, check back soon!
-In case you have the lyrics to $TITLE and want to send them to us, fill out
-the following form.
-""",
- ],
- )
- def test_bad_lyrics(self, backend, lyrics):
- assert not backend.is_lyrics(lyrics)
+ assert result.artist == expected_artist
+ assert result.title == expected_title
class TestGeniusLyrics(LyricsBackendTest):
From 7d35c4cd78a328028b4c1809c428a0100efeeeaa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C5=A0ar=C5=ABnas=20Nejus?=
Date: Sun, 13 Oct 2024 13:34:12 +0100
Subject: [PATCH 16/23] Create Html class for cleaning up the html text
Additionally, improve HTML pre-processing:
* Ensure a new line between blocks of lyrics text from letras.mus.br.
* Parse a missing last block of lyrics text from lacocinelle.net.
* Parse a missing last block of lyrics text from paroles.net.
* Fix encoding issues with AZLyrics by setting response encoding to
None, allowing `requests` to handle it.
---
beetsplug/lyrics.py | 105 ++++++++++++++++++++++-------------
test/plugins/lyrics_pages.py | 35 +++++++++++-
test/plugins/test_lyrics.py | 41 +++++++-------
3 files changed, 119 insertions(+), 62 deletions(-)
diff --git a/beetsplug/lyrics.py b/beetsplug/lyrics.py
index 35b724a2ac..680b659094 100644
--- a/beetsplug/lyrics.py
+++ b/beetsplug/lyrics.py
@@ -57,7 +57,6 @@
except ImportError:
HAS_LANGDETECT = False
-BREAK_RE = re.compile(r"\n?\s*<br([\s|/][^>]*)*>\s*\n?", re.I)
USER_AGENT = f"beets/{beets.__version__}"
INSTRUMENTAL_LYRICS = "[Instrumental]"
@@ -222,9 +221,15 @@ def warn(self, message: str, *args) -> None:
self._log.warning(f"{self.__class__.__name__}: {message}", *args)
def fetch_text(self, url: str, **kwargs) -> str:
- """Return text / HTML data from the given URL."""
+ """Return text / HTML data from the given URL.
+
+ Set the encoding to None to let requests handle it because some sites
+ set it incorrectly.
+ """
self.debug("Fetching HTML from {}", url)
- return r_session.get(url, **kwargs).text
+ r = r_session.get(url, **kwargs)
+ r.encoding = None
+ return r.text
def fetch_json(self, url: str, **kwargs):
"""Return JSON data from the given URL."""
@@ -427,13 +432,60 @@ def fetch(self, artist: str, title: str, *_) -> str | None:
return lyrics
+class Html:
+ collapse_space = partial(re.compile(r"(^| ) +", re.M).sub, r"\1")
+    expand_br = partial(re.compile(r"\s*<br[^>]*>\s*", re.I).sub, "\n")
+ #: two newlines between paragraphs on the same line (musica, letras.mus.br)
+    merge_blocks = partial(re.compile(r"(?<!>)</p><p[^>]*>").sub, "\n\n")
+ #: a single new line between paragraphs on separate lines
+ #: (paroles.net, sweetslyrics.com, lacoccinelle.net)
+ merge_lines = partial(re.compile(r"