Fix: abstract limit (#268)

Patches sent by mail by @tiosgz and applied with `git am`. It's my very first time using this kind of [git email workflow](https://git-send-email.io/). Phew! Commands used:

```sh
git am --3way --ignore-space-change v2-0001-fix-retrieving-article-description.patch
git am --3way --ignore-space-change v2-0002-docs-configuration-fix-update-abstract_-chars_cou.patch
git am --3way --ignore-space-change v2-0003-tests-add-test-cases-for-abstract_delimiter.patch
```

Supersedes #202

cc @craigbox @YDX-2147483647
Guts · Apr 23, 2024 · 0fb63da · 0fb63da
2 parents a08052a + d81039d
commit 0fb63da
Show file tree

Hide file tree

Showing 5 changed files with 107 additions and 41 deletions.
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -240,28 +240,26 @@ Output:
 
 ### `abstract_chars_count`: item description length
 
-To fill each [item description element](https://www.w3schools.com/xml/rss_tag_title_link_description_item.asp):
+Used, in combination with `abstract_delimiter`, to determine each [item description element](https://www.w3schools.com/xml/rss_tag_title_link_description_item.asp):
 
 - If this value is set to `-1`, then the articles' full HTML content will be filled into the description element.
-- be careful: if set to `0` and there is no description, the feed's compliance is broken (an item must have a description)
 - Otherwise, the plugin first tries to retrieve the value of the keyword `description` from the [page metadata].
-- If the value is non-negative and no `description` meta is found, then the plugin retrieves the first number of characters of the page content defined by this setting. Retrieved content is the raw markdown converted roughly into HTML.
+- If that fails and `abstract_delimiter` is found in the page, the article content up to (but not including) the delimiter is used.
+- If both steps above fail, the plugin takes the first `abstract_chars_count` characters of the page's raw Markdown and converts them roughly into HTML.
+
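+Be careful: if this value is set to `0` and neither the `description` meta nor `abstract_delimiter` yields a description, the item description is left empty and the feed is no longer compliant (an item must have a description).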
 
 `abstract_chars_count`: number of characters to use as item description.
 
 Default: `150`
 
 ----
 
-#### `abstract_delimiter`: abstract delimiter
-
-Used to fill each [item description element](https://www.w3schools.com/xml/rss_tag_title_link_description_item.asp):
+### `abstract_delimiter`: abstract delimiter
 
-- If this value is set to `-1`, then the full HTML content will be filled into the description element.
-- Otherwise, the plugin first tries to retrieve the value of the key `description` from the page metadata.
-- If the value is non-negative and no `description` meta is found, then the plugin retrieves the first number of characters of the page content defined by this setting. Retrieved content is the raw markdown converted rougthly into HTML (i.e. without extension, etc.).
+Please see `abstract_chars_count` for how this setting is used. A value of `""` (the empty string) disables this step.
 
-`abstract_delimiter`: string to mark .
+`abstract_delimiter`: string to mark where the description ends.
 
 Default: `<!-- more -->`
 

diff --git a/mkdocs_rss_plugin/util.py b/mkdocs_rss_plugin/util.py
@@ -455,7 +455,8 @@ def get_description_or_abstract(
         self, in_page: Page, chars_count: int = 160, abstract_delimiter: str = None
     ) -> str:
         """Returns description from page meta. If it doesn't exist, use the \
-        {chars_count} first characters from page content (in markdown).
+        page content up to {abstract_delimiter} or the {chars_count} first \
+        characters from page content (in markdown).
 
         :param Page in_page: page to look at
         :param int chars_count: if page.meta.description is not set, number of chars \
@@ -468,22 +469,16 @@ def get_description_or_abstract(
 
         description = in_page.meta.get("description")
 
-        # Set chars_count to None if it is set to be unlimited, for slicing.
-        if chars_count < 0:
-            chars_count = None
-
-        # If the abstract chars is not unlimited and the description exists,
-        # return the description.
-        if description and chars_count is not None:
+        # If the full page is wanted (unlimited chars count)
+        if chars_count == -1 and (in_page.content or in_page.markdown):
+            if in_page.content:
+                return in_page.content
+            else:
+                return markdown.markdown(in_page.markdown, output_format="html5")
+        # If the description is explicitly given
+        elif description:
             return description
-        # If no description and chars_count set to 0, return empty string
-        elif not description and chars_count == 0:
-            logger.warning(
-                f"No description set for page {in_page.file.src_uri} "
-                "and 'abstract_chars_count' set to 0. The feed won't be compliant, "
-                "because an item must have a description."
-            )
-            return ""
+        # If the abstract is cut by the delimiter
         elif (
             abstract_delimiter
             and (
@@ -495,24 +490,23 @@ def get_description_or_abstract(
                 in_page.markdown[:excerpt_separator_position],
                 output_format="html5",
             )
-        # If chars count is unlimited, use the html content
-        elif in_page.content and chars_count == -1:
-            if chars_count is None or len(in_page.content) < chars_count:
-                return in_page.content[:chars_count]
-        # Use markdown
-        elif in_page.markdown:
-            if chars_count is None or len(in_page.markdown) < chars_count:
-                return markdown.markdown(
-                    in_page.markdown[:chars_count], output_format="html5"
-                )
+        # Use first chars_count from the markdown
+        elif chars_count > 0 and in_page.markdown:
+            if len(in_page.markdown) <= chars_count:
+                return markdown.markdown(in_page.markdown, output_format="html5")
             else:
                 return markdown.markdown(
                     f"{in_page.markdown[: chars_count - 3]}...",
                     output_format="html5",
                 )
-        # Unlimited chars_count but no content is found, then return the description.
+        # No explicit description and no (or empty) abstract found
         else:
-            return description if description else ""
+            logger.warning(
+                f"No description generated from metadata or content of the page {in_page.file.src_uri}, "
+                "therefore the feed won't be compliant, "
+                "because an item must have a description."
+            )
+            return ""
 
     def get_image(self, in_page: Page, base_url: str) -> Optional[Tuple[str, str, int]]:
         """Get page's image from page meta or social cards and returns properties.

diff --git a/tests/fixtures/docs/page_without_meta_early_delimiter.md b/tests/fixtures/docs/page_without_meta_early_delimiter.md
@@ -0,0 +1,5 @@
+# Page without meta with early delimiter
+
+<!-- more -->
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
diff --git a/tests/fixtures/mkdocs_item_delimiter_empty.yml b/tests/fixtures/mkdocs_item_delimiter_empty.yml
@@ -0,0 +1,23 @@
+# Project information
+site_name: MkDocs RSS Plugin - TEST
+site_description: Basic setup to test against MkDocs RSS plugin
+site_author: Julien Moura (Guts)
+site_url: https://guts.github.io/mkdocs-rss-plugin
+copyright: "Guts - In Geo Veritas"
+
+# Repository
+repo_name: "guts/mkdocs-rss-plugin"
+repo_url: "https://github.com/guts/mkdocs-rss-plugin"
+
+use_directory_urls: true
+
+plugins:
+  - rss:
+      abstract_delimiter: ""
+
+theme:
+  name: readthedocs
+
+# Extensions to enhance markdown
+markdown_extensions:
+  - meta
diff --git a/tests/test_build.py b/tests/test_build.py
@@ -403,9 +403,55 @@ def test_simple_build_item_length_unlimited(self):
                     "Page without meta with short text",
                     "Blog sample",
                 ):
-                    self.assertGreaterEqual(
-                        len(feed_item.description), 150, feed_item.title
-                    )
+                    self.assertGreater(len(feed_item.description), 150, feed_item.title)
+
+    def test_simple_build_item_delimiter(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            cli_result = self.build_docs_setup(
+                testproject_path="docs",
+                mkdocs_yml_filepath=Path("tests/fixtures/mkdocs_minimal.yml"),
+                output_path=tmpdirname,
+                strict=True,
+            )
+            if cli_result.exception is not None:
+                e = cli_result.exception
+                logger.debug(format_exception(type(e), e, e.__traceback__))
+
+            self.assertEqual(cli_result.exit_code, 0)
+            self.assertIsNone(cli_result.exception)
+
+            # created items
+            feed_parsed = feedparser.parse(Path(tmpdirname) / OUTPUT_RSS_FEED_CREATED)
+            self.assertEqual(feed_parsed.bozo, 0)
+
+            for feed_item in feed_parsed.entries:
+                if feed_item.title in ("Page without meta with early delimiter",):
+                    self.assertLess(len(feed_item.description), 50, feed_item.title)
+
+    def test_simple_build_item_delimiter_empty(self):
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            cli_result = self.build_docs_setup(
+                testproject_path="docs",
+                mkdocs_yml_filepath=Path(
+                    "tests/fixtures/mkdocs_item_delimiter_empty.yml"
+                ),
+                output_path=tmpdirname,
+                strict=True,
+            )
+            if cli_result.exception is not None:
+                e = cli_result.exception
+                logger.debug(format_exception(type(e), e, e.__traceback__))
+
+            self.assertEqual(cli_result.exit_code, 0)
+            self.assertIsNone(cli_result.exception)
+
+            # created items
+            feed_parsed = feedparser.parse(Path(tmpdirname) / OUTPUT_RSS_FEED_CREATED)
+            self.assertEqual(feed_parsed.bozo, 0)
+
+            for feed_item in feed_parsed.entries:
+                if feed_item.title in ("Page without meta with early delimiter",):
+                    self.assertGreater(len(feed_item.description), 150, feed_item.title)
 
     def test_simple_build_locale_with_territory(self):
         with tempfile.TemporaryDirectory() as tmpdirname: