From 961d3bbd28d134280ebff30552ae209ee4f26b5e Mon Sep 17 00:00:00 2001
From: Seth Morton <seth.m.morton@gmail.com>
Date: Sat, 29 Jan 2022 17:17:15 -0800
Subject: [PATCH 1/3] Add tests to demonstrate the PATH ext bug

I'm not sure sure it is *actually* a bug, but the PATH algorithm's way
of splitting extensions was over-zealous and in practice will split off
more extensions that is probably desired. To fix this, we will need to
add a heuristic, but this commit adds tests to demonstrate the problem.
---
 tests/test_natsorted.py | 15 +++++++++++++++
 tests/test_utils.py     | 25 +++++++++++++++++++++----
 2 files changed, 36 insertions(+), 4 deletions(-)

diff --git a/tests/test_natsorted.py b/tests/test_natsorted.py
index 4a64a27..eb3aefe 100644
--- a/tests/test_natsorted.py
+++ b/tests/test_natsorted.py
@@ -182,6 +182,21 @@ def test_natsorted_handles_numbers_and_filesystem_paths_simultaneously() -> None
     assert natsorted(given, alg=ns.PATH) == expected
 
 
+def test_natsorted_path_extensions_heuristic() -> None:
+    # https://github.com/SethMMorton/natsort/issues/145
+    given = [
+        "Try.Me.Bug - 09 - One.Two.Three.[text].mkv",
+        "Try.Me.Bug - 07 - One.Two.5.[text].mkv",
+        "Try.Me.Bug - 08 - One.Two.Three[text].mkv",
+    ]
+    expected = [
+        "Try.Me.Bug - 07 - One.Two.5.[text].mkv",
+        "Try.Me.Bug - 08 - One.Two.Three[text].mkv",
+        "Try.Me.Bug - 09 - One.Two.Three.[text].mkv",
+    ]
+    assert natsorted(given, alg=ns.PATH) == expected
+
+
 @pytest.mark.parametrize(
     "alg, expected",
     [
diff --git a/tests/test_utils.py b/tests/test_utils.py
index bb229b9..b140682 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -6,7 +6,7 @@
 import string
 from itertools import chain
 from operator import neg as op_neg
-from typing import List, Pattern, Union
+from typing import List, Pattern, Tuple, Union
 
 import pytest
 from hypothesis import given
@@ -155,9 +155,26 @@ def test_path_splitter_splits_path_string_by_sep(x: List[str]) -> None:
     assert tuple(utils.path_splitter(z)) == tuple(pathlib.Path(z).parts)
 
 
-def test_path_splitter_splits_path_string_by_sep_and_removes_extension_example() -> None:
-    given = "/this/is/a/path/file.x1.10.tar.gz"
-    expected = (os.sep, "this", "is", "a", "path", "file.x1.10", ".tar", ".gz")
+@pytest.mark.parametrize(
+    "given, expected",
+    [
+        (
+            "/this/is/a/path/file.x1.10.tar.gz",
+            (os.sep, "this", "is", "a", "path", "file.x1.10", ".tar", ".gz"),
+        ),
+        (
+            "/this/is/a/path/file.x1.10.tar",
+            (os.sep, "this", "is", "a", "path", "file.x1.10", ".tar"),
+        ),
+        (
+            "/this/is/a/path/file.x1.threethousand.tar",
+            (os.sep, "this", "is", "a", "path", "file.x1.threethousand", ".tar"),
+        ),
+    ],
+)
+def test_path_splitter_splits_path_string_by_sep_and_removes_extension_example(
+    given: str, expected: Tuple[str, ...]
+) -> None:
     assert tuple(utils.path_splitter(given)) == tuple(expected)
 
 

From 9aad50ddc877cf97b9141604cd198b3b06d88e63 Mon Sep 17 00:00:00 2001
From: Seth Morton <seth.m.morton@gmail.com>
Date: Sat, 29 Jan 2022 17:20:14 -0800
Subject: [PATCH 2/3] Add some limiting heuristics to the PATH suffix splitting

The prior algorithm went as follows: Obtain ALL suffixes from the base
component of the filename. Then, starting from the back, keep the
suffixes split until a suffix is encountered that begins with the
regular expression /.\d/. It was assumed that this was intended to be a
floating point number, and not an extension, and thus the splitting
would stop at that point.

Some input has been seen where the filenames are composed nearly entirely
of Word.then.dot.and.then.dot. One entry amongst them contained
Word.then.dot.5.then.dot. This caused this one entry to be treated
differently from the rest of the entries due to the ".5", and the
sorting order was not as expected.

The new algorithm is as follows: Obtain a maxium of two suffixes. Keep
these suffixes until one of them has a length greater than 4 or starts
with the regular expression /.\d/.

This heuristic of course is not bullet-proof, but it will do a better
job on most real-world filenames than the previous algorithm.
---
 natsort/utils.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/natsort/utils.py b/natsort/utils.py
index 8d56b06..3832318 100644
--- a/natsort/utils.py
+++ b/natsort/utils.py
@@ -893,16 +893,21 @@ def path_splitter(
         path_parts = []
         base = str(s)
 
-    # Now, split off the file extensions until we reach a decimal number at
-    # the beginning of the suffix or there are no more extensions.
-    suffixes = PurePath(base).suffixes
-    try:
-        digit_index = next(i for i, x in enumerate(reversed(suffixes)) if _d_match(x))
-    except StopIteration:
-        pass
-    else:
-        digit_index = len(suffixes) - digit_index
-        suffixes = suffixes[digit_index:]
-
+    # Now, split off the file extensions until
+    #  - we reach a decimal number at the beginning of the suffix
+    #  - more than two suffixes have been seen
+    #  - a suffix is more than five characters (including leading ".")
+    #  - there are no more extensions
+    suffixes = []
+    for i, suffix in enumerate(reversed(PurePath(base).suffixes)):
+        if _d_match(suffix) or i > 1 or len(suffix) > 5:
+            break
+        suffixes.append(suffix)
+    suffixes.reverse()
+
+    # Remove the suffixes from the base component
     base = base.replace("".join(suffixes), "")
-    return filter(None, ichain(path_parts, [base], suffixes))
+    base_component = [base] if base else []
+
+    # Join all path comonents in an iterator
+    return filter(None, ichain(path_parts, base_component, suffixes))

From 4832c1506833ea40592bc79d638e233495eb3558 Mon Sep 17 00:00:00 2001
From: Seth Morton <seth.m.morton@gmail.com>
Date: Sat, 29 Jan 2022 22:30:49 -0800
Subject: [PATCH 3/3] Update changelog

---
 CHANGELOG.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2f81376..b1475fc 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,18 +1,22 @@
 Unreleased
 ---
 
+### Changed
+- When using `ns.PATH`, only split off a maximum of two suffixes from
+  a file name (issues #145, #146).
+
 [8.0.2] - 2021-12-14
 ---
 
 ### Fixed
-- Bug where sorting paths fail if one of the paths is '.'.
+- Bug where sorting paths fail if one of the paths is '.' (issues #142, #143)
 
 [8.0.1] - 2021-12-10
 ---
 
 ### Fixed
 - Compose unicode characters when using locale to ensure sorting is correct
-  across all locales.
+  across all locales (issues #140, #141)
 
 [8.0.0] - 2021-11-03
 ---