From 2a948ee9ab0ddda9fe39e47a6d8071744ace4c11 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 13:47:03 +0100 Subject: [PATCH 1/6] Linting --- asreviewcontrib/datatools/dedup.py | 136 ++++++++++++------------ asreviewcontrib/datatools/entrypoint.py | 12 +-- 2 files changed, 72 insertions(+), 76 deletions(-) diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py index 1f263bd..7b2e0aa 100644 --- a/asreviewcontrib/datatools/dedup.py +++ b/asreviewcontrib/datatools/dedup.py @@ -13,19 +13,18 @@ def _print_similar_list( - similar_list: list[tuple[int, int]], - data: pd.Series, - pid: str, - pids: pd.Series = None - ) -> None: - + similar_list: list[tuple[int, int]], + data: pd.Series, + pid: str, + pids: pd.Series = None, +) -> None: print_seq_matcher = SequenceMatcher() console = Console() if pids is not None: - print(f'Found similar titles or same {pid} at lines:') + print(f"Found similar titles or same {pid} at lines:") else: - print('Found similar titles at lines:') + print("Found similar titles at lines:") for i, j in similar_list: print_seq_matcher.set_seq1(data.iloc[i]) @@ -33,74 +32,75 @@ def _print_similar_list( text = Text() if pids is not None: - text.append(f'\nLines {i+1} and {j+1} ', style='bold') + text.append(f"\nLines {i + 1} and {j + 1} ", style="bold") if pids.iloc[i] == pids.iloc[j]: - text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style='dim') + text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim") else: - text.append(f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', - style='dim') + text.append( + f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim" + ) else: - text.append(f'\nLines {i+1} and {j+1}:\n', style='bold') + text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold") for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes(): - if tag == 'replace': + if tag == "replace": # add rich strikethrough - text.append(f'{data.iloc[i][i1:i2]}', style='red strike') - text.append(f'{data.iloc[j][j1:j2]}', style='green') - if tag == 'delete': - text.append(f'{data.iloc[i][i1:i2]}', style='red strike') - if tag == 'insert': - text.append(f'{data.iloc[j][j1:j2]}', style='green') - if tag == 'equal': - text.append(f'{data.iloc[i][i1:i2]}', style='dim') + text.append(f"{data.iloc[i][i1:i2]}", style="red strike") + text.append(f"{data.iloc[j][j1:j2]}", style="green") + if tag == "delete": + text.append(f"{data.iloc[i][i1:i2]}", style="red strike") + if tag == "insert": + text.append(f"{data.iloc[j][j1:j2]}", style="green") + if tag == "equal": + text.append(f"{data.iloc[i][i1:i2]}", style="dim") console.print(text) - print('') + print("") def _drop_duplicates_by_similarity( - asdata: ASReviewData, - pid: str, - similarity: float = 0.98, - skip_abstract: bool = False, - discard_stopwords: bool = False, - stopwords_language: str = 'english', - strict_similarity: bool = False, - verbose: bool = False, - ) -> None: - + asdata: ASReviewData, + pid: str, + similarity: float = 0.98, + skip_abstract: bool = False, + discard_stopwords: bool = False, + stopwords_language: str = "english", + strict_similarity: bool = False, + verbose: bool = False, +) -> None: if skip_abstract: - data = asdata.df['title'] + data = asdata.df["title"] else: data = pd.Series(asdata.texts) - symbols_regex = re.compile(r'[^ \w\d\-_]') - spaces_regex = re.compile(r'\s+') + symbols_regex = re.compile(r"[^ \w\d\-_]") + spaces_regex = re.compile(r"\s+") # clean the data s = ( - data - .apply(ftfy.fix_text) - .str.replace(symbols_regex, '', 
regex=True) - .str.replace(spaces_regex, ' ', regex=True) + data.apply(ftfy.fix_text) + .str.replace(symbols_regex, "", regex=True) + .str.replace(spaces_regex, " ", regex=True) .str.lower() .str.strip() - .replace('', None) + .replace("", None) ) if discard_stopwords: try: from nltk.corpus import stopwords + stopwords_set = set(stopwords.words(stopwords_language)) except LookupError: import nltk - nltk.download('stopwords') + + nltk.download("stopwords") stopwords_set = set(stopwords.words(stopwords_language)) - stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b') - s = s.str.replace(stopwords_regex, '', regex=True) + stopwords_regex = re.compile(rf"\b{'\\b|\\b'.join(stopwords_set)}\b") + s = s.str.replace(stopwords_regex, "", regex=True) seq_matcher = SequenceMatcher() duplicated = [False] * len(s) @@ -121,21 +121,23 @@ def _drop_duplicates_by_similarity( else: pids = asdata.df[pid] - for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'): + for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"): seq_matcher.set_seq2(text) # loop through the rest of the data if it has the same pid or similar length - for j, t in s.iloc[i+1:][(asdata.df[pid] == asdata.df.iloc[i][pid]) | - (abs(s.str.len() - len(text)) < 5)].items(): + for j, t in s.iloc[i + 1 :][ + (asdata.df[pid] == asdata.df.iloc[i][pid]) + | (abs(s.str.len() - len(text)) < 5) + ].items(): seq_matcher.set_seq1(t) # if the texts have the same pid or are similar enough, # mark the second one as duplicate - if pids.iloc[i] == pids.iloc[j] or \ - (seq_matcher.real_quick_ratio() > similarity and \ - seq_matcher.quick_ratio() > similarity and \ - (not strict_similarity or seq_matcher.ratio() > similarity)): - + if pids.iloc[i] == pids.iloc[j] or ( + seq_matcher.real_quick_ratio() > similarity + and seq_matcher.quick_ratio() > similarity + and (not strict_similarity or seq_matcher.ratio() > similarity) + ): if verbose and not duplicated[j]: similar_list.append((i, j)) @@ -145,20 +147,21 @@ def _drop_duplicates_by_similarity( _print_similar_list(similar_list, data, pid, pids) else: - print(f'Not using {pid} for deduplication because there is no such data.') + print(f"Not using {pid} for deduplication because there is no such data.") - for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'): + for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"): seq_matcher.set_seq2(text) # loop through the rest of the data if it has similar length - for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items(): + for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items(): seq_matcher.set_seq1(t) # if the texts are similar enough, mark the second one as duplicate - if seq_matcher.real_quick_ratio() > similarity and \ - seq_matcher.quick_ratio() > similarity and \ - (not strict_similarity or seq_matcher.ratio() > similarity): - + if ( + seq_matcher.real_quick_ratio() > similarity + and seq_matcher.quick_ratio() > similarity + and (not strict_similarity or seq_matcher.ratio() > similarity) + ): if verbose and not duplicated[j]: similar_list.append((i, j)) @@ -176,8 +179,7 @@ def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None: if not args.similar: if args.pid not in asdata.df.columns: print( - f'Not using {args.pid} for deduplication ' - 'because there is no such data.' + f"Not using {args.pid} for deduplication because there is no such data." 
) # retrieve deduplicated ASReview data object @@ -193,19 +195,13 @@ def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None: args.stopwords_language, args.strict, args.verbose, - ) + ) # count duplicates n_dup = initial_length - len(asdata.df) if args.output_path: asdata.to_file(args.output_path) - print( - f'Removed {n_dup} duplicates from dataset with' - f' {initial_length} records.' - ) + print(f"Removed {n_dup} duplicates from dataset with {initial_length} records.") else: - print( - f'Found {n_dup} duplicates in dataset with' - f' {initial_length} records.' - ) + print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py index b38824a..28d33c0 100644 --- a/asreviewcontrib/datatools/entrypoint.py +++ b/asreviewcontrib/datatools/entrypoint.py @@ -62,7 +62,7 @@ def execute(self, argv): ) dedup_parser.add_argument( "--similar", - action='store_true', + action="store_true", help="Drop similar records.", ) dedup_parser.add_argument( @@ -73,17 +73,17 @@ def execute(self, argv): ) dedup_parser.add_argument( "--title_only", - action='store_true', + action="store_true", help="Use only title for deduplication.", ) dedup_parser.add_argument( "--stopwords", - action='store_true', + action="store_true", help="Ignore stopwords for deduplication, focusing on main words.", ) dedup_parser.add_argument( "--strict", - action='store_true', + action="store_true", help="Use a more strict similarity for deduplication.", ) dedup_parser.add_argument( @@ -94,7 +94,7 @@ def execute(self, argv): ) dedup_parser.add_argument( "--verbose", - action='store_true', + action="store_true", help="Print verbose output.", ) @@ -141,7 +141,7 @@ def execute(self, argv): "subcommand", nargs="?", default=None, - help=f"The datatool to launch. Available commands:\n\n" f"{DATATOOLS}", + help=f"The datatool to launch. 
Available commands:\n\n{DATATOOLS}",
     )
     parser.add_argument(
         "-V",

From da7e8c797cb50ef29b1c9e4cb9e5dfb16e895c36 Mon Sep 17 00:00:00 2001
From: Peter Lombaers
Date: Thu, 6 Feb 2025 14:27:15 +0100
Subject: [PATCH 2/6] Fix tests and improve dedup function arguments

---
 asreviewcontrib/datatools/dedup.py | 96 ++++++++++++++++++++----------
 tests/test_dedup.py                | 88 ++++++++-------------------
 2 files changed, 90 insertions(+), 94 deletions(-)

diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py
index 7b2e0aa..91d5990 100644
--- a/asreviewcontrib/datatools/dedup.py
+++ b/asreviewcontrib/datatools/dedup.py
@@ -1,5 +1,4 @@
 import re
-from argparse import Namespace
 from difflib import SequenceMatcher
 
 import ftfy
@@ -63,14 +62,13 @@ def _print_similar_list(
 def _drop_duplicates_by_similarity(
     asdata: ASReviewData,
     pid: str,
-    similarity: float = 0.98,
-    skip_abstract: bool = False,
-    discard_stopwords: bool = False,
-    stopwords_language: str = "english",
-    strict_similarity: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
     verbose: bool = False,
 ) -> None:
-    if skip_abstract:
+    if title_only:
         data = asdata.df["title"]
     else:
         data = pd.Series(asdata.texts)
@@ -88,7 +86,7 @@ def _drop_duplicates_by_similarity(
         .replace("", None)
     )
 
-    if discard_stopwords:
+    if stopwords_language:
         try:
             from nltk.corpus import stopwords
 
@@ -134,9 +132,9 @@ def _drop_duplicates_by_similarity(
             # if the texts have the same pid or are similar enough,
             # mark the second one as duplicate
             if pids.iloc[i] == pids.iloc[j] or (
-                seq_matcher.real_quick_ratio() > similarity
-                and seq_matcher.quick_ratio() > similarity
-                and (not strict_similarity or seq_matcher.ratio() > similarity)
+                seq_matcher.real_quick_ratio() > threshold
+                and seq_matcher.quick_ratio() > threshold
+                and (not strict or seq_matcher.ratio() > threshold)
             ):
                 if verbose and not duplicated[j]:
                     similar_list.append((i, j))
@@ -158,9 +156,9 @@ def _drop_duplicates_by_similarity(
 
             # if the texts are similar enough, mark the second one as duplicate
             if (
-                seq_matcher.real_quick_ratio() > similarity
-                and seq_matcher.quick_ratio() > similarity
-                and (not strict_similarity or seq_matcher.ratio() > similarity)
+                seq_matcher.real_quick_ratio() > threshold
+                and seq_matcher.quick_ratio() > threshold
+                and (not strict or seq_matcher.ratio() > threshold)
             ):
                 if verbose and not duplicated[j]:
                     similar_list.append((i, j))
@@ -173,35 +171,71 @@ def _drop_duplicates_by_similarity(
     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
 
 
-def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None:
+def deduplicate_data(
+    asdata: ASReviewData,
+    output_path: str = None,
+    pid: str = "doi",
+    similar: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Deduplicate an ASReview data object.
+
+    Parameters
+    ----------
+    asdata : ASReviewData
+        The data object.
+    output_path : str, optional
+        If provided, the deduplicated data object is stored at this location. By
+        default None.
+    pid : str, optional
+        Principal identifier to use for deduplication, by default "doi"
+    similar : bool, optional
+        Whether to deduplicate 'similar' records. The similarity of the records is
+        calculated using the `SequenceMatcher` from `difflib`. By default False.
+    threshold : float, optional
+        Threshold score above which two records are considered duplicates.
+        By default 0.98. Only applies if `similar` is set to `True`.
+    title_only : bool, optional
+        Only use the title for deduplication, by default False
+    stopwords_language : str, optional
+        Remove stopwords from this language before deduplicating, for example 'english'.
+        By default None. Only applies if `similar` is set to `True`.
+    strict : bool, optional
+        Use a stricter algorithm to calculate the similarity between records.
+        By default False. Only applies if `similar` is set to `True`.
+    verbose : bool, optional
+        Get verbose output during deduplication. By default False. Only applies if
+        `similar` is set to `True`.
+    """
     initial_length = len(asdata.df)
 
-    if not args.similar:
-        if args.pid not in asdata.df.columns:
-            print(
-                f"Not using {args.pid} for deduplication because there is no such data."
-            )
+    if not similar:
+        if pid not in asdata.df.columns:
+            print(f"Not using {pid} for deduplication because there is no such data.")
 
         # retrieve deduplicated ASReview data object
-        asdata.drop_duplicates(pid=args.pid, inplace=True)
+        asdata.drop_duplicates(pid=pid, inplace=True)
 
     else:
         _drop_duplicates_by_similarity(
-            asdata,
-            args.pid,
-            args.threshold,
-            args.title_only,
-            args.stopwords,
-            args.stopwords_language,
-            args.strict,
-            args.verbose,
+            asdata=asdata,
+            pid=pid,
+            threshold=threshold,
+            title_only=title_only,
+            stopwords_language=stopwords_language,
+            strict=strict,
+            verbose=verbose,
         )
 
     # count duplicates
     n_dup = initial_length - len(asdata.df)
 
-    if args.output_path:
-        asdata.to_file(args.output_path)
+    if output_path:
+        asdata.to_file(output_path)
         print(f"Removed {n_dup} duplicates from dataset with {initial_length} records.")
     else:
         print(f"Found {n_dup} duplicates in dataset with {initial_length} records.")
diff --git a/tests/test_dedup.py b/tests/test_dedup.py
index b7be1e4..4547a43 100644
--- a/tests/test_dedup.py
+++ b/tests/test_dedup.py
@@ -1,4 +1,3 @@
-from argparse import Namespace
 from pathlib import Path
 
 from asreview.data import ASReviewData
@@ -10,7 +9,7 @@
 file_with_doi = Path(test_dir, "demo_data", "duplicate_data_with_doi.csv")
 
 
-def test_dedup_without_doi(tmpdir):
+def test_dedup_without_doi():
     """
     Test deduplication without DOI.
 
@@ -22,18 +21,21 @@ def test_dedup_without_doi(tmpdir):
     Not using doi for deduplication because there is no such data.
     Found 1 duplicates in dataset with 5 records.
     """
+    data = ASReviewData.from_file(file_without_doi)
+    deduplicate_data(data)
+    assert len(data.df) == 4
+
+
+def test_output(tmpdir):
     data = ASReviewData.from_file(file_without_doi)
     output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=False, output_path=output_path)
-    deduplicate_data(data, args)
+    deduplicate_data(data, output_path=output_path)
     as_test = ASReviewData.from_file(output_path)
+    assert len(data.df) == 4
+    assert len(as_test.df) == 4
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 4, "Deduplicated data should have 4 records."
 
-def test_dedup_with_doi(tmpdir):
+def test_dedup_with_doi():
     """
     Test deduplication with DOI.
 
@@ -46,17 +48,11 @@ def test_dedup_with_doi(tmpdir):
     Found 2 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=False, output_path=output_path)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 3, "Deduplicated data should have 3 records."
+    deduplicate_data(data)
+    assert len(data.df) == 3
 
 
-def test_dedup_with_similarity_without_doi(tmpdir):
+def test_dedup_with_similarity_without_doi():
     """
     Test deduplication with similarity without DOI.
 
@@ -71,17 +67,11 @@ def test_dedup_with_similarity_without_doi(tmpdir):
     Found 2 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_without_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=True, output_path=output_path, threshold=0.95)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 3, "Deduplicated data should have 3 records."
+    deduplicate_data(data, similar=True, threshold=0.95)
+    assert len(data.df) == 3
 
 
-def test_dedup_with_similarity_with_doi(tmpdir):
+def test_dedup_with_similarity_with_doi():
     """
     Test deduplication with similarity with DOI.
 
@@ -96,17 +86,11 @@ def test_dedup_with_similarity_with_doi(tmpdir):
     Found 3 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=True, output_path=output_path, threshold=0.95)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 2, "Deduplicated data should have 2 records."
+    deduplicate_data(data, similar=True, threshold=0.95)
+    assert len(data.df) == 2
 
 
-def test_dedup_with_similarity_without_doi_stopwords(tmpdir):
+def test_dedup_with_similarity_without_doi_stopwords():
     """
     Test deduplication with similarity without DOI and removing stopwords.
 
@@ -122,22 +106,11 @@ def test_dedup_with_similarity_without_doi_stopwords(tmpdir):
     Found 3 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_without_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(
-        similar=True,
-        output_path=output_path,
-        threshold=0.95,
-        stopwords=True,
-    )
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
+    deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
+    assert len(data.df) == 2
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 2, "Deduplicated data should have 2 records."
 
-def test_dedup_with_similarity_with_doi_stopwords(tmpdir):
+def test_dedup_with_similarity_with_doi_stopwords():
     """
     Test deduplication with similarity with DOI and removing stopwords.
 
@@ -153,16 +126,5 @@ def test_dedup_with_similarity_with_doi_stopwords(tmpdir):
     Found 4 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(
-        similar=True,
-        output_path=output_path,
-        threshold=0.95,
-        stopwords=True,
-    )
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 1, "Deduplicated data should have 1 record."
+    deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
+    assert len(data.df) == 1

From f96625929c78530f400a7338834add312ae114d6 Mon Sep 17 00:00:00 2001
From: Peter Lombaers
Date: Thu, 6 Feb 2025 14:53:09 +0100
Subject: [PATCH 3/6] Fix dedup entrypoint

---
 asreviewcontrib/datatools/entrypoint.py | 54 ++++++++++++++++++------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index 28d33c0..e1094f3 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -52,7 +52,7 @@ def execute(self, argv):
             "-o",
             default=None,
             type=str,
-            help="The file path of the dataset.",
+            help="The file path of the output dataset.",
         )
         dedup_parser.add_argument(
             "--pid",
@@ -63,46 +63,72 @@ def execute(self, argv):
         dedup_parser.add_argument(
             "--similar",
             action="store_true",
-            help="Drop similar records.",
+            help=(
+                "Drop similar records, not only exactly matching records. The"
+                " Ratcliff-Obershelp algorithm is used to calculate the"
+                " similarity of records."
+            ),
         )
         dedup_parser.add_argument(
             "--threshold",
            default=0.98,
             type=float,
-            help="Similarity threshold for deduplication. Default: 0.98.",
+            help=(
+                "Records with a similarity score above this threshold are"
+                " considered duplicates. Default: 0.98. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--title_only",
             action="store_true",
-            help="Use only title for deduplication.",
+            help=(
+                "Use only the title for deduplication. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
-            "--stopwords",
-            action="store_true",
-            help="Ignore stopwords for deduplication, focusing on main words.",
-        )
-        dedup_parser.add_argument(
             "--strict",
             action="store_true",
-            help="Use a more strict similarity for deduplication.",
+            help=(
+                "Use a stricter version of the similarity algorithm. Only"
+                " applies if --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--stopwords_language",
-            default="english",
+            default=None,
             type=str,
-            help="Language for stopwords. Default: english.",
+            help=(
+                "Remove stopwords from this language before calculating"
+                " similarity. For example 'english'. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--verbose",
             action="store_true",
-            help="Print verbose output.",
+            help=(
+                "Print verbose output. Only applies if --similar is"
+                " used."
+ ), ) args_dedup = dedup_parser.parse_args(argv[1:]) # read data in ASReview data object asdata = load_data(args_dedup.input_path) - deduplicate_data(asdata, args_dedup) + deduplicate_data( + asdata=asdata, + output_path=args_dedup.output_path, + pid=args_dedup.pid, + similar=args_dedup.similar, + threshold=args_dedup.threshold, + title_only=args_dedup.title_only, + stopwords_language=args_dedup.stopwords_language, + strict=args_dedup.strict, + verbose=args_dedup.verbose, + ) if argv[0] == "compose": args_compose_parser = _parse_arguments_compose() From a30ff6beefdbd528f9d0304ac9babb4e4d52d37c Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 14:55:33 +0100 Subject: [PATCH 4/6] Add test for similarity threshold 0 --- tests/test_dedup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 4547a43..d6e2392 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -128,3 +128,9 @@ def test_dedup_with_similarity_with_doi_stopwords(): data = ASReviewData.from_file(file_with_doi) deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english") assert len(data.df) == 1 + + +def test_threshold_zero(): + data = ASReviewData.from_file(file_with_doi) + deduplicate_data(data, similar=True, threshold=0) + assert len(data.df) == 1 From ac640925f8bc409d1889aede61999cb894a90324 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 14:56:51 +0100 Subject: [PATCH 5/6] Simplify print logic --- asreviewcontrib/datatools/dedup.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py index 91d5990..4783394 100644 --- a/asreviewcontrib/datatools/dedup.py +++ b/asreviewcontrib/datatools/dedup.py @@ -231,11 +231,9 @@ def deduplicate_data( verbose=verbose, ) - # count duplicates - n_dup = initial_length - len(asdata.df) - if output_path: asdata.to_file(output_path) - print(f"Removed {n_dup} duplicates from dataset with {initial_length} records.") - else: - print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") + + # count duplicates + n_dup = initial_length - len(asdata.df) + print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") From e28f3097044520430f14f60d9eccb5b7a565c1fb Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 15:42:34 +0100 Subject: [PATCH 6/6] Simplify logic --- asreviewcontrib/datatools/dedup.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py index 4783394..1d26ea3 100644 --- a/asreviewcontrib/datatools/dedup.py +++ b/asreviewcontrib/datatools/dedup.py @@ -103,11 +103,7 @@ def _drop_duplicates_by_similarity( seq_matcher = SequenceMatcher() duplicated = [False] * len(s) - if verbose: - similar_list = [] - else: - similar_list = None - + similar_list = [] if pid in asdata.df.columns: if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]): pids = asdata.df[pid].str.strip().replace("", None) @@ -115,7 +111,6 @@ def _drop_duplicates_by_similarity( pids = pids.str.lower().str.replace( r"^https?://(www\.)?doi\.org/", "", regex=True ) - else: pids = asdata.df[pid] @@ -136,14 +131,10 @@ def _drop_duplicates_by_similarity( and seq_matcher.quick_ratio() > threshold and (not strict or seq_matcher.ratio() > threshold) ): - if verbose and not duplicated[j]: + if not duplicated[j]: similar_list.append((i, j)) - duplicated[j] = 
True - if verbose: - _print_similar_list(similar_list, data, pid, pids) - else: print(f"Not using {pid} for deduplication because there is no such data.") @@ -160,15 +151,12 @@ def _drop_duplicates_by_similarity( and seq_matcher.quick_ratio() > threshold and (not strict or seq_matcher.ratio() > threshold) ): - if verbose and not duplicated[j]: + if not duplicated[j]: similar_list.append((i, j)) - duplicated[j] = True - - if verbose: - _print_similar_list(similar_list, data, pid) - asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True) + if verbose: + _print_similar_list(similar_list, data, pid) def deduplicate_data(