
Commit

Fix tests and clean up new deduplication algorithm code (#54)
PeterLombaers authored Feb 6, 2025
1 parent 3d9b906 commit 32c5e3b
Showing 3 changed files with 201 additions and 191 deletions.
236 changes: 126 additions & 110 deletions asreviewcontrib/datatools/dedup.py
@@ -1,5 +1,4 @@
 import re
-from argparse import Namespace
 from difflib import SequenceMatcher
 
 import ftfy
@@ -13,199 +12,216 @@


 def _print_similar_list(
-        similar_list: list[tuple[int, int]],
-        data: pd.Series,
-        pid: str,
-        pids: pd.Series = None
-) -> None:
-
+    similar_list: list[tuple[int, int]],
+    data: pd.Series,
+    pid: str,
+    pids: pd.Series = None,
+) -> None:
     print_seq_matcher = SequenceMatcher()
     console = Console()
 
     if pids is not None:
-        print(f'Found similar titles or same {pid} at lines:')
+        print(f"Found similar titles or same {pid} at lines:")
     else:
-        print('Found similar titles at lines:')
+        print("Found similar titles at lines:")
 
     for i, j in similar_list:
         print_seq_matcher.set_seq1(data.iloc[i])
         print_seq_matcher.set_seq2(data.iloc[j])
         text = Text()
 
         if pids is not None:
-            text.append(f'\nLines {i+1} and {j+1} ', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1} ", style="bold")
             if pids.iloc[i] == pids.iloc[j]:
-                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style='dim')
+                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim")
             else:
-                text.append(f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n',
-                            style='dim')
+                text.append(
+                    f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim"
+                )
 
         else:
-            text.append(f'\nLines {i+1} and {j+1}:\n', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold")
 
         for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():
-            if tag == 'replace':
+            if tag == "replace":
                 # add rich strikethrough
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'delete':
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-            if tag == 'insert':
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'equal':
-                text.append(f'{data.iloc[i][i1:i2]}', style='dim')
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "delete":
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+            if tag == "insert":
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "equal":
+                text.append(f"{data.iloc[i][i1:i2]}", style="dim")
 
         console.print(text)
 
-    print('')
+    print("")


 def _drop_duplicates_by_similarity(
-        asdata: ASReviewData,
-        pid: str,
-        similarity: float = 0.98,
-        skip_abstract: bool = False,
-        discard_stopwords: bool = False,
-        stopwords_language: str = 'english',
-        strict_similarity: bool = False,
-        verbose: bool = False,
-) -> None:
-
-    if skip_abstract:
-        data = asdata.df['title']
+    asdata: ASReviewData,
+    pid: str,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    if title_only:
+        data = asdata.df["title"]
     else:
         data = pd.Series(asdata.texts)
 
-    symbols_regex = re.compile(r'[^ \w\d\-_]')
-    spaces_regex = re.compile(r'\s+')
+    symbols_regex = re.compile(r"[^ \w\d\-_]")
+    spaces_regex = re.compile(r"\s+")
 
     # clean the data
     s = (
-        data
-        .apply(ftfy.fix_text)
-        .str.replace(symbols_regex, '', regex=True)
-        .str.replace(spaces_regex, ' ', regex=True)
+        data.apply(ftfy.fix_text)
+        .str.replace(symbols_regex, "", regex=True)
+        .str.replace(spaces_regex, " ", regex=True)
         .str.lower()
         .str.strip()
-        .replace('', None)
+        .replace("", None)
     )
 
-    if discard_stopwords:
+    if stopwords_language:
         try:
             from nltk.corpus import stopwords
+
             stopwords_set = set(stopwords.words(stopwords_language))
         except LookupError:
             import nltk
-            nltk.download('stopwords')
+
+            nltk.download("stopwords")
             stopwords_set = set(stopwords.words(stopwords_language))
 
-        stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
-        s = s.str.replace(stopwords_regex, '', regex=True)
+        stopwords_regex = re.compile(rf"\b{'\\b|\\b'.join(stopwords_set)}\b")
+        s = s.str.replace(stopwords_regex, "", regex=True)

     seq_matcher = SequenceMatcher()
     duplicated = [False] * len(s)
 
-    if verbose:
-        similar_list = []
-    else:
-        similar_list = None
-
+    similar_list = []
     if pid in asdata.df.columns:
         if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
             pids = asdata.df[pid].str.strip().replace("", None)
             if pid == "doi":
                 pids = pids.str.lower().str.replace(
                     r"^https?://(www\.)?doi\.org/", "", regex=True
                 )
 
         else:
             pids = asdata.df[pid]
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has the same pid or similar length
-            for j, t in s.iloc[i+1:][(asdata.df[pid] == asdata.df.iloc[i][pid]) |
-                                     (abs(s.str.len() - len(text)) < 5)].items():
+            for j, t in s.iloc[i + 1 :][
+                (asdata.df[pid] == asdata.df.iloc[i][pid])
+                | (abs(s.str.len() - len(text)) < 5)
+            ].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts have the same pid or are similar enough,
                 # mark the second one as duplicate
-                if pids.iloc[i] == pids.iloc[j] or \
-                        (seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity)):
-
-                    if verbose and not duplicated[j]:
+                if pids.iloc[i] == pids.iloc[j] or (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
 
                     duplicated[j] = True
 
         if verbose:
             _print_similar_list(similar_list, data, pid, pids)
 
     else:
-        print(f'Not using {pid} for deduplication because there is no such data.')
+        print(f"Not using {pid} for deduplication because there is no such data.")
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has similar length
-            for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items():
+            for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts are similar enough, mark the second one as duplicate
-                if seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity):
-
-                    if verbose and not duplicated[j]:
+                if (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
 
                     duplicated[j] = True
 
         if verbose:
             _print_similar_list(similar_list, data, pid)
 
     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)


-def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None:
+def deduplicate_data(
+    asdata: ASReviewData,
+    output_path: str = None,
+    pid: str = "doi",
+    similar: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
"""Deduplicate an ASReview data object.
Parameters
----------
asdata : ASReviewData
The data object.
output_path : str, optional
If provided, the deduplicated data object is stored at this location. By
default None.
pid : str, optional
Principal identifier to use for deduplication, by default "doi"
similar : bool, optional
Where to deduplicate 'similar' record. The similarity of the records is
calculated using the `SequenceMatcher` from `difflib`. By default False.
threshold : float, optional
Threshold score above which two records are considered duplicate.
By default 0.98. Only applies if `similar` is set to `True`.
title_only : bool, optional
Only use the title for deduplication, by default False
stopwords_language : str, optional
Remove stopwords from this language before deduplicating, for example 'english'.
By default None. Only applies if `similar` is set to `True`.
strict : bool, optional
Use a stricter algorithm to calculate the similarity between records.
By default False. Only applies if `similar` is set to `True`.
verbose : bool, optional
Get verbose output during deduplicating. By default False. Only applies if
`similar` is set to `True`.
"""
     initial_length = len(asdata.df)
 
-    if not args.similar:
-        if args.pid not in asdata.df.columns:
-            print(
-                f'Not using {args.pid} for deduplication '
-                'because there is no such data.'
-            )
+    if not similar:
+        if pid not in asdata.df.columns:
+            print(f"Not using {pid} for deduplication because there is no such data.")
 
         # retrieve deduplicated ASReview data object
-        asdata.drop_duplicates(pid=args.pid, inplace=True)
+        asdata.drop_duplicates(pid=pid, inplace=True)
 
     else:
         _drop_duplicates_by_similarity(
-            asdata,
-            args.pid,
-            args.threshold,
-            args.title_only,
-            args.stopwords,
-            args.stopwords_language,
-            args.strict,
-            args.verbose,
-        )
+            asdata=asdata,
+            pid=pid,
+            threshold=threshold,
+            title_only=title_only,
+            stopwords_language=stopwords_language,
+            strict=strict,
+            verbose=verbose,
+        )
+
+    if output_path:
+        asdata.to_file(output_path)
 
     # count duplicates
     n_dup = initial_length - len(asdata.df)
 
-    if args.output_path:
-        asdata.to_file(args.output_path)
-        print(
-            f'Removed {n_dup} duplicates from dataset with'
-            f' {initial_length} records.'
-        )
-    else:
-        print(
-            f'Found {n_dup} duplicates in dataset with'
-            f' {initial_length} records.'
-        )
+    print(f"Found {n_dup} duplicates in dataset with {initial_length} records.")