diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py
index 1f263bd..1d26ea3 100644
--- a/asreviewcontrib/datatools/dedup.py
+++ b/asreviewcontrib/datatools/dedup.py
@@ -1,5 +1,4 @@
 import re
-from argparse import Namespace
 from difflib import SequenceMatcher
 
 import ftfy
@@ -13,19 +12,18 @@
 
 
 def _print_similar_list(
-        similar_list: list[tuple[int, int]],
-        data: pd.Series,
-        pid: str,
-        pids: pd.Series = None
-    ) -> None:
-
+    similar_list: list[tuple[int, int]],
+    data: pd.Series,
+    pid: str,
+    pids: pd.Series = None,
+) -> None:
     print_seq_matcher = SequenceMatcher()
     console = Console()
     if pids is not None:
-        print(f'Found similar titles or same {pid} at lines:')
+        print(f"Found similar titles or same {pid} at lines:")
     else:
-        print('Found similar titles at lines:')
+        print("Found similar titles at lines:")
 
     for i, j in similar_list:
         print_seq_matcher.set_seq1(data.iloc[i])
@@ -33,83 +31,79 @@ def _print_similar_list(
         text = Text()
 
         if pids is not None:
-            text.append(f'\nLines {i+1} and {j+1} ', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1} ", style="bold")
             if pids.iloc[i] == pids.iloc[j]:
-                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style='dim')
+                text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim")
             else:
-                text.append(f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n',
-                            style='dim')
+                text.append(
+                    f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim"
+                )
         else:
-            text.append(f'\nLines {i+1} and {j+1}:\n', style='bold')
+            text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold")
 
         for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes():
-            if tag == 'replace':
+            if tag == "replace":
                 # add rich strikethrough
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'delete':
-                text.append(f'{data.iloc[i][i1:i2]}', style='red strike')
-            if tag == 'insert':
-                text.append(f'{data.iloc[j][j1:j2]}', style='green')
-            if tag == 'equal':
-                text.append(f'{data.iloc[i][i1:i2]}', style='dim')
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "delete":
+                text.append(f"{data.iloc[i][i1:i2]}", style="red strike")
+            if tag == "insert":
+                text.append(f"{data.iloc[j][j1:j2]}", style="green")
+            if tag == "equal":
+                text.append(f"{data.iloc[i][i1:i2]}", style="dim")
 
         console.print(text)
-        print('')
+        print("")
 
 
 def _drop_duplicates_by_similarity(
-        asdata: ASReviewData,
-        pid: str,
-        similarity: float = 0.98,
-        skip_abstract: bool = False,
-        discard_stopwords: bool = False,
-        stopwords_language: str = 'english',
-        strict_similarity: bool = False,
-        verbose: bool = False,
-        ) -> None:
-
-    if skip_abstract:
-        data = asdata.df['title']
+    asdata: ASReviewData,
+    pid: str,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    if title_only:
+        data = asdata.df["title"]
     else:
         data = pd.Series(asdata.texts)
 
-    symbols_regex = re.compile(r'[^ \w\d\-_]')
-    spaces_regex = re.compile(r'\s+')
+    symbols_regex = re.compile(r"[^ \w\d\-_]")
+    spaces_regex = re.compile(r"\s+")
 
     # clean the data
     s = (
-        data
-        .apply(ftfy.fix_text)
-        .str.replace(symbols_regex, '', regex=True)
-        .str.replace(spaces_regex, ' ', regex=True)
+        data.apply(ftfy.fix_text)
+        .str.replace(symbols_regex, "", regex=True)
+        .str.replace(spaces_regex, " ", regex=True)
         .str.lower()
         .str.strip()
-        .replace('', None)
+        .replace("", None)
     )
 
-    if discard_stopwords:
+    if stopwords_language:
        try:
            from nltk.corpus import stopwords
+
            stopwords_set = set(stopwords.words(stopwords_language))
        except LookupError:
            import nltk
-            nltk.download('stopwords')
+
+            nltk.download("stopwords")
            stopwords_set = set(stopwords.words(stopwords_language))
 
-        stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b')
-        s = s.str.replace(stopwords_regex, '', regex=True)
+        stopwords_regex = re.compile("|".join(rf"\b{w}\b" for w in stopwords_set))
+        s = s.str.replace(stopwords_regex, "", regex=True)
 
     seq_matcher = SequenceMatcher()
     duplicated = [False] * len(s)
-    if verbose:
-        similar_list = []
-    else:
-        similar_list = None
-
+    similar_list = []
     if pid in asdata.df.columns:
         if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]):
             pids = asdata.df[pid].str.strip().replace("", None)
@@ -117,95 +111,117 @@ def _drop_duplicates_by_similarity(
             pids = pids.str.lower().str.replace(
                 r"^https?://(www\.)?doi\.org/", "", regex=True
             )
-
         else:
             pids = asdata.df[pid]
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has the same pid or similar length
-            for j, t in s.iloc[i+1:][(asdata.df[pid] == asdata.df.iloc[i][pid]) |
-                                     (abs(s.str.len() - len(text)) < 5)].items():
+            for j, t in s.iloc[i + 1 :][
+                (asdata.df[pid] == asdata.df.iloc[i][pid])
+                | (abs(s.str.len() - len(text)) < 5)
+            ].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts have the same pid or are similar enough,
                 # mark the second one as duplicate
-                if pids.iloc[i] == pids.iloc[j] or \
-                        (seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity)):
-
-                    if verbose and not duplicated[j]:
+                if pids.iloc[i] == pids.iloc[j] or (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
-
                     duplicated[j] = True
 
-        if verbose:
-            _print_similar_list(similar_list, data, pid, pids)
-
     else:
-        print(f'Not using {pid} for deduplication because there is no such data.')
+        print(f"Not using {pid} for deduplication because there is no such data.")
 
-        for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'):
+        for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"):
             seq_matcher.set_seq2(text)
 
             # loop through the rest of the data if it has similar length
-            for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items():
+            for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items():
                 seq_matcher.set_seq1(t)
 
                 # if the texts are similar enough, mark the second one as duplicate
-                if seq_matcher.real_quick_ratio() > similarity and \
-                        seq_matcher.quick_ratio() > similarity and \
-                        (not strict_similarity or seq_matcher.ratio() > similarity):
-
-                    if verbose and not duplicated[j]:
+                if (
+                    seq_matcher.real_quick_ratio() > threshold
+                    and seq_matcher.quick_ratio() > threshold
+                    and (not strict or seq_matcher.ratio() > threshold)
+                ):
+                    if not duplicated[j]:
                         similar_list.append((i, j))
-
                     duplicated[j] = True
-
-    if verbose:
-        _print_similar_list(similar_list, data, pid)
-
     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
+    if verbose:
+        _print_similar_list(similar_list, data, pid)
+
+
+def deduplicate_data(
+    asdata: ASReviewData,
+    output_path: str = None,
+    pid: str = "doi",
+    similar: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Deduplicate an ASReview data object.
+
+    Parameters
+    ----------
+    asdata : ASReviewData
+        The data object.
+    output_path : str, optional
+        If provided, the deduplicated data object is stored at this location.
+        By default None.
+    pid : str, optional
+        Persistent identifier used for deduplication, by default "doi".
+    similar : bool, optional
+        Whether to deduplicate 'similar' records. The similarity of the records
+        is calculated using the `SequenceMatcher` from `difflib`. By default False.
+    threshold : float, optional
+        Threshold score above which two records are considered duplicates.
+        By default 0.98. Only applies if `similar` is set to `True`.
+    title_only : bool, optional
+        Only use the title for deduplication, by default False.
+    stopwords_language : str, optional
+        Remove stopwords in this language before deduplicating, for example
+        'english'. By default None. Only applies if `similar` is set to `True`.
+    strict : bool, optional
+        Use a stricter algorithm to calculate the similarity between records.
+        By default False. Only applies if `similar` is set to `True`.
+    verbose : bool, optional
+        Print verbose output during deduplication. By default False. Only
+        applies if `similar` is set to `True`.
+    """
     initial_length = len(asdata.df)
 
-    if not args.similar:
-        if args.pid not in asdata.df.columns:
-            print(
-                f'Not using {args.pid} for deduplication '
-                'because there is no such data.'
-            )
+    if not similar:
+        if pid not in asdata.df.columns:
+            print(f"Not using {pid} for deduplication because there is no such data.")
 
         # retrieve deduplicated ASReview data object
-        asdata.drop_duplicates(pid=args.pid, inplace=True)
+        asdata.drop_duplicates(pid=pid, inplace=True)
 
     else:
         _drop_duplicates_by_similarity(
-            asdata,
-            args.pid,
-            args.threshold,
-            args.title_only,
-            args.stopwords,
-            args.stopwords_language,
-            args.strict,
-            args.verbose,
-        )
+            asdata=asdata,
+            pid=pid,
+            threshold=threshold,
+            title_only=title_only,
+            stopwords_language=stopwords_language,
+            strict=strict,
+            verbose=verbose,
+        )
+
+    if output_path:
+        asdata.to_file(output_path)
 
     # count duplicates
     n_dup = initial_length - len(asdata.df)
-
-    if args.output_path:
-        asdata.to_file(args.output_path)
-        print(
-            f'Removed {n_dup} duplicates from dataset with'
-            f' {initial_length} records.'
-        )
-    else:
-        print(
-            f'Found {n_dup} duplicates in dataset with'
-            f' {initial_length} records.'
-        )
+    print(f"Found {n_dup} duplicates in dataset with {initial_length} records.")
diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index b38824a..e1094f3 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -52,7 +52,7 @@ def execute(self, argv):
             "-o",
             default=None,
             type=str,
-            help="The file path of the dataset.",
+            help="The file path of the output dataset.",
         )
         dedup_parser.add_argument(
             "--pid",
@@ -62,47 +62,73 @@ def execute(self, argv):
         )
         dedup_parser.add_argument(
             "--similar",
-            action='store_true',
-            help="Drop similar records.",
+            action="store_true",
+            help=(
+                "Drop similar records, not only exactly matching records. The"
+                " Ratcliff-Obershelp algorithm is used to calculate the"
+                " similarity of records."
+            ),
         )
         dedup_parser.add_argument(
             "--threshold",
             default=0.98,
             type=float,
-            help="Similarity threshold for deduplication. Default: 0.98.",
+            help=(
+                "Records with a similarity score above this threshold are"
+                " considered duplicates. Default: 0.98. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--title_only",
-            action='store_true',
-            help="Use only title for deduplication.",
-        )
-        dedup_parser.add_argument(
-            "--stopwords",
-            action='store_true',
-            help="Ignore stopwords for deduplication, focusing on main words.",
+            action="store_true",
+            help=(
+                "Use only the title for deduplication. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--strict",
-            action='store_true',
-            help="Use a more strict similarity for deduplication.",
+            action="store_true",
+            help=(
+                "Use a stricter version of the similarity algorithm. Only"
+                " applies if --similar is used."
+            ),
        )
        dedup_parser.add_argument(
            "--stopwords_language",
-            default="english",
+            default=None,
            type=str,
-            help="Language for stopwords. Default: english.",
+            help=(
+                "Remove stopwords in this language before calculating"
+                " similarity, for example 'english'. Only applies if"
+                " --similar is used."
+            ),
        )
        dedup_parser.add_argument(
            "--verbose",
-            action='store_true',
-            help="Print verbose output.",
+            action="store_true",
+            help=(
+                "Print verbose output. Only applies if --similar is"
+                " used."
+            ),
        )
        args_dedup = dedup_parser.parse_args(argv[1:])
 
        # read data in ASReview data object
        asdata = load_data(args_dedup.input_path)
-        deduplicate_data(asdata, args_dedup)
+        deduplicate_data(
+            asdata=asdata,
+            output_path=args_dedup.output_path,
+            pid=args_dedup.pid,
+            similar=args_dedup.similar,
+            threshold=args_dedup.threshold,
+            title_only=args_dedup.title_only,
+            stopwords_language=args_dedup.stopwords_language,
+            strict=args_dedup.strict,
+            verbose=args_dedup.verbose,
+        )
 
     if argv[0] == "compose":
         args_compose_parser = _parse_arguments_compose()
@@ -141,7 +167,7 @@ def execute(self, argv):
             "subcommand",
             nargs="?",
             default=None,
-            help=f"The datatool to launch. Available commands:\n\n" f"{DATATOOLS}",
+            help=f"The datatool to launch. Available commands:\n\n{DATATOOLS}",
         )
         parser.add_argument(
             "-V",
diff --git a/tests/test_dedup.py b/tests/test_dedup.py
index b7be1e4..d6e2392 100644
--- a/tests/test_dedup.py
+++ b/tests/test_dedup.py
@@ -1,4 +1,3 @@
-from argparse import Namespace
 from pathlib import Path
 
 from asreview.data import ASReviewData
@@ -10,7 +9,7 @@
 file_with_doi = Path(test_dir, "demo_data", "duplicate_data_with_doi.csv")
 
 
-def test_dedup_without_doi(tmpdir):
+def test_dedup_without_doi():
     """
     Test deduplication without DOI.
 
@@ -22,18 +21,21 @@
     Not using doi for deduplication because there is no such data.
     Found 1 duplicates in dataset with 5 records.
     """
+    data = ASReviewData.from_file(file_without_doi)
+    deduplicate_data(data)
+    assert len(data.df) == 4
+
+
+def test_output(tmpdir):
     data = ASReviewData.from_file(file_without_doi)
     output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=False, output_path=output_path)
-    deduplicate_data(data, args)
+    deduplicate_data(data, output_path=output_path)
     as_test = ASReviewData.from_file(output_path)
+    assert len(data.df) == 4
+    assert len(as_test.df) == 4
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 4, "Deduplicated data should have 4 records."
-
 
-def test_dedup_with_doi(tmpdir):
+def test_dedup_with_doi():
     """
     Test deduplication with DOI.
 
@@ -46,17 +48,11 @@
     Found 2 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=False, output_path=output_path)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 3, "Deduplicated data should have 3 records."
+    deduplicate_data(data)
+    assert len(data.df) == 3
 
 
-def test_dedup_with_similarity_without_doi(tmpdir):
+def test_dedup_with_similarity_without_doi():
     """
     Test deduplication with similarity without DOI.
 
@@ -71,17 +67,11 @@
     Found 2 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_without_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=True, output_path=output_path, threshold=0.95)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 3, "Deduplicated data should have 3 records."
+    deduplicate_data(data, similar=True, threshold=0.95)
+    assert len(data.df) == 3
 
 
-def test_dedup_with_similarity_with_doi(tmpdir):
+def test_dedup_with_similarity_with_doi():
     """
     Test deduplication with similarity with DOI.
 
@@ -96,17 +86,11 @@
     Found 3 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=True, output_path=output_path, threshold=0.95)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 2, "Deduplicated data should have 2 records."
+    deduplicate_data(data, similar=True, threshold=0.95)
+    assert len(data.df) == 2
 
 
-def test_dedup_with_similarity_without_doi_stopwords(tmpdir):
+def test_dedup_with_similarity_without_doi_stopwords():
     """
     Test deduplication with similarity without DOI and removing stopwords.
 
@@ -122,22 +106,11 @@
     Found 3 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_without_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(
-        similar=True,
-        output_path=output_path,
-        threshold=0.95,
-        stopwords=True,
-    )
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
+    deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
+    assert len(data.df) == 2
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 2, "Deduplicated data should have 2 records."
-
 
-def test_dedup_with_similarity_with_doi_stopwords(tmpdir):
+def test_dedup_with_similarity_with_doi_stopwords():
     """
     Test deduplication with similarity with DOI and removing stopwords.
 
@@ -153,16 +126,11 @@
     Found 4 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(
-        similar=True,
-        output_path=output_path,
-        threshold=0.95,
-        stopwords=True,
-    )
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
+    deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
+    assert len(data.df) == 1
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 1, "Deduplicated data should have 1 record."
+
+def test_threshold_zero():
+    data = ASReviewData.from_file(file_with_doi)
+    deduplicate_data(data, similar=True, threshold=0)
+    assert len(data.df) == 1
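
A minimal usage sketch of the new keyword-based `deduplicate_data` API, assuming the package layout from this diff; the input path `records.csv` is a hypothetical placeholder:

```python
from asreview.data import ASReviewData

from asreviewcontrib.datatools.dedup import deduplicate_data

# "records.csv" is a placeholder; any format ASReview can load works here.
asdata = ASReviewData.from_file("records.csv")

# Fuzzy deduplication: records sharing a DOI, or scoring above the
# threshold with difflib's SequenceMatcher, are dropped from asdata in
# place; the cleaned dataset is written out because output_path is given.
deduplicate_data(
    asdata,
    output_path="records_dedup.csv",
    similar=True,
    threshold=0.95,
    stopwords_language="english",
    verbose=True,
)

print(len(asdata.df))  # records remaining after deduplication
```

The equivalent CLI call, using the flags defined in entrypoint.py above, should be `asreview data dedup records.csv -o records_dedup.csv --similar --threshold 0.95 --stopwords_language english --verbose`.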
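The `--strict` flag is easiest to read off the ratio cascade in `_drop_duplicates_by_similarity`. Here is a self-contained sketch of that check (the helper name `is_similar` is hypothetical): `real_quick_ratio()` and `quick_ratio()` are documented upper bounds on `ratio()`, so they can only let extra pairs through, never reject a pair the exact score would accept, which makes them safe cheap pre-filters.

```python
from difflib import SequenceMatcher


def is_similar(a: str, b: str, threshold: float = 0.98, strict: bool = False) -> bool:
    sm = SequenceMatcher()
    # SequenceMatcher caches preprocessing for seq2, mirroring the loops
    # above, where set_seq2 is called once per outer record.
    sm.set_seq2(a)
    sm.set_seq1(b)
    return (
        # cheap upper bounds on ratio(): if either fails, ratio() fails too
        sm.real_quick_ratio() > threshold
        and sm.quick_ratio() > threshold
        # exact (quadratic) Ratcliff-Obershelp score only in strict mode
        and (not strict or sm.ratio() > threshold)
    )


print(is_similar("a systematic review of x", "a systematic review of x"))  # True
print(is_similar("a systematic review of x", "an unrelated title", 0.9))   # False
```

Because `quick_ratio()` only compares character multisets, the non-strict cascade can in rare cases accept two titles that merely share the same letters; `--strict` trades speed for the exact score.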