From 2a948ee9ab0ddda9fe39e47a6d8071744ace4c11 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 13:47:03 +0100 Subject: [PATCH 1/6] Linting --- asreviewcontrib/datatools/dedup.py | 136 ++++++++++++------------ asreviewcontrib/datatools/entrypoint.py | 12 +-- 2 files changed, 72 insertions(+), 76 deletions(-) diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py index 1f263bd..7b2e0aa 100644 --- a/asreviewcontrib/datatools/dedup.py +++ b/asreviewcontrib/datatools/dedup.py @@ -13,19 +13,18 @@ def _print_similar_list( - similar_list: list[tuple[int, int]], - data: pd.Series, - pid: str, - pids: pd.Series = None - ) -> None: - + similar_list: list[tuple[int, int]], + data: pd.Series, + pid: str, + pids: pd.Series = None, +) -> None: print_seq_matcher = SequenceMatcher() console = Console() if pids is not None: - print(f'Found similar titles or same {pid} at lines:') + print(f"Found similar titles or same {pid} at lines:") else: - print('Found similar titles at lines:') + print("Found similar titles at lines:") for i, j in similar_list: print_seq_matcher.set_seq1(data.iloc[i]) @@ -33,74 +32,75 @@ def _print_similar_list( text = Text() if pids is not None: - text.append(f'\nLines {i+1} and {j+1} ', style='bold') + text.append(f"\nLines {i + 1} and {j + 1} ", style="bold") if pids.iloc[i] == pids.iloc[j]: - text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style='dim') + text.append(f'(same {pid} "{pids.iloc[i]}"):\n', style="dim") else: - text.append(f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', - style='dim') + text.append( + f'({pid} "{pids.iloc[i]}" and "{pids.iloc[j]}"):\n', style="dim" + ) else: - text.append(f'\nLines {i+1} and {j+1}:\n', style='bold') + text.append(f"\nLines {i + 1} and {j + 1}:\n", style="bold") for tag, i1, i2, j1, j2 in print_seq_matcher.get_opcodes(): - if tag == 'replace': + if tag == "replace": # add rich strikethrough - text.append(f'{data.iloc[i][i1:i2]}', style='red strike') - text.append(f'{data.iloc[j][j1:j2]}', style='green') - if tag == 'delete': - text.append(f'{data.iloc[i][i1:i2]}', style='red strike') - if tag == 'insert': - text.append(f'{data.iloc[j][j1:j2]}', style='green') - if tag == 'equal': - text.append(f'{data.iloc[i][i1:i2]}', style='dim') + text.append(f"{data.iloc[i][i1:i2]}", style="red strike") + text.append(f"{data.iloc[j][j1:j2]}", style="green") + if tag == "delete": + text.append(f"{data.iloc[i][i1:i2]}", style="red strike") + if tag == "insert": + text.append(f"{data.iloc[j][j1:j2]}", style="green") + if tag == "equal": + text.append(f"{data.iloc[i][i1:i2]}", style="dim") console.print(text) - print('') + print("") def _drop_duplicates_by_similarity( - asdata: ASReviewData, - pid: str, - similarity: float = 0.98, - skip_abstract: bool = False, - discard_stopwords: bool = False, - stopwords_language: str = 'english', - strict_similarity: bool = False, - verbose: bool = False, - ) -> None: - + asdata: ASReviewData, + pid: str, + similarity: float = 0.98, + skip_abstract: bool = False, + discard_stopwords: bool = False, + stopwords_language: str = "english", + strict_similarity: bool = False, + verbose: bool = False, +) -> None: if skip_abstract: - data = asdata.df['title'] + data = asdata.df["title"] else: data = pd.Series(asdata.texts) - symbols_regex = re.compile(r'[^ \w\d\-_]') - spaces_regex = re.compile(r'\s+') + symbols_regex = re.compile(r"[^ \w\d\-_]") + spaces_regex = re.compile(r"\s+") # clean the data s = ( - data - .apply(ftfy.fix_text) - .str.replace(symbols_regex, '', 
regex=True) - .str.replace(spaces_regex, ' ', regex=True) + data.apply(ftfy.fix_text) + .str.replace(symbols_regex, "", regex=True) + .str.replace(spaces_regex, " ", regex=True) .str.lower() .str.strip() - .replace('', None) + .replace("", None) ) if discard_stopwords: try: from nltk.corpus import stopwords + stopwords_set = set(stopwords.words(stopwords_language)) except LookupError: import nltk - nltk.download('stopwords') + + nltk.download("stopwords") stopwords_set = set(stopwords.words(stopwords_language)) - stopwords_regex = re.compile(rf'\b{"\\b|\\b".join(stopwords_set)}\b') - s = s.str.replace(stopwords_regex, '', regex=True) + stopwords_regex = re.compile(rf"\b{'\\b|\\b'.join(stopwords_set)}\b") + s = s.str.replace(stopwords_regex, "", regex=True) seq_matcher = SequenceMatcher() duplicated = [False] * len(s) @@ -121,21 +121,23 @@ def _drop_duplicates_by_similarity( else: pids = asdata.df[pid] - for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'): + for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"): seq_matcher.set_seq2(text) # loop through the rest of the data if it has the same pid or similar length - for j, t in s.iloc[i+1:][(asdata.df[pid] == asdata.df.iloc[i][pid]) | - (abs(s.str.len() - len(text)) < 5)].items(): + for j, t in s.iloc[i + 1 :][ + (asdata.df[pid] == asdata.df.iloc[i][pid]) + | (abs(s.str.len() - len(text)) < 5) + ].items(): seq_matcher.set_seq1(t) # if the texts have the same pid or are similar enough, # mark the second one as duplicate - if pids.iloc[i] == pids.iloc[j] or \ - (seq_matcher.real_quick_ratio() > similarity and \ - seq_matcher.quick_ratio() > similarity and \ - (not strict_similarity or seq_matcher.ratio() > similarity)): - + if pids.iloc[i] == pids.iloc[j] or ( + seq_matcher.real_quick_ratio() > similarity + and seq_matcher.quick_ratio() > similarity + and (not strict_similarity or seq_matcher.ratio() > similarity) + ): if verbose and not duplicated[j]: similar_list.append((i, j)) @@ -145,20 +147,21 @@ def _drop_duplicates_by_similarity( _print_similar_list(similar_list, data, pid, pids) else: - print(f'Not using {pid} for deduplication because there is no such data.') + print(f"Not using {pid} for deduplication because there is no such data.") - for i, text in tqdm(s.items(), total=len(s), desc='Deduplicating'): + for i, text in tqdm(s.items(), total=len(s), desc="Deduplicating"): seq_matcher.set_seq2(text) # loop through the rest of the data if it has similar length - for j, t in s.iloc[i+1:][abs(s.str.len() - len(text)) < 5].items(): + for j, t in s.iloc[i + 1 :][abs(s.str.len() - len(text)) < 5].items(): seq_matcher.set_seq1(t) # if the texts are similar enough, mark the second one as duplicate - if seq_matcher.real_quick_ratio() > similarity and \ - seq_matcher.quick_ratio() > similarity and \ - (not strict_similarity or seq_matcher.ratio() > similarity): - + if ( + seq_matcher.real_quick_ratio() > similarity + and seq_matcher.quick_ratio() > similarity + and (not strict_similarity or seq_matcher.ratio() > similarity) + ): if verbose and not duplicated[j]: similar_list.append((i, j)) @@ -176,8 +179,7 @@ def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None: if not args.similar: if args.pid not in asdata.df.columns: print( - f'Not using {args.pid} for deduplication ' - 'because there is no such data.' + f"Not using {args.pid} for deduplication because there is no such data." 
) # retrieve deduplicated ASReview data object @@ -193,19 +195,13 @@ def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None: args.stopwords_language, args.strict, args.verbose, - ) + ) # count duplicates n_dup = initial_length - len(asdata.df) if args.output_path: asdata.to_file(args.output_path) - print( - f'Removed {n_dup} duplicates from dataset with' - f' {initial_length} records.' - ) + print(f"Removed {n_dup} duplicates from dataset with {initial_length} records.") else: - print( - f'Found {n_dup} duplicates in dataset with' - f' {initial_length} records.' - ) + print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py index b38824a..28d33c0 100644 --- a/asreviewcontrib/datatools/entrypoint.py +++ b/asreviewcontrib/datatools/entrypoint.py @@ -62,7 +62,7 @@ def execute(self, argv): ) dedup_parser.add_argument( "--similar", - action='store_true', + action="store_true", help="Drop similar records.", ) dedup_parser.add_argument( @@ -73,17 +73,17 @@ def execute(self, argv): ) dedup_parser.add_argument( "--title_only", - action='store_true', + action="store_true", help="Use only title for deduplication.", ) dedup_parser.add_argument( "--stopwords", - action='store_true', + action="store_true", help="Ignore stopwords for deduplication, focusing on main words.", ) dedup_parser.add_argument( "--strict", - action='store_true', + action="store_true", help="Use a more strict similarity for deduplication.", ) dedup_parser.add_argument( @@ -94,7 +94,7 @@ def execute(self, argv): ) dedup_parser.add_argument( "--verbose", - action='store_true', + action="store_true", help="Print verbose output.", ) @@ -141,7 +141,7 @@ def execute(self, argv): "subcommand", nargs="?", default=None, - help=f"The datatool to launch. Available commands:\n\n" f"{DATATOOLS}", + help=f"The datatool to launch. 
Available commands:\n\n{DATATOOLS}",
     )
     parser.add_argument(
         "-V",

From da7e8c797cb50ef29b1c9e4cb9e5dfb16e895c36 Mon Sep 17 00:00:00 2001
From: Peter Lombaers
Date: Thu, 6 Feb 2025 14:27:15 +0100
Subject: [PATCH 2/6] Fix tests and improve dedup function arguments

---
 asreviewcontrib/datatools/dedup.py | 96 ++++++++++++++++++++----------
 tests/test_dedup.py                | 88 ++++++++-------------------
 2 files changed, 90 insertions(+), 94 deletions(-)

diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py
index 7b2e0aa..91d5990 100644
--- a/asreviewcontrib/datatools/dedup.py
+++ b/asreviewcontrib/datatools/dedup.py
@@ -1,5 +1,4 @@
 import re
-from argparse import Namespace
 from difflib import SequenceMatcher
 
 import ftfy
@@ -63,14 +62,13 @@ def _print_similar_list(
 def _drop_duplicates_by_similarity(
     asdata: ASReviewData,
     pid: str,
-    similarity: float = 0.98,
-    skip_abstract: bool = False,
-    discard_stopwords: bool = False,
-    stopwords_language: str = "english",
-    strict_similarity: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
     verbose: bool = False,
 ) -> None:
-    if skip_abstract:
+    if title_only:
         data = asdata.df["title"]
     else:
         data = pd.Series(asdata.texts)
@@ -88,7 +86,7 @@ def _drop_duplicates_by_similarity(
         .replace("", None)
     )
 
-    if discard_stopwords:
+    if stopwords_language:
         try:
             from nltk.corpus import stopwords
 
@@ -134,9 +132,9 @@ def _drop_duplicates_by_similarity(
             # if the texts have the same pid or are similar enough,
             # mark the second one as duplicate
             if pids.iloc[i] == pids.iloc[j] or (
-                seq_matcher.real_quick_ratio() > similarity
-                and seq_matcher.quick_ratio() > similarity
-                and (not strict_similarity or seq_matcher.ratio() > similarity)
+                seq_matcher.real_quick_ratio() > threshold
+                and seq_matcher.quick_ratio() > threshold
+                and (not strict or seq_matcher.ratio() > threshold)
             ):
                 if verbose and not duplicated[j]:
                     similar_list.append((i, j))
@@ -158,9 +156,9 @@ def _drop_duplicates_by_similarity(
 
             # if the texts are similar enough, mark the second one as duplicate
             if (
-                seq_matcher.real_quick_ratio() > similarity
-                and seq_matcher.quick_ratio() > similarity
-                and (not strict_similarity or seq_matcher.ratio() > similarity)
+                seq_matcher.real_quick_ratio() > threshold
+                and seq_matcher.quick_ratio() > threshold
+                and (not strict or seq_matcher.ratio() > threshold)
             ):
                 if verbose and not duplicated[j]:
                     similar_list.append((i, j))
@@ -173,35 +171,71 @@ def _drop_duplicates_by_similarity(
     asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True)
 
 
-def deduplicate_data(asdata: ASReviewData, args: Namespace) -> None:
+def deduplicate_data(
+    asdata: ASReviewData,
+    output_path: str = None,
+    pid: str = "doi",
+    similar: bool = False,
+    threshold: float = 0.98,
+    title_only: bool = False,
+    stopwords_language: str = None,
+    strict: bool = False,
+    verbose: bool = False,
+) -> None:
+    """Deduplicate an ASReview data object.
+
+    Parameters
+    ----------
+    asdata : ASReviewData
+        The data object.
+    output_path : str, optional
+        If provided, the deduplicated data object is stored at this location. By
+        default None.
+    pid : str, optional
+        Principal identifier to use for deduplication, by default "doi"
+    similar : bool, optional
+        Whether to deduplicate 'similar' records. The similarity of the records is
+        calculated using the `SequenceMatcher` from `difflib`. By default False.
+    threshold : float, optional
+        Threshold score above which two records are considered duplicates.
+        By default 0.98. Only applies if `similar` is set to `True`.
+    title_only : bool, optional
+        Only use the title for deduplication, by default False
+    stopwords_language : str, optional
+        Remove stopwords from this language before deduplicating, for example 'english'.
+        By default None. Only applies if `similar` is set to `True`.
+    strict : bool, optional
+        Use a stricter algorithm to calculate the similarity between records.
+        By default False. Only applies if `similar` is set to `True`.
+    verbose : bool, optional
+        Get verbose output during deduplication. By default False. Only applies if
+        `similar` is set to `True`.
+    """
     initial_length = len(asdata.df)
 
-    if not args.similar:
-        if args.pid not in asdata.df.columns:
-            print(
-                f"Not using {args.pid} for deduplication because there is no such data."
-            )
+    if not similar:
+        if pid not in asdata.df.columns:
+            print(f"Not using {pid} for deduplication because there is no such data.")
 
         # retrieve deduplicated ASReview data object
-        asdata.drop_duplicates(pid=args.pid, inplace=True)
+        asdata.drop_duplicates(pid=pid, inplace=True)
 
     else:
         _drop_duplicates_by_similarity(
-            asdata,
-            args.pid,
-            args.threshold,
-            args.title_only,
-            args.stopwords,
-            args.stopwords_language,
-            args.strict,
-            args.verbose,
+            asdata=asdata,
+            pid=pid,
+            threshold=threshold,
+            title_only=title_only,
+            stopwords_language=stopwords_language,
+            strict=strict,
+            verbose=verbose,
         )
 
     # count duplicates
     n_dup = initial_length - len(asdata.df)
 
-    if args.output_path:
-        asdata.to_file(args.output_path)
+    if output_path:
+        asdata.to_file(output_path)
         print(f"Removed {n_dup} duplicates from dataset with {initial_length} records.")
     else:
         print(f"Found {n_dup} duplicates in dataset with {initial_length} records.")
diff --git a/tests/test_dedup.py b/tests/test_dedup.py
index b7be1e4..4547a43 100644
--- a/tests/test_dedup.py
+++ b/tests/test_dedup.py
@@ -1,4 +1,3 @@
-from argparse import Namespace
 from pathlib import Path
 
 from asreview.data import ASReviewData
@@ -10,7 +9,7 @@
 file_with_doi = Path(test_dir, "demo_data", "duplicate_data_with_doi.csv")
 
 
-def test_dedup_without_doi(tmpdir):
+def test_dedup_without_doi():
     """
     Test deduplication without DOI.
 
@@ -22,18 +21,21 @@ def test_dedup_without_doi(tmpdir):
     Not using doi for deduplication because there is no such data.
     Found 1 duplicates in dataset with 5 records.
     """
+    data = ASReviewData.from_file(file_without_doi)
+    deduplicate_data(data)
+    assert len(data.df) == 4
+
+
+def test_output(tmpdir):
     data = ASReviewData.from_file(file_without_doi)
     output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=False, output_path=output_path)
-    deduplicate_data(data, args)
+    deduplicate_data(data, output_path=output_path)
     as_test = ASReviewData.from_file(output_path)
+    assert len(data.df) == 4
+    assert len(as_test.df) == 4
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 4, "Deduplicated data should have 4 records."
 
-def test_dedup_with_doi(tmpdir):
+def test_dedup_with_doi():
     """
     Test deduplication with DOI.
 
@@ -46,17 +48,11 @@ def test_dedup_with_doi(tmpdir):
     Found 2 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=False, output_path=output_path)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 3, "Deduplicated data should have 3 records."
+    deduplicate_data(data)
+    assert len(data.df) == 3
 
 
-def test_dedup_with_similarity_without_doi(tmpdir):
+def test_dedup_with_similarity_without_doi():
     """
     Test deduplication with similarity without DOI.
 
@@ -71,17 +67,11 @@ def test_dedup_with_similarity_without_doi(tmpdir):
     Found 2 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_without_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=True, output_path=output_path, threshold=0.95)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 3, "Deduplicated data should have 3 records."
+    deduplicate_data(data, similar=True, threshold=0.95)
+    assert len(data.df) == 3
 
 
-def test_dedup_with_similarity_with_doi(tmpdir):
+def test_dedup_with_similarity_with_doi():
     """
     Test deduplication with similarity with DOI.
 
@@ -96,17 +86,11 @@ def test_dedup_with_similarity_with_doi(tmpdir):
     Found 3 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(similar=True, output_path=output_path, threshold=0.95)
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 2, "Deduplicated data should have 2 records."
+    deduplicate_data(data, similar=True, threshold=0.95)
+    assert len(data.df) == 2
 
 
-def test_dedup_with_similarity_without_doi_stopwords(tmpdir):
+def test_dedup_with_similarity_without_doi_stopwords():
     """
     Test deduplication with similarity without DOI and removing stopwords.
 
@@ -122,22 +106,11 @@ def test_dedup_with_similarity_without_doi_stopwords(tmpdir):
     Found 3 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_without_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(
-        similar=True,
-        output_path=output_path,
-        threshold=0.95,
-        stopwords=True,
-    )
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
+    deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
+    assert len(data.df) == 2
 
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 2, "Deduplicated data should have 2 records."
 
-def test_dedup_with_similarity_with_doi_stopwords(tmpdir):
+def test_dedup_with_similarity_with_doi_stopwords():
     """
     Test deduplication with similarity with DOI and removing stopwords.
 
@@ -153,16 +126,5 @@ def test_dedup_with_similarity_with_doi_stopwords(tmpdir):
     Found 4 duplicates in dataset with 5 records.
     """
     data = ASReviewData.from_file(file_with_doi)
-    output_path = Path(tmpdir, "test_dedup.csv")
-    args = Namespace(
-        similar=True,
-        output_path=output_path,
-        threshold=0.95,
-        stopwords=True,
-    )
-    deduplicate_data(data, args)
-    as_test = ASReviewData.from_file(output_path)
-
-    assert len(data.df) != len(as_test.df), "Data should have been deduplicated."
-    assert len(data.df) == 5, "Original data should have 5 records."
-    assert len(as_test.df) == 1, "Deduplicated data should have 1 record."
+    deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english")
+    assert len(data.df) == 1

From f96625929c78530f400a7338834add312ae114d6 Mon Sep 17 00:00:00 2001
From: Peter Lombaers
Date: Thu, 6 Feb 2025 14:53:09 +0100
Subject: [PATCH 3/6] Fix dedup entrypoint

---
 asreviewcontrib/datatools/entrypoint.py | 54 ++++++++++++++++++------
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/asreviewcontrib/datatools/entrypoint.py b/asreviewcontrib/datatools/entrypoint.py
index 28d33c0..e1094f3 100644
--- a/asreviewcontrib/datatools/entrypoint.py
+++ b/asreviewcontrib/datatools/entrypoint.py
@@ -52,7 +52,7 @@ def execute(self, argv):
             "-o",
             default=None,
             type=str,
-            help="The file path of the dataset.",
+            help="The file path of the output dataset.",
         )
         dedup_parser.add_argument(
             "--pid",
@@ -63,46 +63,72 @@ def execute(self, argv):
         dedup_parser.add_argument(
             "--similar",
             action="store_true",
-            help="Drop similar records.",
+            help=(
+                "Drop similar records, not only exactly matching records. The"
+                " Ratcliff-Obershelp algorithm is used to calculate the"
+                " similarity of records."
+            ),
         )
         dedup_parser.add_argument(
             "--threshold",
            default=0.98,
             type=float,
-            help="Similarity threshold for deduplication. Default: 0.98.",
+            help=(
+                "Records with a similarity score above this threshold are"
+                " considered duplicates. Default: 0.98. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--title_only",
             action="store_true",
-            help="Use only title for deduplication.",
+            help=(
+                "Use only the title for deduplication. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
-            "--stopwords",
-            action="store_true",
-            help="Ignore stopwords for deduplication, focusing on main words.",
-        )
-        dedup_parser.add_argument(
             "--strict",
             action="store_true",
-            help="Use a more strict similarity for deduplication.",
+            help=(
+                "Use a stricter version of the similarity algorithm. Only"
+                " applies if --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--stopwords_language",
-            default="english",
+            default=None,
             type=str,
-            help="Language for stopwords. Default: english.",
+            help=(
+                "Remove stopwords from this language before calculating"
+                " similarity. For example 'english'. Only applies if"
+                " --similar is used."
+            ),
         )
         dedup_parser.add_argument(
             "--verbose",
             action="store_true",
-            help="Print verbose output.",
+            help=(
+                "Print verbose output. Only applies if --similar is"
+                " used."
+ ), ) args_dedup = dedup_parser.parse_args(argv[1:]) # read data in ASReview data object asdata = load_data(args_dedup.input_path) - deduplicate_data(asdata, args_dedup) + deduplicate_data( + asdata=asdata, + output_path=args_dedup.output_path, + pid=args_dedup.pid, + similar=args_dedup.similar, + threshold=args_dedup.threshold, + title_only=args_dedup.title_only, + stopwords_language=args_dedup.stopwords_language, + strict=args_dedup.strict, + verbose=args_dedup.verbose, + ) if argv[0] == "compose": args_compose_parser = _parse_arguments_compose() From a30ff6beefdbd528f9d0304ac9babb4e4d52d37c Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 14:55:33 +0100 Subject: [PATCH 4/6] Add test for similarity threshold 0 --- tests/test_dedup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_dedup.py b/tests/test_dedup.py index 4547a43..d6e2392 100644 --- a/tests/test_dedup.py +++ b/tests/test_dedup.py @@ -128,3 +128,9 @@ def test_dedup_with_similarity_with_doi_stopwords(): data = ASReviewData.from_file(file_with_doi) deduplicate_data(data, similar=True, threshold=0.95, stopwords_language="english") assert len(data.df) == 1 + + +def test_threshold_zero(): + data = ASReviewData.from_file(file_with_doi) + deduplicate_data(data, similar=True, threshold=0) + assert len(data.df) == 1 From ac640925f8bc409d1889aede61999cb894a90324 Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 14:56:51 +0100 Subject: [PATCH 5/6] Simplify print logic --- asreviewcontrib/datatools/dedup.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py index 91d5990..4783394 100644 --- a/asreviewcontrib/datatools/dedup.py +++ b/asreviewcontrib/datatools/dedup.py @@ -231,11 +231,9 @@ def deduplicate_data( verbose=verbose, ) - # count duplicates - n_dup = initial_length - len(asdata.df) - if output_path: asdata.to_file(output_path) - print(f"Removed {n_dup} duplicates from dataset with {initial_length} records.") - else: - print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") + + # count duplicates + n_dup = initial_length - len(asdata.df) + print(f"Found {n_dup} duplicates in dataset with {initial_length} records.") From e28f3097044520430f14f60d9eccb5b7a565c1fb Mon Sep 17 00:00:00 2001 From: Peter Lombaers Date: Thu, 6 Feb 2025 15:42:34 +0100 Subject: [PATCH 6/6] Simplify logic --- asreviewcontrib/datatools/dedup.py | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/asreviewcontrib/datatools/dedup.py b/asreviewcontrib/datatools/dedup.py index 4783394..1d26ea3 100644 --- a/asreviewcontrib/datatools/dedup.py +++ b/asreviewcontrib/datatools/dedup.py @@ -103,11 +103,7 @@ def _drop_duplicates_by_similarity( seq_matcher = SequenceMatcher() duplicated = [False] * len(s) - if verbose: - similar_list = [] - else: - similar_list = None - + similar_list = [] if pid in asdata.df.columns: if is_string_dtype(asdata.df[pid]) or is_object_dtype(asdata.df[pid]): pids = asdata.df[pid].str.strip().replace("", None) @@ -115,7 +111,6 @@ def _drop_duplicates_by_similarity( pids = pids.str.lower().str.replace( r"^https?://(www\.)?doi\.org/", "", regex=True ) - else: pids = asdata.df[pid] @@ -136,14 +131,10 @@ def _drop_duplicates_by_similarity( and seq_matcher.quick_ratio() > threshold and (not strict or seq_matcher.ratio() > threshold) ): - if verbose and not duplicated[j]: + if not duplicated[j]: similar_list.append((i, j)) - duplicated[j] = 
True - if verbose: - _print_similar_list(similar_list, data, pid, pids) - else: print(f"Not using {pid} for deduplication because there is no such data.") @@ -160,15 +151,12 @@ def _drop_duplicates_by_similarity( and seq_matcher.quick_ratio() > threshold and (not strict or seq_matcher.ratio() > threshold) ): - if verbose and not duplicated[j]: + if not duplicated[j]: similar_list.append((i, j)) - duplicated[j] = True - - if verbose: - _print_similar_list(similar_list, data, pid) - asdata.df = asdata.df[~pd.Series(duplicated)].reset_index(drop=True) + if verbose: + _print_similar_list(similar_list, data, pid) def deduplicate_data(