Commit
improvements, feature complete for now
JordanSlaman committed Jul 31, 2022
1 parent ac1ac4c commit d89da3b
Showing 2 changed files with 93 additions and 59 deletions.
66 changes: 51 additions & 15 deletions README.md
@@ -27,9 +27,8 @@ I recommend [asdf-vm](https://asdf-vm.com/) to manage local python versions, or
`python rm_duplicates.py --help`

```
usage: rm_duplicates.py [-h] [--dry_run | --no-dry_run | -d] [--outfile OUTFILE] [--cleanup_outfile | --no-cleanup_outfile | -c]
[--infile INFILE] [--recurse | --no-recurse | -r] [--keep_empty_subdirs | --no-keep_empty_subdirs | -k]
[--progress | --no-progress | -p] [--verbose | --no-verbose | -v]
usage: rm_duplicates.py [-h] [--dry_run | --no-dry_run | -d] [--found_duplicates_csv_filepath ./duplicates_found.csv] [--skip_identification | --no-skip_identification | -s] [--recurse | --no-recurse | -r]
[--keep_empty_subdirectories | --no-keep_empty_subdirectories | -k] [--progress | --no-progress | -p] [--verbose | --no-verbose | -v]
~/path1 ~/path2 [~/path1 ~/path2 ...]
Remove duplicate files across paths.
@@ -40,22 +39,59 @@ positional arguments:
options:
-h, --help show this help message and exit
--dry_run, --no-dry_run, -d
Skips file removal. You will be able to view and verify duplicates found with verbose mode or by providing
viewing the output csv file. Duplicates found in the outfile are removed from right to left. Only the first
filepath in the list will be kept, so the order you pass your paths is important. (default: False)
--outfile OUTFILE, -o OUTFILE
Pass in a filepath other than "./found_duplicates.csv"
--cleanup_outfile, --no-cleanup_outfile, -c
Will remove the outfile from the identify step if flagged. (default: False)
--infile INFILE, -i INFILE
Pass in a filepath to process removals from. This option will skip the identification step.
Skips file removal. You will be able to view and verify duplicates found with verbose mode or by viewing the output csv file. Running this command with verbose mode on will
log the removal steps; running it with verbose off skips the removal step entirely. Duplicates found in the output csv are removed from right to left. Only the first filepath in each row will be
kept, so the order you pass your paths is important. (default: False)
--found_duplicates_csv_filepath ./duplicates_found.csv, -f ./duplicates_found.csv
Pass in a filepath to output identified duplicates to. The output format is a .csv of duplicated paths. Only the first path in each row (the first file found, in path order) is preserved. Removal will
proceed using this file unless "--dry_run" is specified.
--skip_identification, --no-skip_identification, -s
Uses the file provided by --found_duplicates_csv_filepath to process removals. This saves a lot of time iterating and hashing all files in the provided paths. (default: False)
--recurse, --no-recurse, -r
recurse into subdirectories (default: False)
--keep_empty_subdirs, --no-keep_empty_subdirs, -k
--keep_empty_subdirectories, --no-keep_empty_subdirectories, -k
Will not delete a directory if it is empty after file deduplication. (default: False)
--progress, --no-progress, -p
Shows a crude form of progress for both steps, will add additional time to the overall operation as it needs
to iterate over the paths. (default: False)
Shows a crude form of progress for both steps; this will add time to the overall operation, as it needs to iterate over the paths. (default: False)
--verbose, --no-verbose, -v
Logs additional information while running. (default: False)
```
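
As an illustration (hypothetical paths), each row of the duplicates csv holds one group of identical files; the first path in the row is the copy that is kept:

```
/Volumes/Backup/Photos/img_001.jpg,~/Dropbox/media/img_001.jpg,~/Downloads/img_001.jpg
```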

# Advanced Usage - Review & Safety

This script performs two primary steps.

1. Identify
This step walks the paths provided, in order, and hashes every file found (optionally continuing into subdirectories with `--recurse`).
If no filepath is specified, it saves the identified duplicates to a .csv in a temporary folder.

You can provide a filepath for your own .csv to review before you commit to a removal.
You would also need to use the `--dry_run` flag to prevent the removal step from deleting files before you have reviewed them. A sketch of the idea follows the options list below.

Relevant options:
- paths (positional)
- `-r` recurse
- `-p` progress
- `-v` verbose
- `-f` found_duplicates_csv_filepath
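
A minimal sketch of the identify step, not the script's exact implementation. The function name, the choice of sha256, and reading whole files at once are all assumptions here:

```python
import csv
import hashlib
from collections import defaultdict
from pathlib import Path

def identify_duplicates(paths, csv_out, recurse=False):
    """Group files by content hash; write groups with more than one path to a csv."""
    files_seen = defaultdict(list)  # content hash -> [Path, ...] in path order
    for path_name in paths:
        root = Path(path_name).expanduser()
        candidates = root.rglob('*') if recurse else root.glob('*')
        for file_path in sorted(candidates):
            if file_path.is_file():
                # Reads each file fully into memory; fine for a sketch,
                # chunked reads would suit very large files better.
                digest = hashlib.sha256(file_path.read_bytes()).hexdigest()
                files_seen[digest].append(file_path)
    duplicates = [group for group in files_seen.values() if len(group) > 1]
    with open(csv_out, 'w', newline='') as f:
        csv.writer(f).writerows(duplicates)  # first column = the copy to keep
```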


2. Remove

This step reads the duplicates csv and deletes every path in each row except the first. With `--dry_run`, verbose mode on will log the removal steps without deleting anything; verbose off skips the removal step entirely. A sketch follows the options list below.

- `-d` dry_run
- `-r` recurse
- `-k` keep_empty_subdirectories
- `-p` progress
- `-v` verbose
- `-f` found_duplicates_csv_filepath
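
A minimal sketch of the remove step, mirroring the behaviour described above but not the script's exact code; the function name and print-based logging are assumptions:

```python
import csv
from pathlib import Path

def remove_duplicates(csv_in, dry_run=True, rm_empty_dirs=True):
    """Delete every path in each csv row except the first."""
    directories_seen = set()
    with open(csv_in, newline='') as f:
        for row in csv.reader(f):
            # e.g. a row: keeper.jpg,copy1.jpg,copy2.jpg -- row[0] is kept
            for name in row[1:]:
                file_path = Path(name).expanduser()
                print(f'Unlinking {file_path}' + (' - Dry Run!' if dry_run else ''))
                if not dry_run:
                    file_path.unlink()
                    directories_seen.add(file_path.parent)
    if rm_empty_dirs and not dry_run:
        for directory in directories_seen:
            if not any(directory.iterdir()):  # delete only if now empty
                directory.rmdir()
```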


## Examples

### Identify to file
> python rm_duplicates.py -f ./duplicates.csv -r -p -d "/Volumes/Backup/Photos/Jordan's Phone" "~/Jordan Dropbox/media/From Phone"
### Remove from file
> python rm_duplicates.py -f ./duplicates.csv -r -p -s "/Volumes/Backup/Photos/Jordan's Phone" "~/Jordan Dropbox/media/From Phone"
86 changes: 42 additions & 44 deletions rm_duplicates.py
@@ -18,7 +18,8 @@ def validate_and_return_passed_search_paths(path_names):
path_names = [path_names]

for path_name in path_names:
folder_path = Path(path_name)
folder_path = Path(path_name).expanduser()

if not folder_path.exists():
raise ValueError('Cannot find path:', path_name)
else:
@@ -27,27 +28,18 @@ def validate_and_return_passed_search_paths(path_names):
return search_paths


def validate_identity_filepath(identify_pathname):
def validate_identify_csv_path(identify_pathname):
if not identify_pathname:
identify_file = tempfile.NamedTemporaryFile(delete=False)
identify_pathname = identify_file.name

# Validate Identify/Outfile Path
identify_path = Path(identify_pathname)
if identify_path:
if not identify_path.parent.exists():
raise ValueError('Cannot find path for identity file:', identify_path.parent)
return identify_path


def validate_in_out_identify_csv_paths(infile_name, outfile_name):
if not infile_name and not outfile_name:
identity_file = tempfile.NamedTemporaryFile()
return identity_file, identity_file
if not identify_path.parent.exists():
raise ValueError('Cannot find path for identity file:', identify_path.parent)

if infile_name:
identity_in_path = validate_identity_filepath(infile_name)
return identity_in_path, None

if outfile_name:
identify_out_path = validate_identity_filepath(outfile_name)
return identify_out_path, identify_out_path
identify_path.touch(exist_ok=True)
return identify_path


def human_timedelta(delta):
@@ -94,7 +86,7 @@ def progress_count_files(passed_paths, recurse=False):


def identify(passed_path_names,
outfile=False,
outfile,
recurse=False,
progress=False):
files_seen = defaultdict(list)
@@ -138,10 +130,10 @@ def identify(passed_path_names,
start_datetime=progress_init)

else:
logging.info(f'Processed {files_processed} total files from {search_path}.')
logging.info(f'Processed {files_processed} total files from {search_path}')

duplicate_list = [v for v in files_seen.values() if len(v) != 1]
logging.info(f'Files with copies found:', len(duplicate_list))
logging.info(f'Files with copies found: {len(duplicate_list)}')

logging.info(f'Writing identified duplicates: {outfile}')

Expand All @@ -162,7 +154,7 @@ def remove(infile,
directories_seen = set()
progress_init = datetime.datetime.now()

logging.info('Removing found duplicates from output:', infile)
logging.info(f'Removing found duplicates from identified duplicates file: {infile}')

with open(infile, newline='') as csvfile:
identity_reader = csv.reader(csvfile)
@@ -179,7 +171,7 @@ def remove(infile,
progress_filecount = len(paths_to_remove)

for file_path in paths_to_remove:
logging.info('Unlinking file:', file_path)
logging.info(f'Unlinking file: {file_path}{" - Dry Run!" if dry_run else ""}')

if not dry_run:
file_path.unlink()
@@ -197,7 +189,7 @@ def remove(infile,
total=progress_filecount,
start_datetime=progress_init)

logging.info(f'Removed {files_removed} files total.')


if __name__ == '__main__':
@@ -209,25 +201,28 @@ def remove(infile,
Skips file removal. You will be able to view and verify duplicates found with verbose mode or by
viewing the output csv file.
Running this command with verbose mode on will log the removal steps. Running it with verbose off
completely skips removal.
Duplicates found in the outfile are removed from right to left.
Only the first filepath in the list will be kept, so the order you pass your paths is important.
'''))
parser.add_argument('--outfile', '-o', type=str,
parser.add_argument('--found_duplicates_csv_filepath', '-f', type=str, metavar='./duplicates_found.csv',
default=None,
help=inspect.cleandoc('''
Pass in a filepath to output identified duplicates to.
The output format is a .csv of duplicated paths.
Only the first path in each row (the first file found, in path order) is preserved.
Removal will proceed using this file unless "--dry_run" is specified.
'''))
parser.add_argument('--cleanup_outfile', '-c', action=argparse.BooleanOptionalAction, default=False,
help='Will remove the outfile from the identify step if flagged.')
parser.add_argument('--infile', '-i', type=str,
parser.add_argument('--skip_identification', '-s', action=argparse.BooleanOptionalAction, default=False,
help=inspect.cleandoc('''
Pass in a filepath to process removals from.
This option will skip the identification step.
Uses the file provided by --found_duplicates_csv_filepath to process removals.
This saves a lot of time iterating and hashing all files in the provided paths.
'''))
parser.add_argument('--recurse', '-r', action=argparse.BooleanOptionalAction, default=False,
help='recurse into subdirectories')
parser.add_argument('--keep_empty_subdirs', '-k', action=argparse.BooleanOptionalAction, default=False,
parser.add_argument('--keep_empty_subdirectories', '-k', action=argparse.BooleanOptionalAction, default=False,
help='Will not delete a directory if it is empty after file deduplication.')
parser.add_argument('--progress', '-p', action=argparse.BooleanOptionalAction, default=False,
help=inspect.cleandoc('''
Expand All @@ -239,26 +234,29 @@ def remove(infile,

args = parser.parse_args()
script_start = datetime.datetime.now()
identity_in_file, identify_out_file = validate_in_out_identify_csv_paths(args.infile, args.outfile)
identified_csv_filepath = validate_identify_csv_path(args.found_duplicates_csv_filepath)

if args.verbose:
logging.basicConfig(level=logging.INFO)
logging.info("Beginning deduplication!")

if not args.infile:
if not args.skip_identification:
identify(args.paths,
outfile=identify_out_file,
outfile=identified_csv_filepath,
recurse=args.recurse,
progress=args.progress)
Path(identify_out_file).unlink() # deletes temporary directory

elif args.cleanup_outfile:
Path(identify_out_file).unlink()

remove(infile=identity_in_file,
dry_run=args.dry_run,
rm_empty_dirs=not args.keep_empty_subdirs,
progress=args.progress)
else:
logging.info(f"Skipping deduplication - Will remove from {identified_csv_filepath}")

if not args.found_duplicates_csv_filepath:
Path(identified_csv_filepath).unlink()  # deletes the temporary file

skip_removal = args.dry_run and not args.verbose
if not skip_removal:
remove(infile=identified_csv_filepath,
dry_run=args.dry_run,
rm_empty_dirs=not args.keep_empty_subdirectories,
progress=args.progress)

if args.verbose:
script_elapsed = datetime.datetime.now() - script_start
