#!/usr/bin/env python3
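"""Deduplicate identical files on an APFS volume.

Files matched by the given glob patterns are grouped by size, then by a hash of
their first 1024 bytes, then by a full-file hash; files with identical full
hashes are re-copied from a single source with `cp -c` so they become
copy-on-write clones and stop consuming separate disk space.
"""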
import argparse
import filecmp
import glob
import hashlib
import os
import shutil
import subprocess
from os.path import isfile, islink
from subprocess import CalledProcessError


def chunk_reader(fobj, chunk_size=65536):
    """Generator that reads a file in chunks of bytes"""
    while True:
        chunk = fobj.read(chunk_size)
        if not chunk:
            return
        yield chunk


def get_hash(filename, first_chunk_only=False, hash_func=hashlib.sha1):
    """Return the hex digest of a file, or None if the file cannot be read."""
    hashobj = hash_func()
    try:
        file_object = open(filename, 'rb')
    except PermissionError:
        return None
    with file_object:
        if first_chunk_only:
            hashobj.update(file_object.read(1024))
        else:
            for chunk in chunk_reader(file_object):
                hashobj.update(chunk)
    return hashobj.hexdigest()
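

# Duplicate detection runs in three narrowing passes:
#   1. group files by size,
#   2. hash the first 1024 bytes of files that share a size,
#   3. fully hash files that also share a header hash.
# Only files that collide on the full hash are treated as duplicates and deduped.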
def check_for_duplicates(paths, dry_run, force, verbose, compare):
    hashes_by_size = {}
    hashes_on_1k = {}
    hashes_full = {}
    pre_stat = shutil.disk_usage("/")
    if dry_run:
        print("Dry run! No changes will be applied")
    print("Disk Used: %d bytes Free: %d bytes" % (pre_stat.used, pre_stat.free))
    visited_dirs = set()
    for pattern in paths:
        print("Scanning %s ..." % pattern)
        for entry in glob.iglob(pattern, recursive=True):
            full_path = os.path.abspath(entry)
            if not isfile(full_path) or islink(full_path):
                continue
            try:
                file_size = os.path.getsize(full_path)
            except OSError:
                # not accessible (permissions, etc)
                continue
            if file_size < 1024:
                continue
            dirname = os.path.dirname(full_path)
            if dirname not in visited_dirs:
                if verbose > 1:
                    print("Scanning %s/ ..." % dirname)
                visited_dirs.add(dirname)
            hashes_by_size.setdefault(file_size, []).append(full_path)
    # For all files with the same file size, hash their first 1024 bytes
    print("Hashing headers...")
    visited_dirs = set()
    for __, files in hashes_by_size.items():
        if len(files) < 2:
            continue  # this file size is unique, no need to spend cpu cycles on it
        for filename in files:
            dirname = os.path.dirname(filename)
            if dirname not in visited_dirs:
                if verbose > 1:
                    print("Header hashing %s/ ..." % dirname)
                visited_dirs.add(dirname)
            small_hash = get_hash(filename, first_chunk_only=True)
            if small_hash is None:
                continue  # unreadable file, skip it
            hashes_on_1k.setdefault(small_hash, []).append(filename)
    # For all files with the same header hash, hash the full file - collisions will be duplicates
    print("Hashing...")
    visited_dirs = set()
    for __, files in hashes_on_1k.items():
        if len(files) < 2:
            continue  # this hash of the first 1k bytes is unique, no need to spend cpu cycles on it
        for filename in files:
            dirname = os.path.dirname(filename)
            if dirname not in visited_dirs:
                if verbose > 1:
                    print("Hashing %s/ ..." % dirname)
                visited_dirs.add(dirname)
            full_hash = get_hash(filename, first_chunk_only=False)
            if full_hash is None:
                continue  # unreadable file, skip it
            hashes_full.setdefault(full_hash, []).append(filename)
    total_bytes = 0
    unique_bytes = 0
    total_hashes = 0
    errors = 0
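    # Keep the first file in each duplicate cluster as the source and replace
    # every other file with a copy of it; on macOS, `cp -c` copies via
    # clonefile(2), so on APFS those copies share on-disk blocks with the
    # source instead of duplicating the data.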
    # Issue dedupes
    print("Deduping...")
    for full_hash, files in hashes_full.items():
        if len(files) < 2:
            continue  # this full hash is unique, no need to spend cpu cycles on it
        duplicate = files[0]
        file_size = os.path.getsize(duplicate)
        total_bytes += file_size * len(files)
        unique_bytes += file_size
        total_hashes += 1
        if verbose > 0:
            print("Hash:%s Size:%d" % (full_hash, file_size))
        for filename in files:
            if filename == duplicate:
                if verbose > 0:
                    print("\t> %s" % filename)
                continue
            if verbose > 0:
                print("\t%s" % filename)
            if not dry_run:
                if compare and not filecmp.cmp(duplicate, filename, shallow=False):
                    continue
                try:
                    args = ["cp", "-cv"]
                    if force:
                        args.append("-f")
                    args.append(duplicate)
                    args.append(filename)
                    copy_command = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, check=True)
                    if verbose > 1:
                        print(copy_command)
                except CalledProcessError:
                    errors += 1
                    if verbose > 0:
                        print('Could not dedupe file: %s. Skipping ...' % filename)
    print("Deduped %d clusters" % total_hashes)
    print("Skipped %d files due to errors" % errors)
    print("Total potential dedupe savings: %d bytes" % (total_bytes - unique_bytes))
    post_stat = shutil.disk_usage("/")
    print("Disk Used: %d bytes Free: %d bytes" % (post_stat.used, post_stat.free))
    print("Freed %d bytes" % (post_stat.free - pre_stat.free))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Deduplicate files on APFS',
                                     formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog="Example usage:\n./deduplicate.py 'Applications/Unity*/**'")
    parser.add_argument('paths', metavar='path', nargs='+',
                        help='Paths to scan, glob accepted')
    parser.add_argument('--dry-run', dest='dry_run', action='store_true',
                        help='Do not actually perform deduplication')
    parser.add_argument('--force', '-f', dest='force', action='store_true',
                        help='Copy with -f')
    parser.add_argument('--verbose', '-v', dest='verbose', action='count',
                        default=0,
                        help='Verbosity level')
    parser.add_argument('--compare', '-c', dest='compare', action='store_true',
                        help='Only copy if the file contents match exactly')
    args = parser.parse_args()
    check_for_duplicates(args.paths, args.dry_run, args.force, args.verbose, args.compare)
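
# A dry run with verbosity previews what would be deduplicated without touching
# anything, e.g.: ./deduplicate.py --dry-run -v 'Applications/Unity*/**'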