-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathindex.js
277 lines (237 loc) · 7.09 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
"use strict";
import fs from "fs";
import path from "path";
import walk from "walkdir";
import md5File from "md5-file";
import log from "npmlog";
/**
 * Tells whether a value is a primitive string.
 *
 * @param {*} o - Value to inspect.
 * @returns {boolean} `true` when `typeof o` is `'string'`, otherwise `false`.
 */
function isString(o) {
  const kind = typeof o;
  return kind === 'string';
}
/**
 * Fallback result handler used when the caller supplies no callback.
 *
 * Logs the error when one is given; otherwise logs every group, writing an
 * empty line before each group followed by the `path` of each of its items.
 *
 * @param {*} err - Error to report, or a falsy value on success.
 * @param {Array<Array<{path: string}>>} groups - Groups of file items.
 * @returns {void}
 */
function defaultCallback(err, groups) {
  if (err) {
    log.error("fdf", err);
    return;
  }

  const printItem = (item) => {
    log.info("fdf", item.path);
  };
  const printGroup = (group) => {
    // An empty line separates one group from the previous one.
    log.info("fdf", '');
    group.forEach(printItem);
  };

  groups.forEach(printGroup);
}
/**
 * Checks whether a list of file items holds an item with the given path.
 *
 * @param {Array<{path: string}>} list - File items to search.
 * @param {string} path - Path to look for (compared with `===`).
 * @returns {boolean} `true` if any item's `path` equals `path`.
 */
function containsFileItemByPath(list, path) {
  for (const entry of list) {
    if (entry.path === path) {
      return true;
    }
  }
  return false;
}
/**
 * Collects the files found below every start directory, without duplicates.
 *
 * Overlapping start directories can yield the same file more than once; each
 * path is kept only the first time it is seen, so the result preserves the
 * order of first appearance.
 *
 * @param {string[]} pathes - Start directories handed to `getFiles`.
 * @returns {string[]} Unique file paths in order of first appearance.
 */
function getAllFiles(pathes) {
  // A Set keeps insertion order and makes the duplicate check O(1) per file
  // instead of scanning the whole result array for every entry.
  const unique = new Set();
  for (const startDir of pathes) {
    for (const file of getFiles(startDir)) {
      unique.add(file);
    }
  }
  return [...unique];
}
/**
 * Walks a directory tree synchronously and returns the paths whose stat
 * `mode` has bit 0x8000 set (the intent stated by the original comment is
 * "files only, no directories").
 *
 * Any error raised while walking or inspecting the entries is logged and the
 * paths collected so far are returned.
 *
 * NOTE(review): the bit test is truthy for every mode that has bit 0x8000
 * set, not only for one exact file type — confirm that this is intended.
 *
 * @param {string} dir - Directory to scan.
 * @returns {string[]} Paths of the entries that passed the mode test.
 */
function getFiles(dir) {
  const FILE_MODE_BIT = 0x8000;
  const collected = [];

  try {
    const walkOptions = {
      "return_object": true,
      "no_return": false
    };
    const statsByPath = walk.sync(dir, walkOptions);

    for (const entryPath in statsByPath) {
      if (!statsByPath.hasOwnProperty(entryPath)) continue;

      // noinspection JSBitwiseOperatorUsage
      const modeBit = statsByPath[entryPath]["mode"] & FILE_MODE_BIT;
      if (modeBit) collected.push(entryPath);
    }
  } catch (e) {
    log.error('fdf', 'Error reading %s', dir, e);
  }

  return collected;
}
/**
 * Loads the cached path/md5 list from `options.md5File`.
 *
 * Loading is best-effort: when loading is disabled, the file cannot be read,
 * it does not contain valid JSON, or the JSON value is not an array, an empty
 * list is returned and the reason is logged at info level.
 *
 * @param {object} options - Run options.
 * @param {string} options.md5File - Path of the JSON cache file.
 * @param {boolean} [options.md5SkipLoading] - When truthy the file is not read.
 * @returns {Array} The parsed list, or `[]` when nothing usable was loaded.
 */
function loadMd5File(options) {
  if (options.md5SkipLoading) {
    return [];
  }

  let parsed;
  try {
    parsed = JSON.parse(fs.readFileSync(options.md5File, 'utf8'));
  } catch (e) {
    log.info('fdf', 'Can\'t load %s (%s)', options.md5File, e.message);
    return [];
  }

  // Valid JSON is not necessarily a list (e.g. `null` or `{}`); callers rely
  // on array behaviour (`length`, `push`, `splice`), so reject anything else.
  if (!Array.isArray(parsed)) {
    log.info('fdf', 'Can\'t load %s (%s)', options.md5File, 'content is not an array');
    return [];
  }

  log.info('fdf', 'File loaded : %s', options.md5File);
  return parsed;
}
/**
 * Writes the path/md5 list as JSON to `options.md5File`.
 *
 * The write is asynchronous and fire-and-forget: the function returns right
 * away and the outcome is only logged. A failed write is reported through the
 * logger instead of throwing from inside the write callback.
 *
 * @param {Array} md5List - List to serialize.
 * @param {object} options - Run options.
 * @param {string} options.md5File - Path of the JSON cache file.
 * @param {boolean} [options.md5SkipSaving] - When truthy nothing is written.
 * @returns {void}
 */
function saveMd5File(md5List, options) {
  if (options.md5SkipSaving) {
    return;
  }

  fs.writeFile(options.md5File, JSON.stringify(md5List), (err) => {
    if (err) {
      log.error('fdf', 'Can\'t save %s (%s)', options.md5File, err.message);
      return;
    }
    log.info('fdf', 'JSON (md5) saved at : %s', options.md5File);
  });
}
/**
 * Drops every cached entry whose path is not part of the current scan.
 *
 * `md5List` is modified in place (callers keep using the same array); the
 * relative order of the remaining entries is preserved.
 *
 * @param {string[]} files - Paths found by the current directory scan.
 * @param {Array<{path: string}>} md5List - Cached entries, filtered in place.
 * @returns {void}
 */
function removeOutdatedMd5Entries(files, md5List) {
  const scanned = new Set(files);

  // Compact the array in place. Removing with splice() inside a forward index
  // loop would shift the next element into the current slot and skip it.
  let kept = 0;
  for (let i = 0; i < md5List.length; i++) {
    const entry = md5List[i];
    if (scanned.has(entry.path)) {
      md5List[kept] = entry;
      kept += 1;
    }
  }
  md5List.length = kept;
}
/**
 * Computes the md5 for every scanned file that has no cached entry yet and
 * appends `{path, md5}` items to `md5List` (modified in place).
 *
 * Files that do not match `options.checkPattern` are skipped. A file whose
 * md5 cannot be computed is logged and left out of the list. A progress line
 * is logged every 1000 files unless `options.silent` or `options.quite` is
 * set.
 *
 * NOTE(review): the result of `md5File(p)` is stored as-is — this assumes the
 * imported function returns the digest synchronously; verify against the
 * installed md5-file version.
 *
 * @param {string[]} files - Paths found by the directory scan.
 * @param {Array<{path: string, md5: *}>} md5List - Cached entries, extended in place.
 * @param {object} options - Run options.
 * @param {RegExp|string} [options.checkPattern] - Only matching paths are hashed.
 * @param {boolean} [options.silent] - Suppresses the progress lines.
 * @param {boolean} [options.quite] - Suppresses the progress lines.
 * @returns {void}
 */
function createMissingMd5(files, md5List, options) {
  const startedAt = Date.now();
  const showProgress = !options.silent && !options.quite;

  // Paths that already have an entry; kept in sync with md5List so that the
  // lookup is O(1) per file instead of a linear scan of the list.
  const knownPaths = new Set(md5List.map((item) => item.path));

  for (let i = 0; i < files.length; i++) {
    if (showProgress && i > 0 && i % 1000 === 0) {
      const dt = Date.now() - startedAt;
      // Linear extrapolation: total time estimate minus the time already spent.
      const eta = Math.max(0, (dt * files.length / i - dt) / 1000).toFixed(0);
      log.info('fdf', '%s %% (%d files) in %d millis (ETA: %s secs).', (100 * i / files.length).toFixed(1), i, dt, eta);
    }

    const p = files[i];
    if (options.checkPattern && !p.match(options.checkPattern)) continue;
    if (knownPaths.has(p)) continue;

    try {
      md5List.push({ path: p, md5: md5File(p) });
      knownPaths.add(p);
    } catch (e) {
      log.error('fdf', '%d) %s (%s)', i, p, e.message);
    }
  }
}
/**
 * Groups the entries of `md5List` that share the same md5.
 *
 * Groups appear in the order in which their md5 is first seen and keep the
 * list order of their items. Within a group every path is listed once, and
 * only groups with at least two different paths are reported. The totals are
 * logged at info level.
 *
 * @param {Array<{path: string, md5: *}>} md5List - Entries to compare.
 * @returns {Array<Array<{path: string, md5: *}>>} Groups of duplicate files.
 */
function findDuplicates(md5List) {
  // One pass over the list instead of comparing every pair of entries.
  const groupsByMd5 = new Map();
  for (const item of md5List) {
    let group = groupsByMd5.get(item.md5);
    if (!group) {
      group = { items: [], paths: new Set() };
      groupsByMd5.set(item.md5, group);
    }
    // The same path may be listed more than once; count it a single time.
    if (!group.paths.has(item.path)) {
      group.paths.add(item.path);
      group.items.push(item);
    }
  }

  const ret = [];
  let fileCount = 0;
  for (const { items } of groupsByMd5.values()) {
    if (items.length > 1) {
      ret.push(items);
      fileCount += items.length;
    }
  }

  log.info('fdf', 'Dublicates : %d files in %d groups.', fileCount, ret.length);
  return ret;
}
/**
 * Reduces a file name to a simplified form used to compare names.
 *
 * Three replacements run in this order: whitespace and underscores are
 * removed, then `(...)` sections containing no digits, then `[...]` sections
 * containing no digits.
 *
 * @param {string} name - File name to simplify.
 * @returns {string} The simplified name.
 */
function simplified(name) {
  const withoutSpacing = name.replace(/(\s|_)/g, '');
  const withoutParens = withoutSpacing.replace(/\(\D+\)/g, '');
  const withoutBrackets = withoutParens.replace(/\[\D+]/g, '');
  return withoutBrackets;
}
/**
 * Groups files whose simplified base names are equal.
 *
 * The base name of every path is passed through `simplified`; paths sharing
 * the same simplified name form a group. Only groups with more than one file
 * are returned, in the order in which each simplified name first occurs. The
 * result is also printed with `console.info`.
 *
 * @param {Iterable<string>} files - File paths to compare.
 * @returns {string[][]} Groups of similarly named files.
 */
function findSimilarly(files) {
  // A Map (rather than a plain object) keeps names such as "constructor" or
  // "toString" from colliding with properties inherited from Object.prototype.
  const groupsByName = new Map();
  for (const file of files) {
    const simple = simplified(path.basename(file));
    const group = groupsByName.get(simple);
    if (group) {
      group.push(file);
    } else {
      groupsByName.set(simple, [file]);
    }
  }

  const ret = [];
  for (const group of groupsByName.values()) {
    if (group.length > 1) {
      ret.push(group);
    }
  }

  // NOTE(review): printed regardless of the configured log level; kept
  // because callers may rely on this output — confirm whether it is wanted.
  console.info(ret);
  return ret;
}
/**
 * Validates the public arguments and normalizes them onto `options`.
 *
 * On the given `options` object this sets `pathes` (always an array) and
 * `callback` (falling back to `defaultCallback`), and adjusts the logger
 * level from `logLevel`, `silent` or `quite`. Problems are logged.
 *
 * Note that when `options` is missing, a fresh object is used only inside
 * this function; callers that need the normalized values must pass an object.
 *
 * @param {string|string[]} dir - Start directory or directories.
 * @param {object} [options] - Run options, extended in place.
 * @param {Function} [callback] - Result handler `(err, groups)`.
 * @returns {boolean} `true` when a problem was found (the caller should stop),
 *   `false` when the arguments are usable.
 */
function checkArguments(dir, options, callback) {
  if (!dir) {
    log.error('fdf', 'Parameter dir is missing');
    return true;
  }
  if (!options) options = {};

  options.pathes = Array.isArray(dir) ? dir : [dir]; // options.pathes is an array!
  options.callback = callback;
  if (!options.callback) {
    options.callback = defaultCallback;
  }

  if (options.logLevel) {
    log.level = options.logLevel;
  } else if (options.silent || options.quite) {
    log.level = 'error';
  }

  // An empty array is truthy but provides no directory to scan.
  if (options.pathes.length === 0) {
    log.error('fdf', 'Parameter dir is missing');
    return true;
  }
  for (const entry of options.pathes) {
    if (!isString(entry)) {
      log.error('fdf', 'Parameter dir must be a string or an array of strings.');
      return true;
    }
  }
  return false;
}
/**
 * Logs the effective run parameters at info level: the start paths, the
 * check pattern, and the start paths once more under a second label.
 *
 * @param {object} options - Run options.
 * @param {string[]} options.pathes - Start directories.
 * @param {RegExp|string} [options.checkPattern] - Pattern files must match.
 * @returns {void}
 */
function logArguments(options) {
  // Each row pairs a log format with the option it reports.
  const rows = [
    ['Starting at : %s', 'pathes'],
    ['Searching for : %s', 'checkPattern'],
    ['Start dirs : %s', 'pathes'],
  ];
  for (const [format, key] of rows) {
    log.info('fdf', format, options[key]);
  }
}
/**
 * Scans the given directories recursively and reports groups of files whose
 * simplified names are equal.
 *
 * The result is delivered through the callback as `(null, groups)`. When the
 * arguments are rejected, the problem is logged and the callback is not
 * invoked.
 *
 * @param {string|string[]} dir - Start directory or directories.
 * @param {object} [options] - Run options; normalized values (`pathes`,
 *   `callback`) are stored on this object.
 * @param {Function} [callback] - Result handler `(err, groups)`; a logging
 *   default is used when omitted.
 * @returns {void}
 */
export function findSimilarlyNamedFiles(dir, options, callback) {
  // checkArguments stores the normalized values on the object it receives, so
  // a missing options argument has to be replaced before the call.
  const opts = options || {};

  const argProblem = checkArguments(dir, opts, callback);
  // Also stop when no start paths were recorded, which covers a rejected
  // argument that was only logged.
  if (argProblem || !Array.isArray(opts.pathes)) return;

  logArguments(opts);

  // Scan directories recursively:
  const t1 = Date.now();
  const files = getAllFiles(opts.pathes);
  const t2 = Date.now();
  log.info('fdf',
    'Directory scan: %d entries (in %d start directories) in %d millis',
    files.length, opts.pathes.length, (t2 - t1));

  // Find similarly named files and hand them to the caller:
  const ret = findSimilarly(files);
  opts.callback(null, ret);
}
/**
 * Scans the given directories recursively and reports groups of files that
 * have the same md5.
 *
 * Steps: scan the directories, load the cached path/md5 list, drop cached
 * entries for files that were not found, compute the missing md5 values,
 * save the list, and group the entries by md5. The result is delivered
 * through the callback as `(null, groups)`. When the arguments are rejected,
 * the problem is logged and the callback is not invoked.
 *
 * @param {string|string[]} dir - Start directory or directories.
 * @param {object} [options] - Run options; normalized values (`pathes`,
 *   `callback`, `md5File`) are stored on this object.
 * @param {string} [options.md5File] - Cache file; defaults to `md5.json`
 *   inside the first start directory.
 * @param {Function} [callback] - Result handler `(err, groups)`; a logging
 *   default is used when omitted.
 * @returns {void}
 */
export function findDuplicateFiles(dir, options, callback) {
  // checkArguments stores the normalized values on the object it receives, so
  // a missing options argument has to be replaced before the call.
  const opts = options || {};

  const argProblem = checkArguments(dir, opts, callback);
  // Also stop when no start paths were recorded, which covers a rejected
  // argument that was only logged.
  if (argProblem || !Array.isArray(opts.pathes)) return;

  if (!opts.md5File) {
    opts.md5File = path.join(opts.pathes[0], '/', 'md5.json');
  }
  logArguments(opts);

  // Scan directories recursively:
  const t1 = Date.now();
  const files = getAllFiles(opts.pathes);
  const t2 = Date.now();
  log.info('fdf',
    'Directory scan: %d entries (in %d start directories) in %d millis',
    files.length, opts.pathes.length, (t2 - t1));

  // Load md5 file or create empty array:
  const md5List = loadMd5File(opts);

  // Remove outdated entries from md5List:
  if (md5List.length) removeOutdatedMd5Entries(files, md5List);

  // Get md5 for each file in array 'files':
  createMissingMd5(files, md5List, opts);
  const t3 = Date.now();
  log.info('fdf', 'Creation of md5 list: %d entries in %d millis', md5List.length, (t3 - t2));

  // Write file/md5 list to file:
  saveMd5File(md5List, opts);

  // Find duplicates by md5 and hand them to the caller:
  const ret = findDuplicates(md5List);
  opts.callback(null, ret);
}