-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdeduplicate.py
98 lines (87 loc) · 4 KB
/
deduplicate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
from recon.core.module import BaseModule
import codecs
import os
class Module(BaseModule):
meta = {
'name': 'DB Deduplicator',
'author': ' (@hljupkij)',
'version': '0.1',
'description': 'Removes duplicate records from the database.',
'options': (
('table', 'hosts', True, 'source table of data for the list'),
('column', 'ip_address', True, 'source column to compare items'),
('nulls', True, False, 'remove rows from the dataset with empty column'),
('filename', False, False, 'path and filename for remove strings'),
),
}
def deduplicate(self,table,column,item):
#self.output("Start to remove duplicated rows")
query = f'SELECT rowid,{table[:-1]},{column} FROM "{table}" WHERE {table[:-1]}=="{item[0]}" AND {column}=="{item[1]}" ORDER BY rowid'
#self.output(query)
rows = self.query(query)
count = 0
if len(rows) > 1:
print(f'Found {len(rows)} duplicates of {rows[0][1]}')
first_rowid = rows[0][0]
for row in rows:
#row = row if row else ''
#print(row)
if row[0] != first_rowid:
query = f'DELETE FROM {table} WHERE rowid == "{row[0]}"'
self.query(query)
count +=1
#print(f'{table[:-1]}:{row[1]}, {column}:{row[2]}')
self.output(f"{count} items removed from '{table}'.")
#self.output("End of remove duplicated rows")
def remove_special(self,table,column,filename):
self.output("Start to remove rows which contains defined strings")
with codecs.open(filename, 'r', encoding='utf-8') as infile:
for line in infile:
line = line.rstrip()
print(line)
query = f'SELECT rowid,{table[:-1]},{column} FROM "{table}" WHERE {table[:-1]} LIKE "%{line}%"'
#print(query)
rows = self.query(query)
for row in rows:
print(f'TO DELETE {row}')
query = f'DELETE FROM "{table}" WHERE {table[:-1]} LIKE "%{line}%"'
self.query(query)
self.output("End of remove rows which contains defined strings")
def remove_empty(self,table,column):
self.output("Start to remove empty rows")
query = f'SELECT rowid,{table[:-1]},{column} FROM "{table}" WHERE {column} IS NULL ORDER BY rowid'
rows = self.query(query)
for row in rows:
print(f'TO DELETE {row}')
query = f'DELETE FROM {table} WHERE {column} IS NULL'
# print(query)
result = self.query(query)
print(result)
self.output("End of remove empty rows")
def module_run(self):
# handle the source of information for the report
table = self.options['table']
column = self.options['column']
query = f'SELECT COUNT(rowid) FROM {table}'
count = self.query(query)
self.output(f'There are {count[0][0]} rows in table {table}')
self.output("Start deduplication")
# Optional: remove rows with empty data in column
if self.options['nulls']:
self.remove_empty(table,column)
# Optional: remove rows with defined strings in name
if self.options['filename']:
self.remove_special(table,column,self.options['filename'])
# Query to select distinct items
query = f'SELECT DISTINCT {table[:-1]},{column} FROM "{table}" ORDER BY "{table[:-1]}"'
#self.output(query)
rows = self.query(query)
for row in rows:
#print(row)
self.deduplicate(table,column,row)
#print(f'{table[:-1]}:{row[0]}, {column}:{row[1]}')
#self.output(f"{len(rows)} distinct items found in '{table}'.")
self.output("End deduplication")
query = f'SELECT COUNT(rowid) FROM {table}'
count = self.query(query)
self.output(f'There are {count[0][0]} rows in table {table}')