def _read_address_set(path):
    """Return the set of non-empty, whitespace-stripped lines found in *path*."""
    with open(path, 'r', encoding='utf-8') as handle:
        return {line.strip() for line in handle if line.strip()}


def filter_addresses(db_path='data/dune_data.db', file1='data/sybil.txt', file2='data/not_sybil.txt',
                     output_file='data/result.txt', output_db='data/result.db', output_table='result'):
    """Copy every ``dune_items`` row whose ``ua`` is listed in neither file.

    The addresses in *file1* and *file2* (one per line) are merged into a
    single exclusion set. All rows of ``dune_items`` in *db_path* whose ``ua``
    is not in that set are inserted into *output_table* inside *output_db*,
    and the first column of each such row is written to *output_file*, one
    value per line.

    Args:
        db_path: Path of the source SQLite database containing ``dune_items``.
        file1: Text file with addresses to exclude, one per line.
        file2: Second text file with addresses to exclude, one per line.
        output_file: Text file that receives the surviving addresses.
        output_db: Path of the SQLite database that receives the rows.
        output_table: Name of the table created (if missing) in *output_db*.

    Returns:
        ``"Data filtered and output file created."`` on success, otherwise
        ``"An error occurred: <message>"`` describing the failure.

    Note:
        The target table is created with ``IF NOT EXISTS`` and rows are
        appended, so running the function twice against the same
        *output_db* inserts the rows twice.
    """
    # Function-scope imports keep the block self-contained.
    import os
    from contextlib import closing

    # os.makedirs('') raises, so only create a directory when the target
    # path actually has a directory component.
    target_dir = os.path.dirname(output_db)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # closing() guarantees both connections are released, including the case
    # where opening the second one fails after the first succeeded.
    with closing(sqlite3.connect(db_path)) as source_conn, \
            closing(sqlite3.connect(output_db)) as target_conn:
        try:
            excluded_addresses = _read_address_set(file1) | _read_address_set(file2)

            source_cursor = source_conn.cursor()
            target_cursor = target_conn.cursor()

            # Stage the exclusion list in a connection-local TEMP table and
            # filter with a single query. Running one "NOT IN (chunk)" query
            # per chunk and concatenating the results is wrong: each query
            # returns the rows excluded by the *other* chunks, and an empty
            # exclusion list would run no query at all. A TEMP table also
            # sidesteps SQLite's bound-parameter limit.
            source_cursor.execute(
                "CREATE TEMP TABLE IF NOT EXISTS _excluded_addresses (ua TEXT PRIMARY KEY)")
            source_cursor.execute("DELETE FROM temp._excluded_addresses")
            source_cursor.executemany(
                "INSERT OR IGNORE INTO temp._excluded_addresses (ua) VALUES (?)",
                ((address,) for address in excluded_addresses))
            source_cursor.execute(
                "SELECT * FROM dune_items "
                "WHERE ua NOT IN (SELECT ua FROM temp._excluded_addresses)")
            results = source_cursor.fetchall()

            # Quote the table name so unusual names cannot break (or inject
            # into) the statements below.
            quoted_table = '"' + output_table.replace('"', '""') + '"'

            # Create a new table in the target database and insert data
            target_cursor.execute(
                f"CREATE TABLE IF NOT EXISTS {quoted_table} "
                "(ua TEXT, tc INTEGER, amt REAL, amt_avg REAL, cc TEXT, dwm TEXT, lzd INTEGER)")
            target_cursor.executemany(
                f"INSERT INTO {quoted_table} VALUES (?,?,?,?,?,?,?)", results)
            target_conn.commit()

            # Write only the addresses (first column) to a new text file
            with open(output_file, 'w', encoding='utf-8') as f:
                f.writelines(result[0] + '\n' for result in results)

            return "Data filtered and output file created."
        except Exception as e:
            return f"An error occurred: {str(e)}"


if __name__ == "__main__":
    # Run the filter only when executed as a script, not on import.
    print(filter_addresses())

    # print(find_common_items('data/sybil.txt', 'data/not_sybil.txt'))