def _read_address_set(path):
    """Return the distinct lines of the UTF-8 text file at *path* as a set."""
    with open(path, 'r', encoding='utf-8') as handle:
        return set(handle.read().splitlines())


def _select_non_excluded(source_conn, excluded_addresses):
    """Return every ``dune_items`` row whose ``ua`` is not in *excluded_addresses*.

    The excluded addresses are loaded into a connection-local TEMP table and
    filtered with a single ``NOT IN (SELECT ...)``.  Splitting the exclusion
    list into several ``NOT IN (?, ?, ...)`` queries and concatenating the
    results is wrong: each query only excludes its own chunk, so rows excluded
    by another chunk leak through and kept rows are duplicated.  A TEMP table
    also avoids SQLite's bound-parameter limit, and an empty exclusion set
    naturally yields all rows.
    """
    # The column is left untyped so comparisons use the affinity of
    # dune_items.ua, exactly as they would with bound parameters.
    source_conn.execute("CREATE TEMP TABLE excluded_ua (ua)")
    source_conn.executemany(
        "INSERT INTO excluded_ua VALUES (?)",
        ((address,) for address in excluded_addresses),
    )
    cursor = source_conn.execute(
        "SELECT * FROM dune_items WHERE ua NOT IN (SELECT ua FROM excluded_ua)"
    )
    return cursor.fetchall()


def filter_addresses(db_path='data/dune_data.db', file1='data/sybil.txt', file2='data/not_sybil.txt',
                     output_file='data/result.txt', output_db='data/result.db', output_table='result'):
    """Export the ``dune_items`` rows whose address appears in neither list file.

    The union of the lines of *file1* and *file2* forms the exclusion set.
    Every row of ``dune_items`` in *db_path* whose ``ua`` is not in that set is
    inserted into *output_table* inside *output_db*, and the first column of
    each such row is written to *output_file*, one value per line.

    Args:
        db_path: Source SQLite database containing the ``dune_items`` table.
        file1: UTF-8 text file with one address per line (excluded).
        file2: UTF-8 text file with one address per line (excluded).
        output_file: Text file that receives the kept addresses (overwritten).
        output_db: Target SQLite database; its parent directory is created
            when the path has one.
        output_table: Name of the 7-column result table.  It is created if
            missing; rows are appended if it already exists.

    Returns:
        ``"Data filtered and output file created."`` on success, otherwise
        ``"An error occurred: <message>"``.  Failures while reading the input
        files, querying, inserting or writing are reported through the return
        value rather than raised; failures while creating the directory or
        opening either database still propagate as exceptions.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import sqlite3
    from contextlib import closing

    # os.path.dirname() is '' for a bare file name, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_db)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # closing() guarantees both connections are released, including the source
    # connection when opening the target database fails.
    with closing(sqlite3.connect(db_path)) as source_conn, \
            closing(sqlite3.connect(output_db)) as target_conn:
        try:
            excluded_addresses = _read_address_set(file1) | _read_address_set(file2)
            results = _select_non_excluded(source_conn, excluded_addresses)

            # NOTE: output_table is interpolated into the SQL text because
            # identifiers cannot be bound as parameters; pass trusted names only.
            target_conn.execute(
                f"CREATE TABLE IF NOT EXISTS {output_table} "
                "(ua TEXT, tc INTEGER, amt REAL, amt_avg REAL, cc TEXT, dwm TEXT, lzd INTEGER)"
            )
            target_conn.commit()

            target_conn.executemany(
                f"INSERT INTO {output_table} VALUES (?,?,?,?,?,?,?)", results
            )
            target_conn.commit()

            # Write only the addresses (first column) to the text file.
            with open(output_file, 'w', encoding='utf-8') as f:
                f.writelines(row[0] + '\n' for row in results)

            return "Data filtered and output file created."
        except Exception as e:
            # Deliberate boundary: callers receive a status string, not a raise.
            return f"An error occurred: {str(e)}"


if __name__ == "__main__":
    # Run the export with the default paths only when executed as a script,
    # so importing this module has no side effects.
    print(filter_addresses())