diff --git a/imrs/imrs_apnic_list.py b/imrs/imrs_apnic_list.py
new file mode 100644
index 0000000..4981377
--- /dev/null
+++ b/imrs/imrs_apnic_list.py
@@ -0,0 +1,99 @@
+#
+# This script joins the per-IP "use" counts found in an APNIC file
+# with the per-IP query counts found in an IMRS file, and writes a
+# CSV file listing the addresses seen in both, with both counts.
+#
+# Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+
+class imrs_apnic_item:
+    def __init__(self, ip, apnic_use, imrs_use):
+        self.ip = ip
+        self.apnic_use = apnic_use
+        self.imrs_use = imrs_use
+
+    @staticmethod
+    def head():
+        s = "IP, apnic_use, imrs_use,"
+        return s
+
+    def text(self):
+        s = self.ip + "," + str(self.apnic_use) + "," + str(self.imrs_use) + ","
+        return s
+
+class apnic_record:
+    def __init__(self):
+        self.ip = ""
+        self.use_count = 0
+        self.seen_in_imrs = False
+        self.imrs_count = 0
+
+    def parse(self, line):
+        parts = line.split(",")
+        nb_parts = len(parts)
+        if nb_parts >= 4:
+            try:
+                self.ip = parts[0].strip()
+                self.use_count = int(parts[3].strip())
+            except Exception as e:
+                traceback.print_exc()
+                print("Cannot parse APNIC Record:\n" + line.strip() + "\nException: " + str(e))
+                return False
+            return True
+        return False
+
+def parse_imrs(line):
+    ok = False
+    ip = ""
+    count = 0
+    try:
+        parts = line.split(",")
+        ip = parts[0].strip()
+        count = int(parts[1].strip())
+        ok = True
+    except Exception as e:
+        traceback.print_exc()
+        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
+    return ok, ip, count
+
+
+# main
+
+if len(sys.argv) != 4:
+    print("Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>")
+    exit(1)
+imrs_file = sys.argv[1]
+apnic_file = sys.argv[2]
+output_file = sys.argv[3]
+
+# Index the APNIC records by IP address.
+apnic_dict = dict()
+
+for line in open(apnic_file, "r"):
+    apnic = apnic_record()
+    if apnic.parse(line):
+        apnic_dict[apnic.ip] = apnic
+
+# Mark the APNIC addresses that also appear in the IMRS file.
+for line in open(imrs_file, "r"):
+    ok, ip, count = parse_imrs(line)
+    if ok:
+        if ip in apnic_dict:
+            apnic_dict[ip].seen_in_imrs = True
+            apnic_dict[ip].imrs_count = count
+
+with open(output_file, "w") as F:
+    F.write("IP, apnic_use, imrs_use,\n")
+    for ip in apnic_dict:
+        apnic_entry = apnic_dict[ip]
+        if apnic_entry.seen_in_imrs:
+            F.write(apnic_entry.ip + "," + str(apnic_entry.use_count) + "," + str(apnic_entry.imrs_count) + "\n")
diff --git a/imrs/imrs_frequency.py b/imrs/imrs_frequency.py
new file mode 100644
index 0000000..97dca88
--- /dev/null
+++ b/imrs/imrs_frequency.py
@@ -0,0 +1,78 @@
+#
+# This script computes the cumulative distribution of the IMRS query
+# load across source IP addresses: the per-IP query counts are sorted
+# in decreasing order, and one line is written each time the cumulative
+# load grows by the requested step (every address is written when no
+# step is given).
+#
+# Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+
+def parse_imrs(line):
+    ok = False
+    ip = ""
+    count = 0
+    try:
+        parts = line.split(",")
+        ip = parts[0].strip()
+        count = int(parts[1].strip())
+        ok = True
+    except Exception as e:
+        traceback.print_exc()
+        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
+    return ok, ip, count
+
+
+# main
+
+if len(sys.argv) < 3 or len(sys.argv) > 4:
+    print("Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]")
+    exit(1)
+imrs_file = sys.argv[1]
+output_file = sys.argv[2]
+load_step = 0
+if len(sys.argv) == 4:
+    s_load_step = sys.argv[3]
+    if not s_load_step.endswith("%"):
+        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
+        exit(1)
+    else:
+        load_step = float(s_load_step[:-1])/100.0
+
+# Read the per-IP query counts and compute the total load.
+load_vec = []
+
+total_load = 0
+for line in open(imrs_file, "r"):
+    ok, ip, use_count = parse_imrs(line)
+    if ok:
+        load_vec.append(use_count)
+        total_load += use_count
+
+load_vec.sort(reverse=True)
+
+# Walk the sorted counts and write one line per load step.
+with open(output_file, "w") as F:
+    cumulative_use = 0
+    cumulative_count = 0
+    delta_threshold = int(total_load*load_step)
+    threshold = 0
+    last_written = 0
+    F.write("Count, Queries, frequency,\n")
+    for use_count in load_vec:
+        cumulative_count += 1
+        cumulative_use += use_count
+        if cumulative_use >= threshold:
+            F.write(str(cumulative_count) + "," + str(cumulative_use) + "," + str(cumulative_use/total_load) + ",\n")
+            threshold += delta_threshold
+            last_written = cumulative_count
+    if last_written < cumulative_count:
+        F.write(str(cumulative_count) + "," + str(cumulative_use) + "," + str(cumulative_use/total_load) + ",\n")
\ No newline at end of file
diff --git a/imrs/imrs_record.py b/imrs/imrs_record.py
new file mode 100644
index 0000000..9a6984c
--- /dev/null
+++ b/imrs/imrs_record.py
@@ -0,0 +1,90 @@
+#
+# This module defines the imrs_record class, which parses one line of
+# the per-IP IMRS statistics file: query volume, hourly and daily
+# volumes, ARPA and no-such-domain counts, TLD and SLD counts and
+# hyperloglog estimators, name parts, RR types and locales.
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+
+
+def imrs_parse_one_number(parts, parsed):
+    v = int(parts[parsed].strip())
+    parsed += 1
+    return v, parsed
+
+def imrs_parse_one_vector(parts, parsed, v):
+    for i in range(0, len(v)):
+        v[i], parsed = imrs_parse_one_number(parts, parsed)
+    return parsed
+
+class imrs_hyperloglog:
+    def __init__(self):
+        self.E = 0.0
+        self.hllv = [0]*16
+
+    def parse(self, parts, parsed):
+        self.E = float(parts[parsed].strip())
+        parsed += 1
+        for i in range(0, len(self.hllv)):
+            self.hllv[i], parsed = imrs_parse_one_number(parts, parsed)
+        return parsed
+
+class imrs_record:
+    def __init__(self):
+        self.ip = ""
+        self.query_volume = 0
+        self.hourly_volume = [0]*24
+        self.daily_volume = [0]*31
+        self.arpa_count = 0
+        self.no_such_domain_queries = 0
+        self.no_such_domain_reserved = 0
+        self.no_such_domain_frequent = 0
+        self.no_such_domain_chromioids = 0
+        self.tld_counts = [0]*8
+        self.tld_hyperlog = imrs_hyperloglog()
+        self.sld_counts = [0]*8
+        self.sld_hyperlog = imrs_hyperloglog()
+        self.name_parts = [0]*8
+        self.rr_types = [0]*8
+        self.locales = [0]*8
+
+    def parse_imrs(self, line):
+        # Parse the comma-separated fields in the order in which they
+        # are written in the per-IP statistics file.
+        ok = False
+        parsed = 0
+        try:
+            parts = line.split(",")
+            self.ip = parts[0].strip()
+            parsed = 1
+            self.query_volume, parsed = imrs_parse_one_number(parts, parsed)
+            parsed = imrs_parse_one_vector(parts, parsed, self.hourly_volume)
+            parsed = imrs_parse_one_vector(parts, parsed, self.daily_volume)
+            self.arpa_count, parsed = imrs_parse_one_number(parts, parsed)
+            self.no_such_domain_queries, parsed = imrs_parse_one_number(parts, parsed)
+            self.no_such_domain_reserved, parsed = imrs_parse_one_number(parts, parsed)
+            self.no_such_domain_frequent, parsed = imrs_parse_one_number(parts, parsed)
+            self.no_such_domain_chromioids, parsed = imrs_parse_one_number(parts, parsed)
+            parsed = imrs_parse_one_vector(parts, parsed, self.tld_counts)
+            parsed = self.tld_hyperlog.parse(parts, parsed)
+            parsed = imrs_parse_one_vector(parts, parsed, self.sld_counts)
+            parsed = self.sld_hyperlog.parse(parts, parsed)
+            parsed = imrs_parse_one_vector(parts, parsed, self.name_parts)
+            parsed = imrs_parse_one_vector(parts, parsed, self.rr_types)
+            parsed = imrs_parse_one_vector(parts, parsed, self.locales)
+            ok = True
+        except Exception as e:
+            traceback.print_exc()
+            print("Cannot parse IMRS Record after " + str(parsed) + " parts:\n" + line.strip() + "\nException: " + str(e))
+        return ok
diff --git a/src/imrs_instances.py b/src/imrs_instances.py
new file mode 100644
index 0000000..6051d40
--- /dev/null
+++ b/src/imrs_instances.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+# coding=utf-8
+#
+# This computes the monthly totals for all the instances found
+# in the east and west folders. The raw data is organized as:
+# - ipstats / west / one folder per instance / files per date
+#           / east / one folder per instance / files per date
+# The first processing step is to collect the list of file names
+# for each instance: one file per date, possibly more if the
+# same instance is present in east and west.
+#
+#
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+
+def prepare_instances_list(ipstats_folder, month):
+    # Collect, for each instance, the list of daily files found under
+    # the east and west folders.
+    instances = dict()
+    for pole in [ "east", "west" ]:
+        pole_dir = join(ipstats_folder, pole)
+        folder_pole = listdir(pole_dir)
+        for instance_id in folder_pole:
+            instance_folder = join(pole_dir, instance_id)
+            if isdir(instance_folder):
+                if not instance_id in instances:
+                    instances[instance_id] = []
+                file_list = listdir(instance_folder)
+                for file_name in file_list:
+                    instances[instance_id].append(join(instance_folder, file_name))
+    return instances
+
+def check_or_create_dir(dir_path):
+    if not isdir(dir_path):
+        try:
+            os.mkdir(dir_path)
+        except Exception as e:
+            traceback.print_exc()
+            print("Cannot create <" + dir_path + ">\nException: " + str(e))
+            return False
+    return True
+
+def process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
+    # Write the list of daily files for this instance to a temporary file,
+    # then let ithitool merge them into the instance's monthly CSV file.
+    result_file = instance_id + "_" + month + "-ipstats.csv"
+    result_path = join(result_folder, result_file)
+    tmp_file = instance_id + "_" + month + "-file-list.txt"
+    tmp_path = join(tmp_folder, tmp_file)
+    with open(tmp_path, "wt") as F:
+        for file_name in instances[instance_id]:
+            F.write(file_name + "\n")
+    merge_cmd = ithitool + ' -I ' + result_path + " " + tmp_path
+    cmd_ret = os.system(merge_cmd)
+    if cmd_ret == 0:
+        if do_debug:
+            print(result_file + ": computed.")
+    else:
+        print(result_file + ": computation failed, error: " + str(cmd_ret))
+        return False
+    return True
+
+# main
+if len(sys.argv) < 4 or len(sys.argv) > 5 or \
+    (len(sys.argv) == 5 and sys.argv[4] != "debug"):
+    print("Usage: imrs_instances <ipstats_folder> <month> <ithitool_path> [debug]")
+    print("There are just " + str(len(sys.argv)) + " arguments.")
+    exit(1)
+ipstats_folder = sys.argv[1]
+month = sys.argv[2]
+ithitool = sys.argv[3]
+do_debug = len(sys.argv) == 5
+
+print("Writing instance monthly files for: " + ipstats_folder)
+try:
+    instances = prepare_instances_list(ipstats_folder, month)
+    result_folder = join(ipstats_folder, "instances")
+    tmp_folder = join(ipstats_folder, "tmp")
+    if check_or_create_dir(result_folder) and \
+        check_or_create_dir(tmp_folder):
+        for instance_id in instances:
+            if len(instances[instance_id]) > 0:
+                if not process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
+                    exit(1)
+except Exception as exc:
+    traceback.print_exc()
+    print('\nCode generated an exception: %s' % (exc))
diff --git a/src/imrs_monthly_ip.py b/src/imrs_monthly_ip.py
new file mode 100644
index 0000000..7b365c1
--- /dev/null
+++ b/src/imrs_monthly_ip.py
@@ -0,0 +1,110 @@
+#!/usr/bin/python
+# coding=utf-8
+#
+# This processes a list of monthly files (instances or clusters)
+# and extracts a table, with entries:
+#     instance, nb_ip, nb_queries
+# or
+#     cluster, instance, nb_ip, nb_queries
+# where "nb_ip" is the number of IP addresses seen by this cluster,
+# and "nb_queries" is the total number of queries.
+#
+# usage: imrs_monthly_ip input_folder output_file
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+
+def parse_imrs(line):
+    ok = False
+    ip = ""
+    count = 0
+    try:
+        parts = line.split(",")
+        if len(parts) >= 2:
+            ip = parts[0].strip()
+            count = int(parts[1].strip())
+            ok = True
+        else:
+            print("Line <" + line.strip() + "> has only " + str(len(parts)) + " parts.")
+    except Exception as e:
+        traceback.print_exc()
+        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
+    return ok, ip, count
+
+# main
+if len(sys.argv) != 3:
+    print("usage: imrs_monthly_ip input_folder output_file")
+    exit(1)
+input_folder = sys.argv[1]
+output_file = sys.argv[2]
+# Folders of per-cluster monthly files end in "monthly"; anything else
+# is treated as a folder of per-instance monthly files.
+is_instances = not input_folder[:-1].endswith("monthly")
+if is_instances:
+    print("From instances monthly, " + input_folder + " compute " + output_file)
+else:
+    print("From cluster monthly, " + input_folder + " compute " + output_file)
+clusters = dict()
+nb_files = 0
+if is_instances:
+    # The instance ID is the part of the file name before the first "_";
+    # the cluster ID is formed from its last two dash-separated components.
+    file_list = listdir(input_folder)
+    for file_name in file_list:
+        parts = file_name.split("_")
+        first_parts = parts[0].split('-')
+        if len(first_parts) != 3 or \
+            len(first_parts[0]) != 4 or \
+            len(first_parts[1]) != 2 or \
+            len(first_parts[2]) != 3:
+            print("Cannot get cluster ID from: " + file_name)
+        else:
+            cluster_id = first_parts[1] + "-" + first_parts[2]
+            if not cluster_id in clusters:
+                clusters[cluster_id] = []
+            clusters[cluster_id].append(file_name)
+            nb_files += 1
+else:
+    file_list = listdir(input_folder)
+    for file_name in file_list:
+        parts = file_name.split(".")
+        cluster_id = parts[0]
+        if not cluster_id in clusters:
+            clusters[cluster_id] = []
+        clusters[cluster_id].append(file_name)
+        nb_files += 1
+
+print("Found " + str(len(clusters)) + " clusters, " + str(nb_files) + " files.")
+
+id_list = sorted(list(clusters.keys()))
+
+with open(output_file, "w") as F:
+    F.write("Cluster, Instance, nb_IP, nb_queries,\n")
+    for cluster_id in id_list:
+        sys.stdout.write(cluster_id)
+        total_queries = 0
+        ip_list = set()
+        for file_name in clusters[cluster_id]:
+            sys.stdout.write(".")
+            sys.stdout.flush()
+            file_path = join(input_folder, file_name)
+            nb_queries = 0
+            nb_ip = 0
+            for line in open(file_path, "r"):
+                ok, ip, count = parse_imrs(line)
+                if ok:
+                    # nb_ip counts the addresses listed in this file;
+                    # ip_list deduplicates them across the whole cluster.
+                    nb_ip += 1
+                    ip_list.add(ip)
+                    nb_queries += count
+            if is_instances:
+                file_parts = file_name.split("_")
+                instance_id = file_parts[0]
+                F.write(cluster_id + "," + instance_id + "," + str(nb_ip) + "," + str(nb_queries) + ",\n")
+            total_queries += nb_queries
+        total_ip = len(ip_list)
+        F.write(cluster_id + ",total," + str(total_ip) + "," + str(total_queries) + ",\n")
+print("\nAll done.")
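
Note (illustration only, not part of this change): the scripts above share the same minimal per-IP CSV convention, with the IP address in the first column and the query count in the second. The sketch below is one way to sanity-check the cumulative-frequency logic by feeding imrs_frequency.py a tiny synthetic file with a 10% load step. The file names, addresses and counts are made up, and it assumes python3 is available and that the script is invoked from the imrs/ directory.

    # smoke_test_frequency.py -- hypothetical smoke test, not part of this change.
    # Writes a tiny synthetic "ip,count," file, runs imrs_frequency.py on it
    # with a 10% load step, and prints the resulting cumulative table.
    import os
    import subprocess
    import tempfile

    rows = [("192.0.2.1", 70), ("192.0.2.2", 20), ("192.0.2.3", 10)]  # made-up sample data

    with tempfile.TemporaryDirectory() as tmp_dir:
        imrs_path = os.path.join(tmp_dir, "imrs_counts.csv")
        out_path = os.path.join(tmp_dir, "frequency.csv")
        with open(imrs_path, "w") as f:
            for ip, count in rows:
                f.write(ip + "," + str(count) + ",\n")
        # Assumes imrs_frequency.py is in the current directory.
        subprocess.run(["python3", "imrs_frequency.py", imrs_path, out_path, "10%"], check=True)
        with open(out_path, "r") as f:
            print(f.read())

With the sample data above, the expected output is the "Count, Queries, frequency," header followed by three rows, ending with the full load (3 addresses, 100 queries, frequency 1.0).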