From 2d722545adbdeea94ceff83aecb5410b6694c5a6 Mon Sep 17 00:00:00 2001 From: Christian Huitema Date: Sun, 14 Jul 2024 19:05:43 -0700 Subject: [PATCH] Produce first batch of classifiers --- imrs/imrs_classifier.py | 182 ++++++--------------------------- imrs/imrs_classifier2.py | 123 ++++++++++++++++++++++ imrs/imrs_classifier3.py | 77 ++++++++++++++ imrs/imrs_pandas.py | 216 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 447 insertions(+), 151 deletions(-) create mode 100644 imrs/imrs_classifier2.py create mode 100644 imrs/imrs_classifier3.py create mode 100644 imrs/imrs_pandas.py diff --git a/imrs/imrs_classifier.py b/imrs/imrs_classifier.py index 2051467..eae6e17 100644 --- a/imrs/imrs_classifier.py +++ b/imrs/imrs_classifier.py @@ -18,78 +18,8 @@ import numpy as np from sklearn.linear_model import LinearRegression import random - -def file_has_header(imrs_file): - has_header = False - # get the first line - line = "" - for line in open(imrs_file, "r"): - break - if len(line) > 0: - parts = line.split(",") - if len(parts) > 1: - try: - queries = int(parts[1]) - print("No header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...") - has_header = False - except: - print("header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...") - has_header = True - return has_header - -def load_imrs_to_frame(imrs_file): - if file_has_header(imrs_file): - df = pd.read_csv(imrs_ratio_file) - else: - df = pd.read_csv(imrs_file, header=None, names=imrs.imrs_headers, dtype={"network": str}, index_col = False) - return df - -def protected_ratio(v, d): - r = 0 - if d > 0: - r = v/d - return r - -def protected_count(x, r, list): - s = r - if r > 0: - mx = 0 - for k in list: - if x[k] > mx: - mx = x[k] - s = r*mx - count = 0 - for k in list: - if x[k] > s: - count += 1 - return count - -def reset_d31(x, list): - s = 0 - for k in list: - s += x[k] - d31 = x["queries"] - s - if d31 < 0: - d31 = 0 - return d31 - -def compute_nb_tlds(x): - tld_count = 0 - for tld in [ "COM", "NET", "ORG", "INFO", "CN", "IN", "DE", "US" ]: - if x[tld] > 0: - tld_count += 1 - tld_count += int(x["TLDs"]) - return tld_count - -def compute_nb_slds(x): - sld_count = 0 - for sld in [ "RESOLVER", "EC2", "CLOUD", "WPAD", "CORP", "MAIL", "_TCP", "PROD" ]: - if x[sld] > 0: - sld_count += 1 - sld_count += int(x["SLDs"]) - if sld_count < 1: - sld_count = 1 - return sld_count +import imrs_pandas +from imrs_pandas import print_stats, plot_or_save, example_and_count, print_names, print_mean def compute_l10_sa(x, y, n, intercept): d = float(intercept) @@ -97,39 +27,6 @@ def compute_l10_sa(x, y, n, intercept): d += float(x[n[i]]*y[i]) return d -def print_stats(x_df, name): - print(name) - x_des = x_df.describe() - print(x_des.transpose()) - x_cor = x_df.corr() - print(x_cor) - -def plot_or_save(plot_dir, image_name): - if plot_dir == "-": - plt.show() - else: - image_path = join(plot_dir, image_name) - plt.savefig(image_path) - -def example_and_count(df, name): - count = df.shape[0] - all_rows = df.shape[1] - queries = 0 - network = "" - sample = df.sample(13) - nb_rows = sample.shape[0] - # print(name + ": samples = " + str(nb_rows) + ", out of " + str(all_rows)) - sdp = sample[["network", "queries"]] - sdp_np = sdp.to_numpy() - # print("Sample shape: " + str(sdp.shape)) - # print("Sdp_np shape: " + str(np.shape(sdp_np))) - for i in range(np.shape(sdp_np)[0]): - if sdp_np[i,1] > queries: - queries = sdp_np[i,1] - network = str(sdp_np[i,0]) - print(name + ": count=" + str(count) + ", network=" + network) - return 
count, network - # main if len(sys.argv) != 2 and len(sys.argv) != 3: for x in range(0, len(sys.argv)): @@ -141,54 +38,16 @@ def example_and_count(df, name): if len(sys.argv) == 3: plot_dir = sys.argv[2] -full_df = load_imrs_to_frame(imrs_file) +full_df = imrs_pandas.load_imrs_to_frame(imrs_file) print("Loaded full") -# apply corections for day overlow bug: -# ignore d00, it is always 0 -# compute d31 = queries - sum (d01..d30) -# compute arpa = arpa0 - d31 -days = [ - "d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", "d10", \ - "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", \ - "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", \ - "d31" ] -full_df["d31"] = full_df.apply(lambda x: reset_d31(x, days[:-1]), axis=1) -full_df["arpa"] = full_df["arpa0"] - full_df["d31"] - -print("Computed corrections") -# compute the good column -full_df["good"] = full_df.apply(lambda x: x["queries"] - x["no_such"], axis=1) -# compute the ratio of good over APNIC -full_df["r_good_apnic"] = full_df.apply(lambda x: protected_ratio(x["good"], x["APNIC"]), axis=1) - -# compute log10 column of queries and apnic -full_df["l10_q"] = np.log10(full_df["queries"]) -full_df["l10_a"] = np.log10(2*full_df["APNIC"] + 1) -full_df["l10_g"] = np.log10(2*full_df["good"] + 1) -full_df["l_tld"] = np.log10(2*full_df["TLDs"] + 1) -full_df["l_sld"] = np.log10(2*full_df["SLDs"] + 1) -# add columns for ratios -for d in [ "no_such", "AAAA", "NS", "PTR", "NSEC", "SOA", "APNIC" ]: - r_d = "r_" + d - full_df[r_d] = full_df[d] / full_df["queries"] - -full_df["r_arpa"] = full_df["arpa"] / (2*full_df["queries"]) - -full_df["r_COM"] = full_df.apply(lambda x: protected_ratio(x["COM"], x["queries"] - x["no_such"]), axis=1) -full_df["r_INFO"] = full_df.apply(lambda x: protected_ratio(x["INFO"], x["queries"] - x["no_such"]), axis=1) -print("Computed ratios") - -hours = ["h00", "h01", "h02", "h03", "h04", "h05", "h06", "h07", "h08", "h09", \ - "h10", "h11", "h12", "h13", "h14", "h15", "h16", "h17", "h18", "h19", \ - "h20", "h21", "h22", "h23" ] - -full_df["h_count"] = full_df.apply(lambda x: protected_count(x, 0, hours), axis=1) -full_df["d_count"] = full_df.apply(lambda x: protected_count(x, 0, days), axis=1) -print("Computed hours") + +imrs_pandas.imrs_corrections(full_df) +print("Applied corrections") # First, study the APNIC Data # get APNIC subset apnic_df = full_df[full_df["l10_a"] > 0] +apnic_selected = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ] # select 4 subsets, based on 2 variables apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1] @@ -199,6 +58,13 @@ def example_and_count(df, name): apnic_hineg_loap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] < 300] apnic_hineg_hiap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] >= 300] +print_names(apnic_loneg_loap_df) +print_mean(apnic_loneg_loap_df,"apnic_loneg_loap_df", apnic_selected) +print_mean(apnic_loneg_hiap_df,"apnic_loneg_hiap_df", apnic_selected) +print_mean(apnic_hineg_loap_df,"apnic_hineg_loap_df", apnic_selected) +print_mean(apnic_hineg_hiap_df,"apnic_hineg_hiap_df", apnic_selected) + + # select 4 subsets, based on 2 variables apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1] apnic_hineg_df = apnic_df[apnic_df["r_no_such"] >= 0.1] @@ -222,6 +88,14 @@ def example_and_count(df, name): apnic_hineg_hiap_df.plot.scatter(ax=axb, x="queries", y="APNIC", alpha=0.5, color="red") plot_or_save(plot_dir, "apnic-queries.jpg") 
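+# The 2x2 split above could be captured in a small helper and reused for
+# the non-APNIC networks further down. Illustrative sketch only, with a
+# hypothetical name; the thresholds are the same 0.1 no_such ratio and
+# 300 good/APNIC ratio used above:
+#
+#   def split_2x2(df, neg_cut=0.1, ap_cut=300):
+#       loneg = df[df["r_no_such"] < neg_cut]
+#       hineg = df[df["r_no_such"] >= neg_cut]
+#       return (loneg[loneg["r_good_apnic"] < ap_cut],
+#               loneg[loneg["r_good_apnic"] >= ap_cut],
+#               hineg[hineg["r_good_apnic"] < ap_cut],
+#               hineg[hineg["r_good_apnic"] >= ap_cut])
+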
+axtld = apnic_df.plot.scatter(x="APNIC", y="TLDs", alpha=0.5, logx=True, logy=False, color="blue") +plot_or_save(plot_dir, "tlds-apnic.jpg") +axcom = apnic_df.plot.scatter(x="APNIC", y="COM", alpha=0.5, logx=True, logy=False, color="blue") +plot_or_save(plot_dir, "com-apnic.jpg") +axnosuch = full_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue") +apnic_df.plot.scatter(ax=axnosuch, x="queries", y="r_no_such", alpha=0.5, color="orange") +plot_or_save(plot_dir, "no_such-queries.jpg") + # plot APNIC/Queries/no_such axb = apnic_loneg_loap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue") apnic_loneg_hiap_df.plot.scatter(ax=axb, x="queries", y="r_no_such", alpha=0.5, color="green") @@ -236,7 +110,6 @@ def example_and_count(df, name): # study the APNIC correlations # get a view of only the important columns -apnic_selected = [ "network", "l10_q", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l_tld", "l_sld", "l10_a" ] full_selected_df = full_df[apnic_selected] apnic_selected_df = full_selected_df[full_selected_df["l10_a"] > 0] @@ -260,6 +133,7 @@ def example_and_count(df, name): full_df["l10_sa"] = full_df.apply(lambda x: compute_l10_sa(x, lr.coef_.T, apnic_coeffs[:-1], lr.intercept_[0]), axis=1) full_df["l10_gsa"] = full_df.apply(lambda x: x["l10_g"] - x["l10_sa"], axis=1) #print(list(full_df)) + apnic_coeffs_x = [ "network", "l10_sa", "l10_gsa", "l10_q", "r_no_such", "l10_a", "queries" ] full_data_x_df = full_df[apnic_coeffs_x] # print(list(full_data_x_df)) @@ -274,7 +148,7 @@ def example_and_count(df, name): #print_stats(full_df["l10_sa", "l10_a"], "full_df") # apply regression to classify not APNIC traffic -notap_df = full_data_x_df[full_data_x_df["l10_a"] == 0] +notap_df = full_df[full_df["l10_a"] == 0] axp = notap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.1, logx=True, logy=False, color="blue") apnic_df.plot.scatter(ax=axp, x="queries", y="r_no_such", alpha=0.2, color="orange") @@ -318,4 +192,10 @@ def example_and_count(df, name): example_and_count(notap_loneg_loap_df, "notap_loneg_loap_df (blue)") example_and_count(notap_loneg_hiap_df, "notap_loneg_hiap_df (green)") example_and_count(notap_hineg_loap_df, "notap_hineg_loap_df (orange)") -example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)") \ No newline at end of file +example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)") + +print_names(notap_loneg_loap_df) +print_mean(notap_loneg_loap_df,"notap_loneg_loap_df", apnic_selected) +print_mean(notap_loneg_hiap_df,"notap_loneg_hiap_df", apnic_selected) +print_mean(notap_hineg_loap_df,"notap_hineg_loap_df", apnic_selected) +print_mean(notap_hineg_hiap_df,"notap_hineg_hiap_df", apnic_selected) \ No newline at end of file diff --git a/imrs/imrs_classifier2.py b/imrs/imrs_classifier2.py new file mode 100644 index 0000000..c87bae2 --- /dev/null +++ b/imrs/imrs_classifier2.py @@ -0,0 +1,123 @@ +# +# Exploration of the ipstats file for each network +# + +import sys +import traceback +import random +import time +import concurrent.futures +import math +import os +from os import listdir +from os.path import isfile, isdir, join +import imrs +from imrs import parse_imrs_volume_only, apnic_record +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from sklearn.linear_model import LinearRegression +import random +import imrs_pandas +from imrs_pandas import print_stats, save_stats, save_selected_stats, \ + plot_or_save, plot_and_explore, example_and_count, \ 
+    print_names, print_mean
+
+
+# main
+if len(sys.argv) != 2 and len(sys.argv) != 3:
+    for x in range(0, len(sys.argv)):
+        print(str(x) + ":" + sys.argv[x])
+    print("Usage: imrs_classifier2.py <imrs_file> [<plot_dir>]")
+    exit(1)
+imrs_file = sys.argv[1]
+plot_dir = "-"
+out_file = sys.stdout
+if len(sys.argv) == 3:
+    plot_dir = sys.argv[2]
+    csv_path = join(plot_dir, "stats.csv")
+    out_file = open(csv_path, "w")
+    out_file.write("frame, property, count, mean, std, min, c25%, c50%, c75%, max\n")
+
+
+full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
+print("Loaded full")
+
+imrs_pandas.imrs_corrections(full_df)
+print("Applied corrections")
+
+tracked = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]
+save_selected_stats(out_file, full_df, tracked, "full_df")
+plot_and_explore(full_df, plot_dir, "full", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# First, isolate the "small" nodes, defined
+# as sending no more than 100 queries.
+small_df = full_df[full_df["queries"] <= 100]
+big_df = full_df[full_df["queries"] > 100]
+
+save_selected_stats(out_file, small_df, tracked,"small_df")
+save_selected_stats(out_file, big_df, tracked,"big_df")
+
+# then, create three subsets of the big sites:
+# ns_low: no-such < 5%
+# ns_high: no-such > 90%
+# ns_mid: in between
+
+ns_low = big_df[big_df["r_no_such"] < 0.05]
+ns_other = big_df[big_df["r_no_such"] >= 0.05]
+ns_high = ns_other[ns_other["r_no_such"] > 0.9]
+ns_mid = ns_other[ns_other["r_no_such"] <= 0.9]
+
+save_selected_stats(out_file, ns_other, tracked,"ns_other")
+plot_and_explore(ns_other, plot_dir, "other", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+save_selected_stats(out_file, ns_low, tracked,"ns_low")
+save_selected_stats(out_file, ns_high, tracked,"ns_high")
+save_selected_stats(out_file, ns_mid, tracked,"ns_mid")
+
+# At this stage, we have separated 4 groups.
+# We will ignore the "small" group for now, because in the absence of
+# traffic it is hard to classify anything.
+
+plot_and_explore(ns_low, plot_dir, "low", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(ns_mid, plot_dir, "mid", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(ns_high, plot_dir, "high", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# In the "low NS" group, the plot of TLDs versus queries shows a break
+# somewhere between 500 and 1000 TLDs seen. Above that line we find
+# very few APNIC servers but many large non-APNIC nodes. These could
+# be nodes engaged in some kind of scanning process.
+
+low_lt500t = ns_low[ns_low["TLDs"] <= 500]
+low_gt500t = ns_low[ns_low["TLDs"] > 500]
+save_selected_stats(out_file, low_lt500t, tracked,"low_lt500t")
+save_selected_stats(out_file, low_gt500t, tracked,"low_gt500t")
+plot_and_explore(low_lt500t, plot_dir, "low_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(low_gt500t, plot_dir, "low_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# In the "high NS" group, there seem
+# to be two interesting subgroups: more than 500 TLDs, as in the
+# "low" case, and more than about 10^6 queries, which separates
+# a bunch of high values from the bulk of APNIC resolvers.
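+# (The repeated two-way threshold splits below could share a helper;
+#  a minimal sketch, using the hypothetical name split_on_tlds, which
+#  is not part of this patch:
+#      def split_on_tlds(df, cut=500):
+#          return df[df["TLDs"] <= cut], df[df["TLDs"] > cut]
+#  the inline form is kept so the cut value stays easy to tweak.)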
+
+high_lt500t = ns_high[ns_high["TLDs"] <= 500]
+high_gt500t = ns_high[ns_high["TLDs"] > 500]
+save_selected_stats(out_file, high_lt500t, tracked,"high_lt500t")
+save_selected_stats(out_file, high_gt500t, tracked,"high_gt500t")
+plot_and_explore(high_lt500t, plot_dir, "high_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(high_gt500t, plot_dir, "high_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# In the "mid" group, the pictures are murky. There seems to be
+# a separation between resolvers with more than 1 million
+# queries and the others. (or is it 100K?)
+
+mid_lt1Mq = ns_mid[ns_mid["queries"] <= 1000000]
+mid_gt1Mq = ns_mid[ns_mid["queries"] > 1000000]
+
+save_selected_stats(out_file, mid_lt1Mq, tracked,"mid_lt1Mq")
+save_selected_stats(out_file, mid_gt1Mq, tracked,"mid_gt1Mq")
+plot_and_explore(mid_lt1Mq, plot_dir, "mid_lt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(mid_gt1Mq, plot_dir, "mid_gt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+if out_file != sys.stdout:
+    out_file.close()
\ No newline at end of file
diff --git a/imrs/imrs_classifier3.py b/imrs/imrs_classifier3.py
new file mode 100644
index 0000000..b9e6ecb
--- /dev/null
+++ b/imrs/imrs_classifier3.py
@@ -0,0 +1,77 @@
+#
+# Exploration of the ipstats file for each network
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+import imrs
+from imrs import parse_imrs_volume_only, apnic_record
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.linear_model import LinearRegression
+import random
+import imrs_pandas
+from imrs_pandas import print_stats, save_stats, save_selected_stats, \
+    plot_or_save, plot_and_explore, example_and_count, \
+    print_names, print_mean
+
+
+# main
+if len(sys.argv) != 2 and len(sys.argv) != 3:
+    for x in range(0, len(sys.argv)):
+        print(str(x) + ":" + sys.argv[x])
+    print("Usage: imrs_classifier3.py <imrs_file> [<plot_dir>]")
+    exit(1)
+imrs_file = sys.argv[1]
+plot_dir = "-"
+out_file = sys.stdout
+if len(sys.argv) == 3:
+    plot_dir = sys.argv[2]
+    csv_path = join(plot_dir, "stats-v3.csv")
+    out_file = open(csv_path, "w")
+    out_file.write("frame, property, count, mean, std, min, c25%, c50%, c75%, max\n")
+
+
+full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
+print("Loaded full")
+
+imrs_pandas.imrs_corrections(full_df)
+print("Applied corrections")
+explored = [ 'r_arpa', 'l_tld', 'l_com', 'r_COM', 'r_no_such' ]
+
+tracked = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]
+save_selected_stats(out_file, full_df, tracked, "full_df")
+plot_and_explore(full_df, plot_dir, "full", 'queries', explored, lx=True, ly=False)
+
+# First, isolate the "small" nodes, defined
+# as sending no more than 100 queries.
+small_df = full_df[full_df["queries"] <= 100]
+big_df = full_df[full_df["queries"] > 100]
+
+save_selected_stats(out_file, small_df, tracked,"small")
+save_selected_stats(out_file, big_df, tracked,"big")
+plot_and_explore(small_df, plot_dir, "small", 'queries', explored, lx=True, ly=False)
+plot_and_explore(big_df, plot_dir, "big", 'queries', explored, lx=True, ly=False)
+
+# then, create two subsets of the big sites:
+# ns_medium: < 100K queries
+# ns_large: >= 100K queries
+
+ns_medium = big_df[big_df['queries'] < 100000 ]
+ns_large = big_df[big_df['queries'] >= 100000 ]
+
+save_selected_stats(out_file, ns_medium, tracked,"medium")
+save_selected_stats(out_file, ns_large, tracked,"large")
+plot_and_explore(ns_medium, plot_dir, "medium", 'queries', explored, lx=True, ly=False)
+plot_and_explore(ns_large, plot_dir, "large", 'queries', explored, lx=True, ly=False)
+
+if out_file != sys.stdout:
+    out_file.close()
\ No newline at end of file
diff --git a/imrs/imrs_pandas.py b/imrs/imrs_pandas.py
new file mode 100644
index 0000000..4462e62
--- /dev/null
+++ b/imrs/imrs_pandas.py
@@ -0,0 +1,216 @@
+#
+# Common functions for parsing the imrs CSV files
+# in pandas format.
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+import imrs
+from imrs import parse_imrs_volume_only, apnic_record
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.linear_model import LinearRegression
+import random
+
+def file_has_header(imrs_file):
+    has_header = False
+    # get the first line
+    with open(imrs_file, "r") as f:
+        line = f.readline()
+    if len(line) > 0:
+        parts = line.split(",")
+        if len(parts) > 1:
+            try:
+                queries = int(parts[1])
+                print("No header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
+                has_header = False
+            except ValueError:
+                print("Header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
+                has_header = True
+    return has_header
+
+def load_imrs_to_frame(imrs_file):
+    if file_has_header(imrs_file):
+        df = pd.read_csv(imrs_file)
+    else:
+        df = pd.read_csv(imrs_file, header=None, names=imrs.imrs_headers, dtype={"network": str}, index_col = False)
+    return df
+
+def protected_ratio(v, d):
+    r = 0
+    if d > 0:
+        r = v/d
+    return r
+
+def protected_count(x, r, list):
+    s = r
+    if r > 0:
+        mx = 0
+        for k in list:
+            if x[k] > mx:
+                mx = x[k]
+        s = r*mx
+    count = 0
+    for k in list:
+        if x[k] > s:
+            count += 1
+    return count
+
+def reset_d31(x, list):
+    s = 0
+    for k in list:
+        s += x[k]
+    d31 = x["queries"] - s
+    if d31 < 0:
+        d31 = 0
+    return d31
+
+def compute_nb_tlds(x):
+    tld_count = 0
+    for tld in [ "COM", "NET", "ORG", "INFO", "CN", "IN", "DE", "US" ]:
+        if x[tld] > 0:
+            tld_count += 1
+    tld_count += int(x["TLDs"])
+    return tld_count
+
+def compute_nb_slds(x):
+    sld_count = 0
+    for sld in [ "RESOLVER", "EC2", "CLOUD", "WPAD", "CORP", "MAIL", "_TCP", "PROD" ]:
+        if x[sld] > 0:
+            sld_count += 1
+    sld_count += int(x["SLDs"])
+    if sld_count < 1:
+        sld_count = 1
+    return sld_count
+
+def save_stats(f, x_df, name):
+    x_des = x_df.describe()
+    # print(x_des.transpose())
+    for h in x_des:
+        c = x_des[h]
+        l = c.transpose()
+        f.write(name + ", " + str(h))
+        for w in l:
+            f.write(", " + str(w))
+        f.write("\n")
+
+def save_selected_stats(out_file, df, selection, name):
+    df_view = df[selection]
+    save_stats(out_file, df_view, name)
+
+def print_stats(x_df, name):
+    print(name)
+    x_des = x_df.describe()
+    print(x_des.transpose())
+    if 'queries' in x_df.columns:
+        print("Queries " + str(x_df['queries'].sum()))
+    if 'APNIC' in x_df.columns:
+        dfa = x_df[x_df['APNIC']>0]
+        print("APNIC " + str(dfa['APNIC'].sum()) + ", " + str(dfa['queries'].sum()) + ", " + str(dfa['APNIC'].count()))
+    # x_cor = x_df.corr()
+    # print(x_cor)
+
+
+def plot_or_save(plot_dir, image_name):
+    if plot_dir == "-":
+        plt.show()
+    else:
+        image_path = join(plot_dir, image_name)
+        plt.savefig(image_path)
+        plt.close(fig="all")
+
+def plot_and_explore(df, plot_dir, name, x_key, y_keys, lx=False, ly=False):
+    # overlay networks without APNIC samples (orange) on networks
+    # with APNIC samples (blue)
+    dfa = df[ df['APNIC'] == 0 ]
+    dfo = df[ df['APNIC'] != 0 ]
+    for y_key in y_keys:
+        axa = dfo.plot.scatter(x=x_key, y=y_key, alpha=0.5, logx=lx, logy=ly, color="blue")
+        dfa.plot.scatter(ax=axa, x=x_key, y=y_key, alpha=0.5, color="orange")
+        plot_or_save(plot_dir, name +"-"+x_key+"-"+y_key+".jpg")
+
+def example_and_count(df, name):
+    count = df.shape[0]
+    all_rows = df.shape[1]
+    queries = 0
+    network = ""
+    # sample up to 13 rows; df.sample raises if asked for more rows
+    # than the frame contains
+    sample = df.sample(min(13, count))
+    nb_rows = sample.shape[0]
+    # print(name + ": samples = " + str(nb_rows) + ", out of " + str(all_rows))
+    sdp = sample[["network", "queries"]]
+    sdp_np = sdp.to_numpy()
+    # print("Sample shape: " + str(sdp.shape))
+    # print("Sdp_np shape: " + str(np.shape(sdp_np)))
+    for i in range(np.shape(sdp_np)[0]):
+        if sdp_np[i,1] > queries:
+            queries = sdp_np[i,1]
+            network = str(sdp_np[i,0])
+    print(name + ": count=" + str(count) + ", network=" + network)
+    return count, network
+
+def print_names(df):
+    s = ""
+    for n in list(df):
+        if n != "network":
+            s += n + ", "
+    print(s)
+
+def print_mean(df, name, n_list):
+    # note: this reports the median of each column; the median is less
+    # sensitive to outliers than the mean
+    s = name + ", "
+    for n in n_list:
+        if n != "network":
+            s += str(df[n].median()) + ", "
+    print(s)
+
+
+def imrs_corrections(full_df):
+    # apply corrections for the day overflow bug:
+    # ignore d00, it is always 0
+    # compute d31 = queries - sum(d01..d30)
+    # compute arpa = arpa0 - d31
+    days = [
+        "d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", "d10", \
+        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", \
+        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", \
+        "d31" ]
+    full_df["d31"] = full_df.apply(lambda x: reset_d31(x, days[:-1]), axis=1)
+    full_df["arpa"] = full_df["arpa0"] - full_df["d31"]
+
+    print("Computed corrections")
+    # compute the good column
+    full_df["good"] = full_df.apply(lambda x: x["queries"] - x["no_such"], axis=1)
+    # compute the ratio of good over APNIC
+    full_df["r_good_apnic"] = full_df.apply(lambda x: protected_ratio(x["good"], x["APNIC"]), axis=1)
+
+    # compute log10 columns of queries and apnic
+    full_df["l10_q"] = np.log10(full_df["queries"])
+    full_df["l10_a"] = np.log10(2*full_df["APNIC"] + 1)
+    full_df["l10_g"] = np.log10(2*full_df["good"] + 1)
+    full_df["l_tld"] = np.log10(2*full_df["TLDs"] + 1)
+    full_df["l_sld"] = np.log10(2*full_df["SLDs"] + 1)
+    full_df["l_com"] = np.log10(2*full_df["COM"] + 1)
+    # add columns for ratios
+    for d in [ "no_such", "AAAA", "NS", "PTR", "NSEC", "SOA", "APNIC" ]:
+        r_d = "r_" + d
+        full_df[r_d] = full_df[d] / full_df["queries"]
+
+    full_df["r_arpa"] = full_df["arpa"] / (2*full_df["queries"])
+
+    full_df["r_COM"] = full_df.apply(lambda x: protected_ratio(x["COM"], x["queries"] - x["no_such"]), axis=1)
+    full_df["r_INFO"] = full_df.apply(lambda x: protected_ratio(x["INFO"], x["queries"] - x["no_such"]), axis=1)
+    print("Computed ratios")
+
+    hours = ["h00", "h01", "h02", "h03", "h04", "h05", "h06", "h07", "h08", "h09", \
+        "h10", "h11", "h12", "h13", "h14", "h15", "h16", "h17", "h18", "h19", \
+        "h20", "h21", "h22", "h23" ]
+
+    full_df["h_count"] = full_df.apply(lambda x: protected_count(x, 0, hours), axis=1)
+    full_df["d_count"] = full_df.apply(lambda x: protected_count(x, 0, days), axis=1)
+    print("Computed hours")
\ No newline at end of file