From 2d722545adbdeea94ceff83aecb5410b6694c5a6 Mon Sep 17 00:00:00 2001 From: Christian Huitema Date: Sun, 14 Jul 2024 19:05:43 -0700 Subject: [PATCH] Produce first batch of classifiers --- imrs/imrs_classifier.py | 182 ++++++--------------------------- imrs/imrs_classifier2.py | 123 ++++++++++++++++++++++ imrs/imrs_classifier3.py | 77 ++++++++++++++ imrs/imrs_pandas.py | 216 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 447 insertions(+), 151 deletions(-) create mode 100644 imrs/imrs_classifier2.py create mode 100644 imrs/imrs_classifier3.py create mode 100644 imrs/imrs_pandas.py diff --git a/imrs/imrs_classifier.py b/imrs/imrs_classifier.py index 2051467..eae6e17 100644 --- a/imrs/imrs_classifier.py +++ b/imrs/imrs_classifier.py @@ -18,78 +18,8 @@ import numpy as np from sklearn.linear_model import LinearRegression import random - -def file_has_header(imrs_file): - has_header = False - # get the first line - line = "" - for line in open(imrs_file, "r"): - break - if len(line) > 0: - parts = line.split(",") - if len(parts) > 1: - try: - queries = int(parts[1]) - print("No header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...") - has_header = False - except: - print("header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...") - has_header = True - return has_header - -def load_imrs_to_frame(imrs_file): - if file_has_header(imrs_file): - df = pd.read_csv(imrs_ratio_file) - else: - df = pd.read_csv(imrs_file, header=None, names=imrs.imrs_headers, dtype={"network": str}, index_col = False) - return df - -def protected_ratio(v, d): - r = 0 - if d > 0: - r = v/d - return r - -def protected_count(x, r, list): - s = r - if r > 0: - mx = 0 - for k in list: - if x[k] > mx: - mx = x[k] - s = r*mx - count = 0 - for k in list: - if x[k] > s: - count += 1 - return count - -def reset_d31(x, list): - s = 0 - for k in list: - s += x[k] - d31 = x["queries"] - s - if d31 < 0: - d31 = 0 - return d31 - -def compute_nb_tlds(x): - tld_count = 0 - for tld in [ "COM", "NET", "ORG", "INFO", "CN", "IN", "DE", "US" ]: - if x[tld] > 0: - tld_count += 1 - tld_count += int(x["TLDs"]) - return tld_count - -def compute_nb_slds(x): - sld_count = 0 - for sld in [ "RESOLVER", "EC2", "CLOUD", "WPAD", "CORP", "MAIL", "_TCP", "PROD" ]: - if x[sld] > 0: - sld_count += 1 - sld_count += int(x["SLDs"]) - if sld_count < 1: - sld_count = 1 - return sld_count +import imrs_pandas +from imrs_pandas import print_stats, plot_or_save, example_and_count, print_names, print_mean def compute_l10_sa(x, y, n, intercept): d = float(intercept) @@ -97,39 +27,6 @@ def compute_l10_sa(x, y, n, intercept): d += float(x[n[i]]*y[i]) return d -def print_stats(x_df, name): - print(name) - x_des = x_df.describe() - print(x_des.transpose()) - x_cor = x_df.corr() - print(x_cor) - -def plot_or_save(plot_dir, image_name): - if plot_dir == "-": - plt.show() - else: - image_path = join(plot_dir, image_name) - plt.savefig(image_path) - -def example_and_count(df, name): - count = df.shape[0] - all_rows = df.shape[1] - queries = 0 - network = "" - sample = df.sample(13) - nb_rows = sample.shape[0] - # print(name + ": samples = " + str(nb_rows) + ", out of " + str(all_rows)) - sdp = sample[["network", "queries"]] - sdp_np = sdp.to_numpy() - # print("Sample shape: " + str(sdp.shape)) - # print("Sdp_np shape: " + str(np.shape(sdp_np))) - for i in range(np.shape(sdp_np)[0]): - if sdp_np[i,1] > queries: - queries = sdp_np[i,1] - network = str(sdp_np[i,0]) - print(name + ": count=" + str(count) + ", network=" + network) - return 
count, network - # main if len(sys.argv) != 2 and len(sys.argv) != 3: for x in range(0, len(sys.argv)): @@ -141,54 +38,16 @@ def example_and_count(df, name): if len(sys.argv) == 3: plot_dir = sys.argv[2] -full_df = load_imrs_to_frame(imrs_file) +full_df = imrs_pandas.load_imrs_to_frame(imrs_file) print("Loaded full") -# apply corections for day overlow bug: -# ignore d00, it is always 0 -# compute d31 = queries - sum (d01..d30) -# compute arpa = arpa0 - d31 -days = [ - "d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", "d10", \ - "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", \ - "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", \ - "d31" ] -full_df["d31"] = full_df.apply(lambda x: reset_d31(x, days[:-1]), axis=1) -full_df["arpa"] = full_df["arpa0"] - full_df["d31"] - -print("Computed corrections") -# compute the good column -full_df["good"] = full_df.apply(lambda x: x["queries"] - x["no_such"], axis=1) -# compute the ratio of good over APNIC -full_df["r_good_apnic"] = full_df.apply(lambda x: protected_ratio(x["good"], x["APNIC"]), axis=1) - -# compute log10 column of queries and apnic -full_df["l10_q"] = np.log10(full_df["queries"]) -full_df["l10_a"] = np.log10(2*full_df["APNIC"] + 1) -full_df["l10_g"] = np.log10(2*full_df["good"] + 1) -full_df["l_tld"] = np.log10(2*full_df["TLDs"] + 1) -full_df["l_sld"] = np.log10(2*full_df["SLDs"] + 1) -# add columns for ratios -for d in [ "no_such", "AAAA", "NS", "PTR", "NSEC", "SOA", "APNIC" ]: - r_d = "r_" + d - full_df[r_d] = full_df[d] / full_df["queries"] - -full_df["r_arpa"] = full_df["arpa"] / (2*full_df["queries"]) - -full_df["r_COM"] = full_df.apply(lambda x: protected_ratio(x["COM"], x["queries"] - x["no_such"]), axis=1) -full_df["r_INFO"] = full_df.apply(lambda x: protected_ratio(x["INFO"], x["queries"] - x["no_such"]), axis=1) -print("Computed ratios") - -hours = ["h00", "h01", "h02", "h03", "h04", "h05", "h06", "h07", "h08", "h09", \ - "h10", "h11", "h12", "h13", "h14", "h15", "h16", "h17", "h18", "h19", \ - "h20", "h21", "h22", "h23" ] - -full_df["h_count"] = full_df.apply(lambda x: protected_count(x, 0, hours), axis=1) -full_df["d_count"] = full_df.apply(lambda x: protected_count(x, 0, days), axis=1) -print("Computed hours") + +imrs_pandas.imrs_corrections(full_df) +print("Applied corrections") # First, study the APNIC Data # get APNIC subset apnic_df = full_df[full_df["l10_a"] > 0] +apnic_selected = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ] # select 4 subsets, based on 2 variables apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1] @@ -199,6 +58,13 @@ def example_and_count(df, name): apnic_hineg_loap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] < 300] apnic_hineg_hiap_df = apnic_hineg_df[apnic_hineg_df["r_good_apnic"] >= 300] +print_names(apnic_loneg_loap_df) +print_mean(apnic_loneg_loap_df,"apnic_loneg_loap_df", apnic_selected) +print_mean(apnic_loneg_hiap_df,"apnic_loneg_hiap_df", apnic_selected) +print_mean(apnic_hineg_loap_df,"apnic_hineg_loap_df", apnic_selected) +print_mean(apnic_hineg_hiap_df,"apnic_hineg_hiap_df", apnic_selected) + + # select 4 subsets, based on 2 variables apnic_loneg_df = apnic_df[apnic_df["r_no_such"] < 0.1] apnic_hineg_df = apnic_df[apnic_df["r_no_such"] >= 0.1] @@ -222,6 +88,14 @@ def example_and_count(df, name): apnic_hineg_hiap_df.plot.scatter(ax=axb, x="queries", y="APNIC", alpha=0.5, color="red") plot_or_save(plot_dir, "apnic-queries.jpg") 
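+# The 2x2 split above could be captured in a small helper and reused for
+# the non-APNIC networks further down. Illustrative sketch only, with a
+# hypothetical name; the thresholds are the same 0.1 no_such ratio and
+# 300 good/APNIC ratio used above:
+#
+#   def split_2x2(df, neg_cut=0.1, ap_cut=300):
+#       loneg = df[df["r_no_such"] < neg_cut]
+#       hineg = df[df["r_no_such"] >= neg_cut]
+#       return (loneg[loneg["r_good_apnic"] < ap_cut],
+#               loneg[loneg["r_good_apnic"] >= ap_cut],
+#               hineg[hineg["r_good_apnic"] < ap_cut],
+#               hineg[hineg["r_good_apnic"] >= ap_cut])
+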
+axtld = apnic_df.plot.scatter(x="APNIC", y="TLDs", alpha=0.5, logx=True, logy=False, color="blue") +plot_or_save(plot_dir, "tlds-apnic.jpg") +axcom = apnic_df.plot.scatter(x="APNIC", y="COM", alpha=0.5, logx=True, logy=False, color="blue") +plot_or_save(plot_dir, "com-apnic.jpg") +axnosuch = full_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue") +apnic_df.plot.scatter(ax=axnosuch, x="queries", y="r_no_such", alpha=0.5, color="orange") +plot_or_save(plot_dir, "no_such-queries.jpg") + # plot APNIC/Queries/no_such axb = apnic_loneg_loap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.5, logx=True, logy=False, color="blue") apnic_loneg_hiap_df.plot.scatter(ax=axb, x="queries", y="r_no_such", alpha=0.5, color="green") @@ -236,7 +110,6 @@ def example_and_count(df, name): # study the APNIC correlations # get a view of only the important columns -apnic_selected = [ "network", "l10_q", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l_tld", "l_sld", "l10_a" ] full_selected_df = full_df[apnic_selected] apnic_selected_df = full_selected_df[full_selected_df["l10_a"] > 0] @@ -260,6 +133,7 @@ def example_and_count(df, name): full_df["l10_sa"] = full_df.apply(lambda x: compute_l10_sa(x, lr.coef_.T, apnic_coeffs[:-1], lr.intercept_[0]), axis=1) full_df["l10_gsa"] = full_df.apply(lambda x: x["l10_g"] - x["l10_sa"], axis=1) #print(list(full_df)) + apnic_coeffs_x = [ "network", "l10_sa", "l10_gsa", "l10_q", "r_no_such", "l10_a", "queries" ] full_data_x_df = full_df[apnic_coeffs_x] # print(list(full_data_x_df)) @@ -274,7 +148,7 @@ def example_and_count(df, name): #print_stats(full_df["l10_sa", "l10_a"], "full_df") # apply regression to classify not APNIC traffic -notap_df = full_data_x_df[full_data_x_df["l10_a"] == 0] +notap_df = full_df[full_df["l10_a"] == 0] axp = notap_df.plot.scatter(x="queries", y="r_no_such", alpha=0.1, logx=True, logy=False, color="blue") apnic_df.plot.scatter(ax=axp, x="queries", y="r_no_such", alpha=0.2, color="orange") @@ -318,4 +192,10 @@ def example_and_count(df, name): example_and_count(notap_loneg_loap_df, "notap_loneg_loap_df (blue)") example_and_count(notap_loneg_hiap_df, "notap_loneg_hiap_df (green)") example_and_count(notap_hineg_loap_df, "notap_hineg_loap_df (orange)") -example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)") \ No newline at end of file +example_and_count(notap_hineg_hiap_df, "notap_hineg_hiap_df (red)") + +print_names(notap_loneg_loap_df) +print_mean(notap_loneg_loap_df,"notap_loneg_loap_df", apnic_selected) +print_mean(notap_loneg_hiap_df,"notap_loneg_hiap_df", apnic_selected) +print_mean(notap_hineg_loap_df,"notap_hineg_loap_df", apnic_selected) +print_mean(notap_hineg_hiap_df,"notap_hineg_hiap_df", apnic_selected) \ No newline at end of file diff --git a/imrs/imrs_classifier2.py b/imrs/imrs_classifier2.py new file mode 100644 index 0000000..c87bae2 --- /dev/null +++ b/imrs/imrs_classifier2.py @@ -0,0 +1,123 @@ +# +# Exploration of the ipstats file for each network +# + +import sys +import traceback +import random +import time +import concurrent.futures +import math +import os +from os import listdir +from os.path import isfile, isdir, join +import imrs +from imrs import parse_imrs_volume_only, apnic_record +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from sklearn.linear_model import LinearRegression +import random +import imrs_pandas +from imrs_pandas import print_stats, save_stats, save_selected_stats, \ + plot_or_save, plot_and_explore, example_and_count, \ 
+    print_names, print_mean
+
+
+# main
+if len(sys.argv) != 2 and len(sys.argv) != 3:
+    for x in range(0, len(sys.argv)):
+        print(str(x) + ":" + sys.argv[x])
+    print("Usage: imrs_classifier2.py <imrs_file> [<plot_dir>]")
+    exit(1)
+imrs_file = sys.argv[1]
+plot_dir = "-"
+out_file = sys.stdout
+if len(sys.argv) == 3:
+    plot_dir = sys.argv[2]
+    csv_path = join(plot_dir, "stats.csv")
+    out_file = open(csv_path, "w")
+    out_file.write("frame, property, count, mean, std, min, c25%, c50%, c75%, max\n")
+
+
+full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
+print("Loaded full")
+
+imrs_pandas.imrs_corrections(full_df)
+print("Applied corrections")
+
+tracked = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]
+save_selected_stats(out_file, full_df, tracked, "full_df")
+plot_and_explore(full_df, plot_dir, "full", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# First, isolate the "small" nodes, defined
+# as sending no more than 100 queries.
+small_df = full_df[full_df["queries"] <= 100]
+big_df = full_df[full_df["queries"] > 100]
+
+save_selected_stats(out_file, small_df, tracked,"small_df")
+save_selected_stats(out_file, big_df, tracked,"big_df")
+
+# then, create three subsets of the big sites:
+# ns_low: no-such < 5%
+# ns_high: no-such > 90%
+# ns_mid: in between
+
+ns_low = big_df[big_df["r_no_such"] < 0.05]
+ns_other = big_df[big_df["r_no_such"] >= 0.05]
+ns_high = ns_other[ns_other["r_no_such"] > 0.9]
+ns_mid = ns_other[ns_other["r_no_such"] <= 0.9]
+
+save_selected_stats(out_file, ns_other, tracked,"ns_other")
+plot_and_explore(ns_other, plot_dir, "other", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+save_selected_stats(out_file, ns_low, tracked,"ns_low")
+save_selected_stats(out_file, ns_high, tracked,"ns_high")
+save_selected_stats(out_file, ns_mid, tracked,"ns_mid")
+
+# At this stage, we have separated 4 groups.
+# We will ignore the "small" group for now, because in the absence of
+# traffic it is hard to classify anything.
+
+plot_and_explore(ns_low, plot_dir, "low", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(ns_mid, plot_dir, "mid", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(ns_high, plot_dir, "high", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# In the "low NS" group, the plot of TLDs versus queries shows a break
+# somewhere between 500 and 1000 TLDs seen. Above that line we find
+# very few APNIC servers but many large non-APNIC nodes. These could
+# be nodes engaged in some kind of scanning process.
+
+low_lt500t = ns_low[ns_low["TLDs"] <= 500]
+low_gt500t = ns_low[ns_low["TLDs"] > 500]
+save_selected_stats(out_file, low_lt500t, tracked,"low_lt500t")
+save_selected_stats(out_file, low_gt500t, tracked,"low_gt500t")
+plot_and_explore(low_lt500t, plot_dir, "low_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(low_gt500t, plot_dir, "low_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# In the "high NS" group, there seem
+# to be two interesting subgroups: more than 500 TLDs, as in the
+# "low" case, and more than about 10^6 queries, which separates
+# a bunch of high values from the bulk of APNIC resolvers.
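+# (The repeated two-way threshold splits below could share a helper;
+#  a minimal sketch, using the hypothetical name split_on_tlds, which
+#  is not part of this patch:
+#      def split_on_tlds(df, cut=500):
+#          return df[df["TLDs"] <= cut], df[df["TLDs"] > cut]
+#  the inline form is kept so the cut value stays easy to tweak.)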
+
+high_lt500t = ns_high[ns_high["TLDs"] <= 500]
+high_gt500t = ns_high[ns_high["TLDs"] > 500]
+save_selected_stats(out_file, high_lt500t, tracked,"high_lt500t")
+save_selected_stats(out_file, high_gt500t, tracked,"high_gt500t")
+plot_and_explore(high_lt500t, plot_dir, "high_lt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(high_gt500t, plot_dir, "high_gt500t", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+# In the "mid" group, the pictures are murky. There seems to be
+# a separation between resolvers with more than 1 million
+# queries and the others. (or is it 100K?)
+
+mid_lt1Mq = ns_mid[ns_mid["queries"] <= 1000000]
+mid_gt1Mq = ns_mid[ns_mid["queries"] > 1000000]
+
+save_selected_stats(out_file, mid_lt1Mq, tracked,"mid_lt1Mq")
+save_selected_stats(out_file, mid_gt1Mq, tracked,"mid_gt1Mq")
+plot_and_explore(mid_lt1Mq, plot_dir, "mid_lt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+plot_and_explore(mid_gt1Mq, plot_dir, "mid_gt1Mq", 'queries', [ 'r_arpa', 'l_tld', 'l_com', 'r_COM'], lx=True, ly=False)
+
+if out_file != sys.stdout:
+    out_file.close()
\ No newline at end of file
diff --git a/imrs/imrs_classifier3.py b/imrs/imrs_classifier3.py
new file mode 100644
index 0000000..b9e6ecb
--- /dev/null
+++ b/imrs/imrs_classifier3.py
@@ -0,0 +1,77 @@
+#
+# Exploration of the ipstats file for each network
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+import imrs
+from imrs import parse_imrs_volume_only, apnic_record
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.linear_model import LinearRegression
+import random
+import imrs_pandas
+from imrs_pandas import print_stats, save_stats, save_selected_stats, \
+    plot_or_save, plot_and_explore, example_and_count, \
+    print_names, print_mean
+
+
+# main
+if len(sys.argv) != 2 and len(sys.argv) != 3:
+    for x in range(0, len(sys.argv)):
+        print(str(x) + ":" + sys.argv[x])
+    print("Usage: imrs_classifier3.py <imrs_file> [<plot_dir>]")
+    exit(1)
+imrs_file = sys.argv[1]
+plot_dir = "-"
+out_file = sys.stdout
+if len(sys.argv) == 3:
+    plot_dir = sys.argv[2]
+    csv_path = join(plot_dir, "stats-v3.csv")
+    out_file = open(csv_path, "w")
+    out_file.write("frame, property, count, mean, std, min, c25%, c50%, c75%, max\n")
+
+
+full_df = imrs_pandas.load_imrs_to_frame(imrs_file)
+print("Loaded full")
+
+imrs_pandas.imrs_corrections(full_df)
+print("Applied corrections")
+explored = [ 'r_arpa', 'l_tld', 'l_com', 'r_COM', 'r_no_such' ]
+
+tracked = [ "network", "queries", "r_no_such", "h_count", "d_count", "r_arpa", "r_COM", "l10_q", "l_com", "l_tld", "l_sld", "l10_a", "APNIC", "COM" ]
+save_selected_stats(out_file, full_df, tracked, "full_df")
+plot_and_explore(full_df, plot_dir, "full", 'queries', explored, lx=True, ly=False)
+
+# First, isolate the "small" nodes, defined
+# as sending no more than 100 queries.
+small_df = full_df[full_df["queries"] <= 100]
+big_df = full_df[full_df["queries"] > 100]
+
+save_selected_stats(out_file, small_df, tracked,"small")
+save_selected_stats(out_file, big_df, tracked,"big")
+plot_and_explore(small_df, plot_dir, "small", 'queries', explored, lx=True, ly=False)
+plot_and_explore(big_df, plot_dir, "big", 'queries', explored, lx=True, ly=False)
+
+# then, create two subsets of the big sites:
+# ns_medium: < 100K queries
+# ns_large: >= 100K queries
+
+ns_medium = big_df[big_df['queries'] < 100000 ]
+ns_large = big_df[big_df['queries'] >= 100000 ]
+
+save_selected_stats(out_file, ns_medium, tracked,"medium")
+save_selected_stats(out_file, ns_large, tracked,"large")
+plot_and_explore(ns_medium, plot_dir, "medium", 'queries', explored, lx=True, ly=False)
+plot_and_explore(ns_large, plot_dir, "large", 'queries', explored, lx=True, ly=False)
+
+if out_file != sys.stdout:
+    out_file.close()
\ No newline at end of file
diff --git a/imrs/imrs_pandas.py b/imrs/imrs_pandas.py
new file mode 100644
index 0000000..4462e62
--- /dev/null
+++ b/imrs/imrs_pandas.py
@@ -0,0 +1,216 @@
+#
+# Common functions for parsing the imrs CSV files
+# in pandas format.
+#
+
+import sys
+import traceback
+import random
+import time
+import concurrent.futures
+import math
+import os
+from os import listdir
+from os.path import isfile, isdir, join
+import imrs
+from imrs import parse_imrs_volume_only, apnic_record
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.linear_model import LinearRegression
+import random
+
+def file_has_header(imrs_file):
+    has_header = False
+    # get the first line
+    with open(imrs_file, "r") as f:
+        line = f.readline()
+    if len(line) > 0:
+        parts = line.split(",")
+        if len(parts) > 1:
+            try:
+                queries = int(parts[1])
+                print("No header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
+                has_header = False
+            except ValueError:
+                print("Header for " + imrs_file + ": " + parts[0] + ", " + parts[1] + ", ...")
+                has_header = True
+    return has_header
+
+def load_imrs_to_frame(imrs_file):
+    if file_has_header(imrs_file):
+        df = pd.read_csv(imrs_file)
+    else:
+        df = pd.read_csv(imrs_file, header=None, names=imrs.imrs_headers, dtype={"network": str}, index_col = False)
+    return df
+
+def protected_ratio(v, d):
+    r = 0
+    if d > 0:
+        r = v/d
+    return r
+
+def protected_count(x, r, list):
+    s = r
+    if r > 0:
+        mx = 0
+        for k in list:
+            if x[k] > mx:
+                mx = x[k]
+        s = r*mx
+    count = 0
+    for k in list:
+        if x[k] > s:
+            count += 1
+    return count
+
+def reset_d31(x, list):
+    s = 0
+    for k in list:
+        s += x[k]
+    d31 = x["queries"] - s
+    if d31 < 0:
+        d31 = 0
+    return d31
+
+def compute_nb_tlds(x):
+    tld_count = 0
+    for tld in [ "COM", "NET", "ORG", "INFO", "CN", "IN", "DE", "US" ]:
+        if x[tld] > 0:
+            tld_count += 1
+    tld_count += int(x["TLDs"])
+    return tld_count
+
+def compute_nb_slds(x):
+    sld_count = 0
+    for sld in [ "RESOLVER", "EC2", "CLOUD", "WPAD", "CORP", "MAIL", "_TCP", "PROD" ]:
+        if x[sld] > 0:
+            sld_count += 1
+    sld_count += int(x["SLDs"])
+    if sld_count < 1:
+        sld_count = 1
+    return sld_count
+
+def save_stats(f, x_df, name):
+    x_des = x_df.describe()
+    # print(x_des.transpose())
+    for h in x_des:
+        c = x_des[h]
+        l = c.transpose()
+        f.write(name + ", " + str(h))
+        for w in l:
+            f.write(", " + str(w))
+        f.write("\n")
+
+def save_selected_stats(out_file, df, selection, name):
+    df_view = df[selection]
+    save_stats(out_file, df_view, name)
+
+def print_stats(x_df, name):
+    print(name)
+    x_des = x_df.describe()
+    print(x_des.transpose())
+    if 'queries' in x_df.columns:
+        print("Queries " + str(x_df['queries'].sum()))
+    if 'APNIC' in x_df.columns:
+        dfa = x_df[x_df['APNIC']>0]
+        print("APNIC " + str(dfa['APNIC'].sum()) + ", " + str(dfa['queries'].sum()) + ", " + str(dfa['APNIC'].count()))
+    # x_cor = x_df.corr()
+    # print(x_cor)
+
+
+def plot_or_save(plot_dir, image_name):
+    if plot_dir == "-":
+        plt.show()
+    else:
+        image_path = join(plot_dir, image_name)
+        plt.savefig(image_path)
+        plt.close(fig="all")
+
+def plot_and_explore(df, plot_dir, name, x_key, y_keys, lx=False, ly=False):
+    # overlay networks without APNIC samples (orange) on networks
+    # with APNIC samples (blue)
+    dfa = df[ df['APNIC'] == 0 ]
+    dfo = df[ df['APNIC'] != 0 ]
+    for y_key in y_keys:
+        axa = dfo.plot.scatter(x=x_key, y=y_key, alpha=0.5, logx=lx, logy=ly, color="blue")
+        dfa.plot.scatter(ax=axa, x=x_key, y=y_key, alpha=0.5, color="orange")
+        plot_or_save(plot_dir, name +"-"+x_key+"-"+y_key+".jpg")
+
+def example_and_count(df, name):
+    count = df.shape[0]
+    all_rows = df.shape[1]
+    queries = 0
+    network = ""
+    # sample up to 13 rows; df.sample raises if asked for more rows
+    # than the frame contains
+    sample = df.sample(min(13, count))
+    nb_rows = sample.shape[0]
+    # print(name + ": samples = " + str(nb_rows) + ", out of " + str(all_rows))
+    sdp = sample[["network", "queries"]]
+    sdp_np = sdp.to_numpy()
+    # print("Sample shape: " + str(sdp.shape))
+    # print("Sdp_np shape: " + str(np.shape(sdp_np)))
+    for i in range(np.shape(sdp_np)[0]):
+        if sdp_np[i,1] > queries:
+            queries = sdp_np[i,1]
+            network = str(sdp_np[i,0])
+    print(name + ": count=" + str(count) + ", network=" + network)
+    return count, network
+
+def print_names(df):
+    s = ""
+    for n in list(df):
+        if n != "network":
+            s += n + ", "
+    print(s)
+
+def print_mean(df, name, n_list):
+    # note: this reports the median of each column; the median is less
+    # sensitive to outliers than the mean
+    s = name + ", "
+    for n in n_list:
+        if n != "network":
+            s += str(df[n].median()) + ", "
+    print(s)
+
+
+def imrs_corrections(full_df):
+    # apply corrections for the day overflow bug:
+    # ignore d00, it is always 0
+    # compute d31 = queries - sum(d01..d30)
+    # compute arpa = arpa0 - d31
+    days = [
+        "d01", "d02", "d03", "d04", "d05", "d06", "d07", "d08", "d09", "d10", \
+        "d11", "d12", "d13", "d14", "d15", "d16", "d17", "d18", "d19", "d20", \
+        "d21", "d22", "d23", "d24", "d25", "d26", "d27", "d28", "d29", "d30", \
+        "d31" ]
+    full_df["d31"] = full_df.apply(lambda x: reset_d31(x, days[:-1]), axis=1)
+    full_df["arpa"] = full_df["arpa0"] - full_df["d31"]
+
+    print("Computed corrections")
+    # compute the good column
+    full_df["good"] = full_df.apply(lambda x: x["queries"] - x["no_such"], axis=1)
+    # compute the ratio of good over APNIC
+    full_df["r_good_apnic"] = full_df.apply(lambda x: protected_ratio(x["good"], x["APNIC"]), axis=1)
+
+    # compute log10 columns of queries and apnic
+    full_df["l10_q"] = np.log10(full_df["queries"])
+    full_df["l10_a"] = np.log10(2*full_df["APNIC"] + 1)
+    full_df["l10_g"] = np.log10(2*full_df["good"] + 1)
+    full_df["l_tld"] = np.log10(2*full_df["TLDs"] + 1)
+    full_df["l_sld"] = np.log10(2*full_df["SLDs"] + 1)
+    full_df["l_com"] = np.log10(2*full_df["COM"] + 1)
+    # add columns for ratios
+    for d in [ "no_such", "AAAA", "NS", "PTR", "NSEC", "SOA", "APNIC" ]:
+        r_d = "r_" + d
+        full_df[r_d] = full_df[d] / full_df["queries"]
+
+    full_df["r_arpa"] = full_df["arpa"] / (2*full_df["queries"])
+
+    full_df["r_COM"] = full_df.apply(lambda x: protected_ratio(x["COM"], x["queries"] - x["no_such"]), axis=1)
+    full_df["r_INFO"] = full_df.apply(lambda x: protected_ratio(x["INFO"], x["queries"] - x["no_such"]), axis=1)
+    print("Computed ratios")
+
+    hours = ["h00", "h01", "h02", "h03", "h04", "h05", "h06", "h07", "h08", "h09", \
+        "h10", "h11", "h12", "h13", "h14", "h15", "h16", "h17", "h18", "h19", \
+        "h20", "h21", "h22", "h23" ]
+
+    full_df["h_count"] = full_df.apply(lambda x: protected_count(x, 0, hours), axis=1)
+    full_df["d_count"] = full_df.apply(lambda x: protected_count(x, 0, days), axis=1)
+    print("Computed hours")
\ No newline at end of file