# Patch metadata:
#   From adf1490fc52c82c631e29922670d3c0765b73551 Mon Sep 17 00:00:00 2001
#   From: Christian Huitema
#   Date: Mon, 15 Apr 2024 12:17:26 -0700
#   Subject: [PATCH] Save sampler script
#
#   imrs/imrs_sampler.py | 56 +++++
#   src/imrs_total.py    | 86 +++++
#   2 files changed, 142 insertions(+)
#   create mode 100644 imrs/imrs_sampler.py
#   create mode 100644 src/imrs_total.py
#
# ---- imrs/imrs_sampler.py (new file mode 100644, index 0000000..a600914) ----
#
# This script will try to build a sample of the input file.
# The purpose of the sample is to get a realistic test file
# that is small enough for iterative development, measures,
# etc., yet big enough to obtain statistically significant
# results.
#
# Usage: imrs_sampler.py <input_file> <sampling_rate%> <output_file>
#

import sys
import traceback
import random
import time
import concurrent.futures
import os
from os import listdir
from os.path import isfile, isdir, join


def sample_file(file_name_in, file_name_out, sampling_rate):
    """Copy a random subset of the lines of one text file into another.

    Each input line is written to the output independently, when
    random.random() is below `sampling_rate`; a rate of 1.0 therefore
    copies every line and a rate of 0.0 copies none.

    Args:
        file_name_in: path of the text file to read.
        file_name_out: path of the text file to (over)write.
        sampling_rate: fraction of lines to keep, e.g. 0.05 for 5%.

    Returns:
        A tuple (nb_lines_in, nb_lines_out).

    Raises:
        OSError: if either file cannot be opened.
    """
    nb_lines_in = 0
    nb_lines_out = 0
    # Both files are managed by the `with` statement so that they are
    # closed even if reading or writing fails part-way through.
    with open(file_name_in, "rt") as f_in, open(file_name_out, "wt") as f_out:
        for line in f_in:
            nb_lines_in += 1
            if random.random() < sampling_rate:
                f_out.write(line)
                nb_lines_out += 1
    return nb_lines_in, nb_lines_out


def main(argv):
    """Run the sampler from a command-line style argument list.

    Args:
        argv: [program, input_file, rate, output_file], where rate is a
            percentage written with a trailing "%", such as "5%" or "0.1%".

    Returns:
        0 on success, 1 if the arguments are missing or the rate is invalid.
    """
    if len(argv) != 4:
        print("Usage: imrs_sampler.py <input_file> <sampling_rate%> <output_file>")
        return 1
    file_name_in = argv[1]
    rate_text = argv[2]
    file_name_out = argv[3]

    if not rate_text.endswith("%"):
        print("sampling rate should be e.g. 5%, 0.1%, not " + rate_text)
        return 1
    try:
        sampling_rate = float(rate_text[:-1]) / 100.0
    except ValueError as e:
        traceback.print_exc()
        # Report the text that failed to parse; no numeric value exists here.
        print("Cannot parse <" + rate_text + ">\nException: " + str(e))
        return 1

    nb_lines_in, nb_lines_out = sample_file(
        file_name_in, file_name_out, sampling_rate)

    if nb_lines_in == 0:
        # Nothing to report, and the actual rate would divide by zero.
        print("Input file " + file_name_in + " is empty.")
    else:
        print("Input file " + file_name_in + ": " + str(nb_lines_in) + " lines")
        print("Output file " + file_name_out + ": " + str(nb_lines_out) + " lines")
        print("Sampling rate requested: " + str(sampling_rate))
        print("Sampling rate actual: " + str(nb_lines_out / nb_lines_in))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

# ---- src/imrs_total.py (new file mode 100644, index 0000000..8ac88cf) ----
#!/usr/bin/python
# coding=utf-8
#
# This script organizes the sum of IMRS resolver data per
# cluster for the month. The results are collected in
# the folder ~/ipstats/cluster, with one subfolder
# per cluster, using names like ~/ipstats/cluster/us-lax/.
# For each cluster, the script computes a single file
# such as ~/ipstats/cluster/us-lax.202403.csv, containing
# the aggregated statistics for the whole month.
# NOTE(review): the code of this script reads <ipstats_folder>/monthly and
# writes total-<month>-ipstats.csv; this header may describe a sibling
# per-cluster script -- confirm and update.
#

import sys
import traceback
import random
import time
import concurrent.futures
import os
from os import listdir
from os.path import isfile, isdir, join


def check_or_create_dir(dir_path):
    """Make sure `dir_path` is a directory, creating it if needed.

    Only the last path component is created (os.mkdir), so the parent
    directory must already exist.

    Returns:
        True if the directory exists or was created, False if creation
        failed (the error is printed, not raised).
    """
    if not isdir(dir_path):
        try:
            os.mkdir(dir_path)
        except OSError as e:
            traceback.print_exc()
            print("Cannot create <" + dir_path + ">\nException: " + str(e))
            return False
    return True


def is_monthly_file(file_name, month):
    """Tell whether `file_name` is a per-cluster monthly file for `month`.

    Expected shape: cluster_id + "." + month + "-ipstats.csv", with a
    cluster_id such as "us-lax": a "-" at index 2 and the "." at index 6.
    """
    return (
        len(file_name) > 7
        and file_name[2] == "-"
        and file_name[6] == "."
        and file_name.endswith(month + "-" + "ipstats.csv")
    )


def main(argv):
    """Merge the per-cluster monthly files of one month into a total file.

    The paths of the matching files found in <ipstats_folder>/monthly are
    written, one per line, to <ipstats_folder>/tmp/<month>.txt, and the
    command "<ithitool> -I <ipstats_folder>/total-<month>-ipstats.csv
    <list file>" is run through os.system.

    Args:
        argv: [program, ipstats_folder, month, ithitool] optionally
            followed by the literal "debug" to get verbose output.

    Returns:
        1 if the arguments are invalid, 0 otherwise. Failures after
        argument validation are reported on stdout but do not change the
        exit status.
    """
    if len(argv) < 4 or len(argv) > 5 or \
            (len(argv) == 5 and argv[4] != "debug"):
        print("Usage: imrs_total.py <ipstats_folder> <month> <ithitool> [\"debug\"]")
        print("There are just " + str(len(argv)) + " arguments.")
        return 1
    ipstats_folder = argv[1]
    month = argv[2]
    ithitool = argv[3]
    do_debug = len(argv) == 5

    print("Writing monthly per custom clusters aggregates for: " + ipstats_folder)
    try:
        # Look at every file under the "monthly" folder
        monthly_folder = join(ipstats_folder, "monthly")
        tmp_folder = join(ipstats_folder, "tmp")
        if check_or_create_dir(monthly_folder) and \
                check_or_create_dir(tmp_folder):
            # List only after the folder is known to exist.
            monthly_list = listdir(monthly_folder)
            tmp_file_name = join(tmp_folder, month + ".txt")
            with open(tmp_file_name, "wt") as F:
                for monthly_file in monthly_list:
                    monthly_path = join(monthly_folder, monthly_file)
                    # check that this is a cluster file, and not some other file
                    if is_monthly_file(monthly_file, month):
                        F.write(monthly_path + "\n")
                        if do_debug:
                            print("Adding: " + monthly_file)
                    elif do_debug:
                        print("Not a monthly file: " + monthly_path)
            # The list file is closed (and flushed) before the tool reads it.
            total_file = "total-" + month + "-" + "ipstats.csv"
            total_path = join(ipstats_folder, total_file)
            merge_cmd = ithitool + ' -I ' + total_path + " " + tmp_file_name
            if do_debug:
                print("Running: " + merge_cmd)
                sys.stdout.flush()
            cmd_ret = os.system(merge_cmd)
            if cmd_ret == 0:
                if do_debug:
                    print(total_file + ": computed.")
            else:
                print(total_file + ": computation failed, error:" + str(cmd_ret))
    except Exception as exc:
        # Top-level boundary: report anything unexpected instead of crashing.
        traceback.print_exc()
        print('\nCode generated an exception: %s' % (exc))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))