Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Statistics by Instance and by Cluster #244

Merged
merged 35 commits into from
Apr 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
9db61ce
Compute CFD of traffic
huitema Apr 16, 2024
e141e43
Fix cfd computation
huitema Apr 16, 2024
4c5ee25
Add linal new line.
huitema Apr 16, 2024
5569a15
Add list extraction script
huitema Apr 19, 2024
1ce2dce
Fix order
huitema Apr 19, 2024
36218bc
Fix usage
huitema Apr 19, 2024
15aada5
Compute monthly instances summary
huitema Apr 20, 2024
452f2c2
fix typo
huitema Apr 20, 2024
d89b1b7
One more typo
huitema Apr 20, 2024
f5f7fc0
typo in prepare_instances_list
huitema Apr 20, 2024
f1a234f
Slightly better list of instances
huitema Apr 20, 2024
be281b9
fix typo is_dir
huitema Apr 20, 2024
ae96f9a
folder variable to instance_folder
huitema Apr 20, 2024
87e69e1
fix instance-id typo
huitema Apr 20, 2024
1d8e9db
another fix
huitema Apr 20, 2024
32aa60a
fix tmp_path
huitema Apr 20, 2024
181a752
Add monthly IP count script
huitema Apr 21, 2024
72539c5
Close parenthesis
huitema Apr 21, 2024
9b71d23
Revise parameters parsing
huitema Apr 21, 2024
8139e33
Fix typo
huitema Apr 21, 2024
401ea44
Fix arg index
huitema Apr 21, 2024
41a1f89
fix open call
huitema Apr 21, 2024
77f2dca
Add test of input line
huitema Apr 21, 2024
5af35da
Fix last output line
huitema Apr 21, 2024
3d794b5
propertly open data files
huitema Apr 21, 2024
195616b
Add progress reports
huitema Apr 21, 2024
e8c4dbd
Better logs
huitema Apr 21, 2024
6ddda9a
compute aggregated ip count.
huitema Apr 21, 2024
7fab24f
Check extraction of cluster ID
huitema Apr 21, 2024
7fffdee
debug instance parsing
huitema Apr 21, 2024
f71cbf6
More cluster parsing fixes.
huitema Apr 21, 2024
b46628f
fix ip aggregate counting
huitema Apr 21, 2024
08c1c2c
Debugging file names
huitema Apr 21, 2024
e69524c
Missing parenthesis
huitema Apr 21, 2024
354634e
remove debugging print
huitema Apr 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 99 additions & 0 deletions imrs/imrs_apnic_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#
# This script cross-references an APNIC file with an IMRS file.
# It loads the APNIC records (IP address in the first column, use
# count in the fourth column), then reads the IMRS file (IP address
# in the first column, count in the second column) and writes one
# line per APNIC address that also appears in the IMRS file, with
# the APNIC use count and the IMRS count.
#
# Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join

class imrs_apnic_item:
    """One CSV row pairing an IP address with its APNIC and IMRS use counts.

    Attributes are stored exactly as passed to the constructor; the counts
    are converted with str() only when the row is rendered by text().
    """

    def __init__(self, ip, apnic_use, imrs_use):
        self.ip = ip
        self.apnic_use = apnic_use
        self.imrs_use = imrs_use

    @staticmethod
    def head():
        """Return the CSV header (no newline) matching the columns of text().

        Declared static so that it works both as imrs_apnic_item.head()
        and when called on an instance.
        """
        return "IP, apnic_use, imrs_use,"

    def text(self):
        """Return this item as one CSV line (no newline, trailing comma)."""
        # The fields live on the instance; the bare names used previously
        # raised NameError.
        return self.ip + "," + str(self.apnic_use) + "," + str(self.imrs_use) + ","

class apnic_record:
    """One record of the APNIC file, plus the matching IMRS observation.

    ip and use_count are filled by parse(); seen_in_imrs and imrs_count
    start as False / 0 and are meant to be updated by the caller.
    """

    def __init__(self):
        self.ip = ""
        self.use_count = 0
        self.seen_in_imrs = False
        self.imrs_count = 0

    def parse(self, line):
        """Parse one comma-separated APNIC line.

        The IP address is taken from the first field and the use count
        from the fourth field.

        Returns True if both fields were extracted. Returns False if the
        line has fewer than four fields (e.g. a blank line) or if the use
        count is not an integer; in the latter case the error is reported
        on stdout/stderr. The record is left unchanged on failure.
        """
        parts = line.split(",")
        if len(parts) < 4:
            # Too short to hold a use count: reject it explicitly so that
            # the caller never stores a record with an empty IP.
            return False
        try:
            use_count = int(parts[3].strip())
        except ValueError as e:
            traceback.print_exc()
            print("Cannot parse APNIC Record:\n" + line.strip() + "\nException: " + str(e))
            return False
        self.ip = parts[0].strip()
        self.use_count = use_count
        return True

def parse_imrs(line):
    """Extract the IP address and count from one comma-separated IMRS line.

    The IP address is the first field and the count is the second field,
    both stripped of surrounding white space.

    Returns a tuple (ok, ip, count). On failure the error is reported,
    ok is False, and ip/count hold whatever was extracted before the
    failure ("" and 0 by default).
    """
    ip = ""
    count = 0
    ok = False
    try:
        fields = line.split(",")
        ip = fields[0].strip()
        count = int(fields[1].strip())
    except Exception as e:
        traceback.print_exc()
        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
    else:
        ok = True
    return ok, ip, count


# main

# Expect exactly three arguments after the script name.
if len(sys.argv) != 4:
    print("Usage: imrs_apnic_list.py <imrs_file> <apnic_file> <output_file>")
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    sys.exit(1)
imrs_file = sys.argv[1]
apnic_file = sys.argv[2]
output_file = sys.argv[3]

# APNIC records indexed by IP address.
apnic_dict = dict()

# Load every APNIC record that parses correctly. The "with" statement
# closes the input file as soon as it has been read.
with open(apnic_file, "r") as apnic_input:
    for line in apnic_input:
        apnic = apnic_record()
        if apnic.parse(line):
            apnic_dict[apnic.ip] = apnic

# Mark the APNIC records whose address also appears in the IMRS file,
# and remember the count found there.
with open(imrs_file, "r") as imrs_input:
    for line in imrs_input:
        ok, ip, count = parse_imrs(line)
        if ok and ip in apnic_dict:
            apnic_dict[ip].seen_in_imrs = True
            apnic_dict[ip].imrs_count = count

# Write one line per address present in both files.
with open(output_file, "w") as F:
    F.write("IP, apnic_use, imrs_use,\n")
    for apnic_entry in apnic_dict.values():
        if apnic_entry.seen_in_imrs:
            F.write(apnic_entry.ip + "," + str(apnic_entry.use_count) + "," + str(apnic_entry.imrs_count) + "\n")
78 changes: 78 additions & 0 deletions imrs/imrs_frequency.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
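#
# This script computes the cumulative distribution of the query
# load found in an IMRS file. The per-address counts (second column
# of the input) are sorted from largest to smallest, and the output
# lists, for a growing number of addresses, the cumulative number of
# queries and the fraction of the total load that they represent.
# The optional load step (e.g. 1%, 0.1%) controls how often a line
# is written; without it, one line is written per address.
#
# Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]
#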

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join

def parse_imrs(line):
    """Extract the IP address and count from one comma-separated IMRS line.

    The IP address is the first field and the count is the second field,
    both stripped of surrounding white space.

    Returns a tuple (ok, ip, count). On failure the error is reported,
    ok is False, and ip/count hold whatever was extracted before the
    failure ("" and 0 by default).
    """
    ip = ""
    count = 0
    ok = False
    try:
        fields = line.split(",")
        ip = fields[0].strip()
        count = int(fields[1].strip())
    except Exception as e:
        traceback.print_exc()
        print("Cannot parse IMRS Record:\n" + line.strip() + "\nException: " + str(e))
    else:
        ok = True
    return ok, ip, count


# main

def _cfd_row(count, use, total):
    # Format one output line: number of addresses, cumulative queries,
    # and fraction of the total load. A zero total (every count is 0)
    # yields a fraction of 0.0 instead of a division by zero.
    frequency = use/total if total != 0 else 0.0
    return str(count) + "," + str(use) + "," + str(frequency) + ",\n"


if len(sys.argv) < 3 or len(sys.argv) > 4:
    print("Usage: imrs_frequency.py <imrs_file> <output_file> [load_step%]")
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    sys.exit(1)
imrs_file = sys.argv[1]
output_file = sys.argv[2]
# A load step of 0 means "write one line per address".
load_step = 0
if len(sys.argv) == 4:
    s_load_step = sys.argv[3]
    if not s_load_step.endswith("%"):
        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
        sys.exit(1)
    try:
        load_step = float(s_load_step[:-1])/100.0
    except ValueError:
        # Something like "abc%": report it like any other malformed step
        # instead of failing with a traceback.
        print("Load step should be %, e.g. 1%, 0.1%, not " + s_load_step)
        sys.exit(1)

# Per-address query counts, and their sum.
load_vec = []

total_load = 0
# The "with" statement closes the input file as soon as it has been read.
with open(imrs_file, "r") as imrs_input:
    for line in imrs_input:
        ok, ip, use_count = parse_imrs(line)
        if ok:
            load_vec.append(use_count)
            total_load += use_count

# Heaviest sources first, so the output is a cumulative distribution.
load_vec.sort(reverse=True)

with open(output_file, "w") as F:
    cumulative_use = 0
    cumulative_count = 0
    delta_threshold = int(total_load*load_step)
    threshold = 0
    last_written = 0
    F.write("Count, Queries, frequency,\n")
    for use_count in load_vec:
        cumulative_count += 1
        cumulative_use += use_count
        # NOTE(review): the threshold advances by a single step per line
        # written, so after a very large entry several consecutive lines
        # are written until the threshold catches up - confirm that this
        # is the intended sampling.
        if cumulative_use >= threshold:
            F.write(_cfd_row(cumulative_count, cumulative_use, total_load))
            threshold += delta_threshold
            last_written = cumulative_count
    # Always finish with the line describing the complete data set.
    if last_written < cumulative_count:
        F.write(_cfd_row(cumulative_count, cumulative_use, total_load))
90 changes: 90 additions & 0 deletions imrs/imrs_record.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
#
# This module defines the classes and helper functions used to parse
# one line of an IMRS file: the IP address, followed by a series of
# comma-separated counters, vectors of counters, and hyperloglog
# summaries.
#

import sys
import traceback
import random
import time
import concurrent.futures
import math
import os
from os import listdir
from os.path import isfile, isdir, join


def imrs_parse_one_number(parts, parsed):
    """Parse the integer found at index `parsed` of the list `parts`.

    Returns a tuple (value, next_index), where next_index is parsed + 1.
    Raises IndexError if `parsed` is past the end of `parts`, and
    ValueError if the field is not an integer.
    """
    # Use the stripped field for the conversion; previously the stripped
    # copy was computed and then ignored.
    value = int(parts[parsed].strip())
    return value, parsed + 1

def imrs_parse_one_vector(parts, parsed, v):
    """Fill the list `v` in place with consecutive integers from `parts`.

    Reads len(v) fields starting at index `parsed`, one per element of
    `v`, and returns the index of the first field not consumed. Any
    exception raised while parsing a field propagates to the caller;
    elements already filled keep their new value.
    """
    for slot in range(len(v)):
        v[slot], parsed = imrs_parse_one_number(parts, parsed)
    return parsed

class imrs_hyperloglog:
    """Hyperloglog summary read from an IMRS record.

    E is a floating point value and hllv is a fixed-size vector; both
    start at zero and are filled by parse().
    """

    def __init__(self):
        self.E = 0.0
        self.hllv = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0]

    def parse(self, parts, parsed):
        """Read E, then one number per element of hllv, from `parts`.

        Parsing starts at index `parsed`. E is read as a float; the
        elements of hllv are read with imrs_parse_one_number. Returns
        the index of the first field not consumed.
        """
        self.E = float(parts[parsed].strip())
        cursor = parsed + 1
        for slot in range(len(self.hllv)):
            self.hllv[slot], cursor = imrs_parse_one_number(parts, cursor)
        return cursor

class imrs_record:
    """One line of an IMRS file, parsed into named counters and vectors.

    Every attribute starts at zero (or "" for the IP address) and is
    filled, in the order of the fields on the line, by parse_imrs().
    """

    def __init__(self):
        self.ip = ""
        self.query_volume = 0
        self.hourly_volume = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        self.daily_volume = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
        self.arpa_count = 0
        self.no_such_domain_queries = 0
        self.no_such_domain_reserved = 0
        self.no_such_domain_frequent = 0
        self.no_such_domain_chromioids = 0
        self.tld_counts = [0,0,0,0,0,0,0,0]
        self.tld_hyperlog = imrs_hyperloglog()
        self.sld_counts = [0,0,0,0,0,0,0,0]
        self.sld_hyperlog = imrs_hyperloglog()
        self.name_parts = [0,0,0,0,0,0,0,0]
        self.rr_types = [0,0,0,0,0,0,0,0]
        self.locales = [0,0,0,0,0,0,0,0]

    def parse_imrs(self, line):
        """Parse one comma-separated IMRS line into this record.

        Fields are consumed left to right: the IP address, the query
        volume, the hourly and daily vectors, the ARPA and no-such-domain
        counters, then the TLD counts and hyperloglog, the SLD counts and
        hyperloglog, and finally the name parts, RR types and locales
        vectors.

        Returns True if every field was parsed. On any error the problem
        is reported, with the number of fields consumed so far, and False
        is returned; attributes parsed before the error keep their new
        values.
        """
        ok = False
        # Defined before the try block so that the error message below can
        # always report it, even if the very first statement fails.
        parsed = 0
        try:
            parts = line.split(",")
            self.ip = parts[0].strip()
            parsed = 1
            self.query_volume, parsed = imrs_parse_one_number(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.hourly_volume)
            parsed = imrs_parse_one_vector(parts, parsed, self.daily_volume)
            self.arpa_count, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_queries, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_reserved, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_frequent, parsed = imrs_parse_one_number(parts, parsed)
            self.no_such_domain_chromioids, parsed = imrs_parse_one_number(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.tld_counts)
            # Fill the existing hyperloglog object. The constructor takes
            # no arguments, so calling it with (parts, parsed) always
            # raised TypeError and replaced the attribute.
            parsed = self.tld_hyperlog.parse(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.sld_counts)
            parsed = self.sld_hyperlog.parse(parts, parsed)
            parsed = imrs_parse_one_vector(parts, parsed, self.name_parts)
            parsed = imrs_parse_one_vector(parts, parsed, self.rr_types)
            parsed = imrs_parse_one_vector(parts, parsed, self.locales)
            ok = True
        except Exception as e:
            traceback.print_exc()
            print("Cannot parse IMRS Record after " + str(parsed) + " parts:\n" + line.strip() + "\nException: " + str(e))
        return ok
95 changes: 95 additions & 0 deletions src/imrs_instances.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
#!/usr/bin/python
# coding=utf-8
#
# This computes the monthly totals for all the instances found
# in the east and west folders. The raw data is organized as:
# - ipstats / west / one folder per instance / files per date
# / east / one folder per instance / files per date
# The first processing step is to collect the list of file names
# for each instance: one file per date, possibly more if the
# same instance is present in east and west.
#
#

import sys
import traceback
import random
import time
import concurrent.futures
import os
from os import listdir
from os.path import isfile, isdir, join

def prepare_instances_list(ipstats_folder, month):
    """Collect, for each instance, the paths of the files in its folders.

    Scans <ipstats_folder>/east and <ipstats_folder>/west in that order.
    Every sub-directory found there is treated as an instance, and the
    path of every entry inside it is appended to that instance's list;
    entries of the pole folders that are not directories are ignored.

    Returns a dict mapping instance id (the folder name) to a list of
    paths. An instance present in both poles gets the files of both.

    NOTE(review): `month` is accepted but not used here - every entry of
    each instance folder is listed; confirm whether filtering by month
    was intended.
    """
    files_by_instance = {}
    for pole in ("east", "west"):
        pole_dir = join(ipstats_folder, pole)
        for instance_id in listdir(pole_dir):
            instance_folder = join(pole_dir, instance_id)
            if not isdir(instance_folder):
                continue
            paths = files_by_instance.setdefault(instance_id, [])
            paths.extend(join(instance_folder, name) for name in listdir(instance_folder))
    return files_by_instance

def check_or_create_dir(dir_path):
    """Make sure that the directory `dir_path` exists.

    Returns True if the directory already exists or was created.
    If the creation fails, the error is reported and False is returned.
    Only the last path component is created (os.mkdir), so the parent
    directory must already exist.
    """
    if isdir(dir_path):
        return True
    try:
        os.mkdir(dir_path)
    except Exception as e:
        traceback.print_exc()
        print("Cannot create <" + dir_path + ">\nException: " + str(e))
        return False
    return True

def process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
    """Merge the files of one instance into a single monthly file.

    Writes the list of input files, one path per line, to
    <tmp_folder>/<instance_id>_<month>-file-list.txt, then runs
        <ithitool> -I <result_path> <list_path>
    where result_path is <result_folder>/<instance_id>_<month>-ipstats.csv.

    Parameters:
        instance_id: key of the instance in `instances`.
        month: string used in the names of the generated files.
        result_folder, tmp_folder: existing directories for the output
            file and for the file list.
        ithitool: path or name of the executable to run.
        instances: dict mapping instance id to a list of file paths.
        do_debug: if true, report successful computations too.

    Returns True if the tool exited with status 0. Otherwise the failure
    is reported and False is returned.
    """
    # Imported locally so that this function does not depend on the
    # imports at the top of the file.
    import subprocess

    result_file = instance_id + "_" + month + "-ipstats.csv"
    result_path = join(result_folder, result_file)
    tmp_file = instance_id + "_" + month + "-file-list.txt"
    tmp_path = join(tmp_folder, tmp_file)
    # The list must be written and closed before the tool reads it.
    with open(tmp_path, "wt") as F:
        for file_name in instances[instance_id]:
            F.write(file_name + "\n")
    # Pass the arguments as a list, without a shell: paths containing
    # spaces or shell metacharacters reach the tool unchanged, and the
    # value obtained is the exit status of the tool itself.
    try:
        cmd_ret = subprocess.call([ithitool, "-I", result_path, tmp_path])
    except OSError as e:
        # The tool could not be started (missing, not executable, ...).
        print(result_file + ": computation failed, error:" + str(e))
        return False
    if cmd_ret != 0:
        print(result_file + ": computation failed, error:" + str(cmd_ret))
        return False
    if do_debug:
        print(result_file + ": computed.")
    return True

# main
# Expect three arguments, optionally followed by the literal "debug".
if len(sys.argv) < 4 or len(sys.argv) > 5 or \
    (len(sys.argv) == 5 and sys.argv[4] != "debug"):
    print("Usage: imrs_instances <ipstats_folder> <yyyymm> <ithitool> [debug]")
    print("There are just " + str(len(sys.argv)) + " arguments.")
    # sys.exit is always available; the bare exit() helper is only
    # installed by the site module.
    sys.exit(1)
ipstats_folder = sys.argv[1]
month = sys.argv[2]
ithitool = sys.argv[3]
do_debug = len(sys.argv) == 5

print("Writing instance monthly files for: " + ipstats_folder)
try:
    instances = prepare_instances_list(ipstats_folder, month)
    result_folder = join(ipstats_folder, "instances")
    tmp_folder = join(ipstats_folder, "tmp")
    if not (check_or_create_dir(result_folder) and
            check_or_create_dir(tmp_folder)):
        # The failure was already reported; make it visible to the caller
        # through the exit status as well.
        sys.exit(1)
    for instance_id in instances:
        # Instances without any file have nothing to merge.
        if len(instances[instance_id]) > 0:
            if not process_instance(instance_id, month, result_folder, tmp_folder, ithitool, instances, do_debug):
                sys.exit(1)
except Exception as exc:
    # SystemExit is not an Exception, so the sys.exit calls above are not
    # intercepted here.
    traceback.print_exc()
    print('\nCode generated an exception: %s' % (exc))
    # An unexpected error must not look like a successful run.
    sys.exit(1)





Loading
Loading