import math
from functools import lru_cache

import gensim
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *  # kept from the original module; unused here
from sklearn.feature_extraction.text import CountVectorizer

import preprocessing_dict


@lru_cache(maxsize=10000)
def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, e.g. 'running' -> 'run'."""
    return WordNetLemmatizer().lemmatize(text, pos='v')


def preprocess(text):
    """Tokenize a raw log message and lemmatize every token.

    Tokens found in preprocessing_dict.specific_words are first expanded to
    their replacement text, which is then tokenized and lemmatized as well.
    """
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in preprocessing_dict.specific_words:
            result.append(lemmatize_stemming(token))
        else:
            for tok in gensim.utils.simple_preprocess(preprocessing_dict.specific_words[token]):
                result.append(lemmatize_stemming(tok))
    return result
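
# A quick sanity check of preprocess (a sketch; the mapping below is a
# hypothetical stand-in for preprocessing_dict.specific_words):
#
#   >>> # with specific_words = {'conn': 'connection'}
#   >>> preprocess('Conn refused by host')
#   ['connection', 'refuse', 'by', 'host']

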
def calculate_score(data, vectorizer, entropy):
    """Calculate the score for newly arriving data.

    Args:
        data (dict): dictionary with the format {LE name: dataframe}
        vectorizer (CountVectorizer): the fitted vectorizer
        entropy (dict): dictionary with the format {LE name: {term: entropy}}
    """
    score = dict()
    for key, val in data.items():
        score[key] = 0
        counts = calculate_term_vector(val, vectorizer).sum().to_dict()
        for term, count in counts.items():
            try:
                score[key] += (entropy[key][term] * math.log2(1 + count)) ** 2
            except KeyError:
                # Terms unseen during training carry no entropy weight
                pass
        score[key] = math.sqrt(score[key])
    return score
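
# Per logging entity, the score is the Euclidean norm of the entropy-weighted
# log-scaled term counts:
#
#   score_LE = sqrt( sum_t ( w_t * log2(1 + c_t) )^2 )
#
# where c_t is the total count of term t in the new data and w_t is its
# trained entropy weight.

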
def calculate_term_vector(data, vectorizer):
    """Calculate the term vector based on the data.

    Args:
        data (dataframe): frame with columns timestamp, message
        vectorizer (CountVectorizer): the fitted vectorizer
    """
    # Tokenize the messages once and cache the result on the frame
    if 'process' not in data.columns:
        data['process'] = data['message'].map(preprocess)
    matrix = vectorizer.transform(data['process'])
    # One row per document, one column per vocabulary term
    # (get_feature_names() was removed in scikit-learn 1.2; newer versions
    # use get_feature_names_out() instead)
    counts = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names()).copy()
    return counts
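
# Sketch of the resulting frame for two messages and a hypothetical
# three-term vocabulary:
#
#      connection  host  refuse
#   0           1     1       1
#   1           0     1       0

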
def preprocess_training_data(data):
    """Calculate the entropy (et) of each term for each logging entity (LE)
    in the training phase.

    Args:
        data (dict): dictionary with the format {LE name: dataframe}
    """
    result = dict()
    # Fit one shared vocabulary over the messages of all logging entities
    df = pd.concat([x for x in data.values()], ignore_index=True)
    df['process'] = df['message'].map(preprocess)
    # The identity analyzer makes CountVectorizer accept pre-tokenized input
    vectorizer = CountVectorizer(analyzer=lambda x: x)
    vectorizer.fit(df['process'])
    for key, val in data.items():
        et = preprocess_log_entities_data(val, vectorizer)
        result[key] = et
    return result, vectorizer
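
# End-to-end training sketch (hypothetical file names; the CSV files are
# assumed to provide 'timestamp' and 'message' columns):
#
#   frames = {name: pd.read_csv(path, parse_dates=['timestamp'])
#             for name, path in [('LE1', 'le1.csv'), ('LE2', 'le2.csv')]}
#   entropy, vectorizer = preprocess_training_data(frames)
#   scores = calculate_score(frames, vectorizer, entropy)

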
def calculate_entropy(x):
    """Compute x * log2(x), with the convention 0 * log2(0) = 0."""
    if x == 0:
        return 0
    else:
        return x * math.log2(x)


def preprocess_log_entities_data(le_data, vectorizer):
    """Calculate the entropy from the normative chunks in the database.

    Args:
        le_data (dataframe): the logging entity data, with columns timestamp, message
        vectorizer (CountVectorizer): the fitted vectorizer
    """
    counts = calculate_term_vector(le_data, vectorizer)
    counts['timestamp'] = le_data['timestamp'].values
    counts = counts.sort_values(by='timestamp')
    # Resample into 10-second chunks and sum the term occurrences per chunk
    time = counts['timestamp'].dt.to_period('10S')
    agg = counts.drop(columns='timestamp').groupby(time).sum()
    # Normalize each term column into a probability distribution over chunks
    agg_df = agg.div(agg.sum(axis=0), axis=1)
    # Calculate p * log2(p); zero probabilities yield NaN and are zeroed below
    # (equivalent to agg_df.applymap(calculate_entropy), but vectorized)
    agg_df = agg_df * np.log2(agg_df)
    agg_df.fillna(0, inplace=True)
    # Sum each column to get the total p * log2(p) of a term over the M
    # normative chunks, then divide by log2(M) and add 1 to obtain the
    # entropy weight
    entropy = 1 + agg_df.sum() / math.log2(len(agg_df))
    return entropy.to_dict()
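
# Worked example of the weight formula (a sketch): for a term spread uniformly
# over M = 4 chunks, p_m = 1/4 for every chunk, so
#
#   sum_m p_m * log2(p_m) = 4 * (1/4) * log2(1/4) = -2
#   weight = 1 + (-2) / log2(4) = 0
#
# Evenly spread terms thus get weight 0 (uninformative), while a term confined
# to a single chunk (p = 1) keeps the maximum weight of 1.

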
# if __name__ == '__main__':
# filepath = '/home/kien/SVTECH_CODE/log_template_SVTECH/data_without_template_per_host/ME_PR02.MAP063_RE0.csv'
# df = pd.read_csv(filepath)
# print(preprocess_training_data({'LE1': df}))