Fix: adjust re to handle regular expressions with Python 3.7 #53

Closed · wants to merge 2 commits

Changes from all commits
192 changes: 132 additions & 60 deletions logparser/Spell/Spell.py
@@ -10,16 +10,19 @@
import numpy as np
import pandas as pd
import hashlib
from sys import version_info
from datetime import datetime
import string
import pickle


class LCSObject:
""" Class object to store a log group with the same template
"""
def __init__(self, logTemplate='', logIDL=[]):
    def __init__(self, logTemplate='', logIDL=None, logParams=None):
        # None defaults avoid sharing one mutable list/dict across instances
        self.logTemplate = logTemplate
        self.logIDL = logIDL if logIDL is not None else []
        self.logParams = logParams if logParams is not None else {}


class Node:
@@ -44,16 +47,23 @@ class LogParser:
"""
def __init__(self, indir='./', outdir='./result/', log_format=None, tau=0.5, rex=[], keep_para=True):
self.path = indir
self.logName = None
self.logname = None
self.savePath = outdir
self.tau = tau
self.logformat = log_format
self.df_log = None
self.rex = rex
self.keep_para = keep_para

def JaccardSimilarity(self, seq1, seq2):
a = set(seq1)
b = set(seq2)
c = a.intersection(b)
        return float(len(c)) / (len(a) + len(b) - len(c))
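        # Illustrative check (editor's note, not part of the PR): for
        # seq1 = ['a', 'b', 'c'] and seq2 = ['b', 'c', 'd'], the intersection has
        # 2 tokens and the union 4, so the similarity is 2 / (3 + 3 - 2) = 0.5,
        # i.e. |A ∩ B| / |A ∪ B| over the token sets of the two sequences.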

def LCS(self, seq1, seq2):
lengths = [[0 for j in range(len(seq2)+1)] for i in range(len(seq1)+1)]

# row 0 and column 0 are initialized to 0 already
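        # Standard LCS recurrence filled in below (unchanged by this PR):
        # lengths[i+1][j+1] = lengths[i][j] + 1 when seq1[i] == seq2[j],
        # otherwise max(lengths[i+1][j], lengths[i][j+1]).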
for i in range(len(seq1)):
for j in range(len(seq2)):
@@ -81,8 +91,8 @@ def SimpleLoopMatch(self, logClustL, seq):
for logClust in logClustL:
if float(len(logClust.logTemplate)) < 0.5 * len(seq):
continue
# Check the template is a subsequence of seq (we use set checking as a proxy here for speedup since
# incorrect-ordering bad cases rarely occur in logs)
            # Check that the template is a subsequence of seq
            # (set membership is used as a proxy for speed, since incorrectly-ordered
            # bad cases rarely occur in logs)
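            # Editor's note: ordering is ignored by the proxy, so a template like
            # ['send', 'to', '<*>'] would "match" the sequence ['to', 'send', 'x']
            # even though it is not a true subsequence; Spell accepts these rare
            # false positives in exchange for O(1) membership tests.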
token_set = set(seq)
if all(token in token_set or token == '<*>' for token in logClust.logTemplate):
return logClust
@@ -103,23 +113,26 @@ def PrefixTreeMatch(self, parentn, seq, idx):

return retLogClust


def LCSMatch(self, logClustL, seq):
retLogClust = None

maxLen = -1
maxlcs = []
#maxlcs = []

maxClust = None
set_seq = set(seq)
size_seq = len(seq)

for logClust in logClustL:
set_template = set(logClust.logTemplate)
if len(set_seq & set_template) < 0.5 * size_seq:
continue
if self.JaccardSimilarity(seq, logClust.logTemplate) < self.tau:
continue
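            # Editor's gloss: the two checks above are cheap pre-filters that skip
            # clusters before the O(len(seq) * len(template)) LCS computation below;
            # the Jaccard test reuses self.tau as its similarity threshold.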
lcs = self.LCS(seq, logClust.logTemplate)
if len(lcs) > maxLen or (len(lcs) == maxLen and len(logClust.logTemplate) < len(maxClust.logTemplate)):
maxLen = len(lcs)
maxlcs = lcs
#maxlcs = lcs
maxClust = logClust

        # The LCS should be larger than tau * len(seq) for the match to count
@@ -128,15 +141,16 @@ def LCSMatch(self, logClustL, seq):

return retLogClust


def getTemplate(self, lcs, seq):
def getTemplate(self, lcs, seq, params):
retVal = []
if not lcs:
return retVal


# print("seq", seq)
# print("lcs", lcs)
if not lcs: return retVal
lcs = lcs[::-1]
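        # (the LCS is reversed so that lcs[-1] is always the next expected token
        # and can be popped in O(1) while seq is scanned)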
i = 0
for token in seq:
# print("ret", retVal)
i += 1
if token == lcs[-1]:
retVal.append(token)
@@ -167,7 +181,6 @@ def addSeqToPrefixTree(self, rootn, newCluster):
if parentn.logClust is None:
parentn.logClust = newCluster


def removeSeqFromPrefixTree(self, rootn, newCluster):
parentn = rootn
seq = newCluster.logTemplate
@@ -183,33 +196,41 @@ def removeSeqFromPrefixTree(self, rootn, newCluster):
matchedNode.templateNo -= 1
parentn = matchedNode


def outputResult(self, logClustL):
def outputResult(self, logClustL, rootNode):

templates = [0] * self.df_log.shape[0]
ids = [0] * self.df_log.shape[0]
df_event = []

eid = 0
for logclust in logClustL:
template_str = ' '.join(logclust.logTemplate)
eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
#eid = hashlib.md5(template_str.encode('utf-8')).hexdigest()[0:8]
eid += 1
for logid in logclust.logIDL:
templates[logid - 1] = template_str
ids[logid - 1] = eid
df_event.append([eid, template_str, len(logclust.logIDL)])

df_event = pd.DataFrame(df_event, columns=['EventId', 'EventTemplate', 'Occurrences'])
df_event = pd.DataFrame(df_event, columns=['Log Key', 'Message', 'Occurrences'])
        df_event = df_event.sort_values(by=['Occurrences'], ascending=False)

self.df_log['EventId'] = ids
self.df_log['EventTemplate'] = templates
self.df_log['Log Key'] = ids
self.df_log['Message'] = templates
if self.keep_para:
self.df_log["ParameterList"] = self.df_log.apply(self.get_parameter_list, axis=1)
self.df_log.to_csv(os.path.join(self.savePath, self.logname + '_structured.csv'), index=False)
df_event.to_csv(os.path.join(self.savePath, self.logname + '_templates.csv'), index=False)

        if not os.path.exists('Spell_result'):  # the hard-coded output dir must exist before writing
            os.makedirs('Spell_result')
        np.savetxt(r'Spell_result/np.txt', self.df_log['Log Key'].values, fmt='%d', newline=' ')
self.df_log.to_csv(os.path.join(self.savePath, 'logs_structured.csv'), index=False)
df_event.to_csv(os.path.join(self.savePath, 'logs_templates.csv'), index=False)

with open('Spell_result/LCSObject.plk', 'wb') as LCSObject_file:
pickle.dump(logClustL, LCSObject_file)
with open('Spell_result/Tree.plk', 'wb') as Tree_file:
pickle.dump(rootNode, Tree_file)
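        # These pickles pair with the commented-out block at the top of parse(),
        # which can reload the prefix tree and cluster list for incremental parsing.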

def printTree(self, node, dep):
pStr = ''
for i in xrange(dep):
for _ in range(dep):
pStr += '\t'

if node.token == '':
@@ -223,59 +244,67 @@ def printTree(self, node, dep):
for child in node.childD:
self.printTree(node.childD[child], dep + 1)


def LCSsearch(self, logCluL, constLogMessL, logmessageL, rootNode, idx):
matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, idx)

if matchCluster is None:
matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

if matchCluster is None:
matchCluster = self.LCSMatch(logCluL, logmessageL)

return matchCluster
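        # Editor's gloss: the cascade above runs from cheapest to most expensive,
        # exact prefix-tree lookup first, then the set-based loop match, then the
        # full LCS comparison, so most log lines never reach the LCS stage.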

def parse(self, logname):
starttime = datetime.now()
print('Parsing file: ' + os.path.join(self.path, logname))
self.logname = logname
self.load_data()
rootNode = Node()
logCluL = []


# with open('Spell_result/Tree.plk', 'rb') as Tree_file:
# rootNode = pickle.load(Tree_file)
# with open('Spell_result/LCSObject.plk', 'rb') as LCSObject_file:
# logCluL = pickle.load(LCSObject_file)

count = 0
for idx, line in self.df_log.iterrows():

for _, line in self.df_log.iterrows():
logID = line['LineId']
logmessageL = list(filter(lambda x: x != '', re.split(r'[\s=:,]', self.preprocess(line['Content']))))
logmessageL = list(filter(lambda x: x != '', re.split(r'[\s=:,()]', self.preprocess(line['Content']))))
constLogMessL = [w for w in logmessageL if w != '<*>']
matchCluster = self.LCSsearch(logCluL, constLogMessL, logmessageL, rootNode, 0)

#Find an existing matched log cluster
matchCluster = self.PrefixTreeMatch(rootNode, constLogMessL, 0)

# Match no existing log cluster
if matchCluster is None:
matchCluster = self.SimpleLoopMatch(logCluL, constLogMessL)

if matchCluster is None:
matchCluster = self.LCSMatch(logCluL, logmessageL)

# Match no existing log cluster
if matchCluster is None:
newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
logCluL.append(newCluster)
self.addSeqToPrefixTree(rootNode, newCluster)
#Add the new log message to the existing cluster
else:
newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate),
matchCluster.logTemplate)
if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
self.removeSeqFromPrefixTree(rootNode, matchCluster)
matchCluster.logTemplate = newTemplate
self.addSeqToPrefixTree(rootNode, matchCluster)
newCluster = LCSObject(logTemplate=logmessageL, logIDL=[logID])
logCluL.append(newCluster)
self.addSeqToPrefixTree(rootNode, newCluster)
#Add the new log message to the existing cluster
else:
newTemplate = self.getTemplate(self.LCS(logmessageL, matchCluster.logTemplate), matchCluster.logTemplate, matchCluster.logParams)
if ' '.join(newTemplate) != ' '.join(matchCluster.logTemplate):
self.removeSeqFromPrefixTree(rootNode, matchCluster)
matchCluster.logTemplate = newTemplate
self.addSeqToPrefixTree(rootNode, matchCluster)
if matchCluster:
matchCluster.logIDL.append(logID)
count += 1
if count % 1000 == 0 or count == len(self.df_log):
if count % 50000 == 0 or count == len(self.df_log):
print('Processed {0:.1f}% of log lines.'.format(count * 100.0 / len(self.df_log)))

if not os.path.exists(self.savePath):
os.makedirs(self.savePath)

self.outputResult(logCluL)
self.outputResult(logCluL, rootNode)
print('Parsing done. [Time taken: {!s}]'.format(datetime.now() - starttime))

def load_data(self):
headers, regex = self.generate_logformat_regex(self.logformat)
self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logname), regex, headers, self.logformat)

if isinstance(self.logname, list):
self.df_log = self.log_to_dataframe(self.logname, regex, headers, self.logformat)
else:
self.df_log = self.log_to_dataframe(os.path.join(self.path, self.logname), regex, headers, self.logformat)
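        # (when logname is a list of raw log lines, log_to_dataframe sorts it with
        # sort_logs() before building the dataframe; see below)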

def preprocess(self, line):
for currentRex in self.rex:
line = re.sub(currentRex, '<*>', line)
@@ -286,19 +315,41 @@ def log_to_dataframe(self, log_file, regex, headers, logformat):
"""
log_messages = []
linecount = 0
with open(log_file, 'r') as fin:
for line in fin.readlines():

if isinstance(log_file, list):
log_file = self.sort_logs(log_file)
for line in log_file:
line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line)
line = re.sub(' +', ' ', line)
try:
match = regex.search(line.strip())
message = [match.group(header) for header in headers]
log_messages.append(message)
linecount += 1
except Exception as e:
#except Exception as e:
                except Exception:  # skip lines that do not match the log format
pass
else:
with open(log_file, 'r') as fin:
for line in fin.readlines():
if ".pcap" not in line:
line = re.sub(r'[^\x00-\x7F]+', '<NASCII>', line)
line = re.sub(' +', ' ', line)
try:
match = regex.search(line.strip())
message = [match.group(header) for header in headers]

                            if len(message[-1].split()) > 18: continue  # skip unusually long messages (hard-coded cap of 18 content tokens)

log_messages.append(message)
linecount += 1
#except Exception as e:
                        except Exception:
pass
logdf = pd.DataFrame(log_messages, columns=headers)
logdf.insert(0, 'LineId', None)
logdf['LineId'] = [i + 1 for i in range(linecount)]

return logdf

def generate_logformat_regex(self, logformat):
@@ -309,7 +360,10 @@ def generate_logformat_regex(self, logformat):
regex = ''
for k in range(len(splitters)):
if k % 2 == 0:
splitter = re.sub(' +', '\s+', splitters[k])
                if version_info.major == 2:  # Python 2: unknown escapes pass through, so r'\s+' survives as a literal \s+
                    splitter = re.sub(' +', r'\s+', splitters[k])
                else:
                    # Python 3.7+ deprecates \s as a replacement escape (an error in
                    # newer versions), so collapse space runs to one literal space; this
                    # still matches because log lines are normalized with
                    # re.sub(' +', ' ', line) in log_to_dataframe
                    splitter = re.sub(r'\s+', ' ', splitters[k])
regex += splitter
else:
header = splitters[k].strip('<').strip('>')
@@ -319,13 +373,31 @@ def get_parameter_list(self, row):
return headers, regex
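        # Worked example (editor's illustration): log_format '<Date> <Time> <Content>'
        # yields headers ['Date', 'Time', 'Content'] and, on Python 3, the anchored
        # pattern ^(?P<Date>.*?) (?P<Time>.*?) (?P<Content>.*?)$ compiled by this method.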

def get_parameter_list(self, row):
template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["EventTemplate"])
template_regex = re.sub(r"\s<.{1,5}>\s", "<*>", row["Message"])
if "<*>" not in template_regex: return []

template_regex = re.sub(r'([^A-Za-z0-9])', r'\\\1', template_regex)
template_regex = re.sub(r'\\ +', r'[^A-Za-z0-9]+', template_regex)
template_regex = "^" + template_regex.replace("\<\*\>", "(.*?)") + "$"

parameter_list = re.findall(template_regex, row["Content"])
parameter_list = parameter_list[0] if parameter_list else ()
parameter_list = list(parameter_list) if isinstance(parameter_list, tuple) else [parameter_list]
parameter_list = [para.strip(string.punctuation).strip(' ') for para in parameter_list]

return parameter_list
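        # Editor's illustration: for Message 'Connect to <*>' and Content
        # 'Connect to 10.0.0.1', template_regex becomes
        # '^Connect[^A-Za-z0-9]+to[^A-Za-z0-9]+(.*?)$' and the method
        # returns ['10.0.0.1'].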

def sort_logs(self, log_file):
        dates = [log.split()[0] + " " + log.split()[1] + " " + log.split()[2] for log in log_file]
        messages = [" ".join(log.split()[3:]) for log in log_file]

        # Convert to a data frame
        df = pd.DataFrame(list(zip(dates, messages)), columns=['Date', 'Message'])

        # Sort logs by date (a lexicographic string sort, so this assumes a
        # zero-padded, most-significant-first timestamp format)
        df = df.sort_values(by='Date')

        # Re-merge each date and message into a single log line
        df = df['Date'].astype(str) + ' ' + df['Message']
        log_list = df.values.tolist()
        return log_list
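        # Editor's illustration: sort_logs(['Jan 2 10:00:00 b', 'Jan 1 09:00:00 a'])
        # keys on the first three whitespace-separated fields and returns
        # ['Jan 1 09:00:00 a', 'Jan 2 10:00:00 b'] (plain string comparison, see note above).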