-
Notifications
You must be signed in to change notification settings - Fork 2
/
example-anomaly.py
67 lines (48 loc) · 1.88 KB
/
example-anomaly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from skinfosec.models.anomaly.web import RequestAnomalyDetector
# This is using AWS via boto.
# It assumes your AWS credentials are provided in a boto config:
#SEE: http://boto.readthedocs.org/en/latest/boto_config_tut.html
#import boto3
#s3 = boto3.resource('s3')
#for bucket in s3.buckets.all():
# print(bucket.name)
from pyspark import SparkContext
from pyspark.sql import SQLContext, Row
from odo import odo
from apache_log_parser import make_parser
# Path of the access log to analyse; should be some file on your system.
logFile = "access.log"

# Spark context running against the "local" master, plus the SQL context
# built on top of it.
sc = SparkContext("local", "Example Anomaly Detection")
sql = SQLContext(sc)

# Read the log as an RDD of text lines and mark it for caching.
logData = sc.textFile(logFile).cache()

# Line parser for Apache's Common Log Format, whose field order is
# host (%h), logname (%l), user (%u), time, request, status, bytes.
# The previous format string had %u and %l swapped, which would label the
# two fields the wrong way round for a standard log.
# NOTE(review): assumes access.log uses the standard CLF layout; adjust the
# format string if the server is configured with a custom LogFormat.
parser = make_parser('%h %l %u %t \"%r\" %>s %b')
#logData = logData.map(lambda line: parser(line)["remote_host"])
# Keys of a parsed log record that parse() keeps; everything else is dropped.
cols = [
    "remote_host",
    "remote_user",
    "remote_logname",
    "time_received_tz_datetimeobj",
    "request_http_ver",
    "request_method",
    "request_url",
    "status",
    "response_bytes_clf",
]
# Note that parser(line) returns a dict. We do the following:
# - Keep only the keys listed in cols, and collect their values in a list.
# - Convert each value into a str.
# - Join the resulting strings (each followed by a comma) into a single line.
#logData = logData.map(lambda line: ''.join([str(e)+"," for e in [parser(line)[k] for k in cols]]))
def parse(line):
    """Parse one access-log line into a dict of string-valued fields.

    The line is run through the module-level ``parser``. Only the keys
    listed in the module-level ``cols`` are kept, and every kept value is
    converted with ``str``.

    Args:
        line: A single raw line of the access log.

    Returns:
        A dict mapping each kept field name to its value as a ``str``.
    """
    record = {}
    for field, value in parser(line).items():
        if field not in cols:
            continue
        # The original author notes that conversion to str is required here.
        record[field] = str(value)
    return record
# Parse every log line into a dict of string fields. ``parse`` is passed
# directly; wrapping it in a lambda added nothing.
data = logData.map(parse)
# Show the first three parsed records as a quick sanity check. The
# parenthesised form of print works on both Python 2 and Python 3.
print(data.take(3))
#from blaze import Table
#t = Table(data)
#print t.head(3)
#print type(data)
# Turn the RDD of parsed records into a DataFrame.
df = data.toDF()
# Mean length of the request URLs. The map goes through ``.rdd`` explicitly:
# DataFrame.map was removed in Spark 2.0, whereas ``.rdd.map`` behaves the
# same on Spark 1.x and keeps working on 2.x.
longitudes = df.select(df["request_url"]).rdd.map(lambda r: len(r[0])).mean()
# Parenthesised print works on both Python 2 and Python 3.
print(longitudes)
#print avg
#print df.printSchema()
#srdd = odo(logData, sql, dshape='var * {name: string, amount: float64}')
#X = preprocess_requests(content)
# Instantiate the request anomaly detector. It is only constructed here:
# the fit/predict calls that would use it are still commented out below.
ad = RequestAnomalyDetector()
#ad = ad.fit(X)
#anomalies = ad.predict(X)
#print anomalies["uri_length"].head()