-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate_synthetic_logs.py
116 lines (98 loc) · 4.6 KB
/
generate_synthetic_logs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import argparse
import random
from datetime import datetime, timedelta, timezone

import pandas as pd
# Maps every audit-log action name to the kind of resource it targets.
# Insertion order is significant: the log generator samples from the key list,
# so reordering entries changes the output produced for a given random seed.
ACTIONS = {
    # Project-scoped listings
    'list_all_buckets_of_project': 'project',
    'list_all_instances_in_project': 'project',
    # Storage access
    'list_all_objects_in_bucket': 'bucket',
    'read_object': 'data_object',
    # Compute and identity operations
    'log_on_to_instance': 'instance',
    'assume_service_account': 'service_account',
    'assign_ssh_key_to_instance': 'instance',
}
def generate_principals(num_users, num_service_accounts):
    """Create the names of the acting identities.

    Args:
        num_users: How many user names to create.
        num_service_accounts: How many service-account names to create.

    Returns:
        A ``(users, service_accounts)`` tuple of lists. Names are numbered
        from 1 (``user_1``, ``service_account_1``, ...); a count below 1
        produces an empty list.
    """
    user_numbers = range(1, num_users + 1)
    account_numbers = range(1, num_service_accounts + 1)
    return (
        [f'user_{n}' for n in user_numbers],
        [f'service_account_{n}' for n in account_numbers],
    )
def generate_resources(num_projects, num_buckets, num_data_objects, num_instances):
    """Create the names of the resources that log entries can refer to.

    Args:
        num_projects: How many project names to create.
        num_buckets: How many bucket names to create.
        num_data_objects: How many data-object names to create.
        num_instances: How many instance names to create.

    Returns:
        A ``(projects, buckets, data_objects, instances)`` tuple of lists.
        Names are numbered from 1 (``project_1``, ``bucket_1``, ...); a count
        below 1 produces an empty list.
    """
    def _ids(count):
        # 1-based identifiers: 1, 2, ..., count.
        return range(1, count + 1)

    return (
        [f'project_{n}' for n in _ids(num_projects)],
        [f'bucket_{n}' for n in _ids(num_buckets)],
        [f'data_object_{n}' for n in _ids(num_data_objects)],
        [f'instance_{n}' for n in _ids(num_instances)],
    )
def generate_random_timestamp(start_date, end_date):
    """Pick a random moment between two datetimes.

    The result is ``start_date`` plus a whole number of seconds drawn
    uniformly from 0 up to the (truncated) length of the interval, so both
    ends of the interval can be returned.

    Args:
        start_date: Earliest datetime that may be returned.
        end_date: Upper bound of the interval.

    Returns:
        A datetime within ``[start_date, end_date]``.
    """
    span_seconds = int((end_date - start_date).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start_date + offset
def generate_log_entries(num_entries, users, service_accounts, projects, buckets, data_objects, instances, malicious_probability):
    """Build a list of synthetic audit-log records.

    Every record combines a random principal, a random action from
    ``ACTIONS``, a random resource of the type that action applies to, a
    random UTC timestamp from the last 365 days, and a label.

    Args:
        num_entries: Number of records to generate; a value below 1 yields an
            empty list.
        users: List of user principal names.
        service_accounts: List of service-account names. They act as
            principals and are also the resource pool for actions that target
            a service account.
        projects: List of project names.
        buckets: List of bucket names.
        data_objects: List of data-object names.
        instances: List of instance names.
        malicious_probability: Chance that a record is labelled
            ``'malicious'`` rather than ``'benign'``. Values <= 0 never
            produce ``'malicious'``; values >= 1 always do.

    Returns:
        A list of dicts with the keys ``timestamp``, ``principal``,
        ``action``, ``resource`` and ``label``.

    Raises:
        IndexError: If a list that has to be sampled (the principals, or the
            resource pool of the chosen action) is empty.
    """
    principals = users + service_accounts
    actions = list(ACTIONS)

    # Resource pool to sample from for each resource type used in ACTIONS.
    pools = {
        'project': projects,
        'bucket': buckets,
        'data_object': data_objects,
        'instance': instances,
        'service_account': service_accounts,
    }

    # Timestamps are emitted with a 'Z' suffix, i.e. they claim to be UTC, so
    # the window has to come from the UTC clock rather than local time. The
    # tzinfo is dropped so isoformat() does not add '+00:00' in front of 'Z'.
    # A single clock reading keeps the window exactly 365 days long.
    end_date = datetime.now(timezone.utc).replace(tzinfo=None)
    start_date = end_date - timedelta(days=365)

    logs = []
    for _ in range(num_entries):
        principal = random.choice(principals)
        action = random.choice(actions)

        pool = pools.get(ACTIONS[action])
        # An action whose resource type has no pool gets a placeholder name.
        resource = random.choice(pool) if pool is not None else 'unknown_resource'

        timestamp = generate_random_timestamp(start_date, end_date).isoformat() + 'Z'
        label = 'malicious' if random.random() < malicious_probability else 'benign'

        logs.append({
            'timestamp': timestamp,
            'principal': principal,
            'action': action,
            'resource': resource,
            'label': label,
        })
    return logs
def main():
    """Command-line entry point: generate synthetic audit logs and save them as CSV.

    Parses the command-line options, builds the principal and resource name
    pools, generates the requested number of log entries, writes them to the
    output CSV file, and prints a confirmation message.
    """
    parser = argparse.ArgumentParser(description='Generate Synthetic GCP Cloud Audit Logs')
    parser.add_argument('--num_users', type=int, default=10, help='Number of user principals')
    parser.add_argument('--num_service_accounts', type=int, default=5, help='Number of service account principals')
    parser.add_argument('--num_projects', type=int, default=5, help='Number of projects')
    parser.add_argument('--num_buckets', type=int, default=20, help='Number of buckets')
    parser.add_argument('--num_data_objects', type=int, default=100, help='Number of data objects')
    parser.add_argument('--num_instances', type=int, default=50, help='Number of instances')
    parser.add_argument('--num_log_entries', type=int, default=10000, help='Number of log entries to generate')
    parser.add_argument('--malicious_probability', type=float, default=0.05, help='Probability that a log entry is malicious')
    parser.add_argument('--output', type=str, default='synthetic_logs.csv', help='Output CSV file name')
    args = parser.parse_args()

    # Generate principals and resources
    users, service_accounts = generate_principals(args.num_users, args.num_service_accounts)
    projects, buckets, data_objects, instances = generate_resources(
        args.num_projects,
        args.num_buckets,
        args.num_data_objects,
        args.num_instances,
    )

    # Generate log entries
    logs = generate_log_entries(
        args.num_log_entries,
        users,
        service_accounts,
        projects,
        buckets,
        data_objects,
        instances,
        args.malicious_probability,
    )

    # Create DataFrame and save to CSV. The columns are named explicitly (in
    # the same order as the keys of each log entry) so the CSV still gets its
    # header row when no entries were generated.
    df = pd.DataFrame(logs, columns=['timestamp', 'principal', 'action', 'resource', 'label'])
    df.to_csv(args.output, index=False)

    # Report the number of rows actually written, not the number requested.
    print(f'Successfully generated {len(logs)} log entries and saved to {args.output}')
# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()