Enable redline CSV support in upload #642

Closed
wants to merge 1 commit

134 changes: 64 additions & 70 deletions timesketch/lib/utils.py
@@ -36,87 +36,82 @@ def random_color():
rgb = tuple(int(i * 256) for i in colorsys.hsv_to_rgb(hue, 0.5, 0.95))
return u'{0:02X}{1:02X}{2:02X}'.format(rgb[0], rgb[1], rgb[2])

def read_and_validate_csv(path, delimiter):
"""Generator for reading a CSV or TSV file.

def get_csv_dialect(csv_header):
"""Get CSV dialect format.

Args:
path: Path to the file
delimiter: character used as a field separator
csv_header: List of CSV column names

Returns:
Name of the dialect if known, otherwise None
"""
# Columns that must be present in the CSV file
mandatory_fields = [u'message', u'datetime', u'timestamp_desc']

with open(path, 'rb') as fh:
# Check if redline format
redline_fields = {
u'Alert', u'Tag', u'Timestamp', u'Field', u'Summary'}
redline_intersection = set(
redline_fields).intersection(set(csv_header))
if len(redline_fields) == len(redline_intersection):
return u'redline'

Collaborator commented:
If I understand what you're trying to achieve, I think set(redline_fields).issubset(set(csv_header)) has the same effect and is more readable.
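For reference, a minimal sketch of the issubset-based check the reviewer is suggesting (same field sets as in the diff above; issubset accepts any iterable, so wrapping csv_header in set() is optional):

def get_csv_dialect(csv_header):
    """Get CSV dialect format.

    Args:
        csv_header: List of CSV column names

    Returns:
        Name of the dialect if known, otherwise None
    """
    redline_fields = {u'Alert', u'Tag', u'Timestamp', u'Field', u'Summary'}
    timesketch_fields = {u'message', u'datetime', u'timestamp_desc'}

    # "All required fields are present in the header" reads directly off
    # issubset(), with no intermediate intersection to build and measure.
    if redline_fields.issubset(csv_header):
        return u'redline'
    if timesketch_fields.issubset(csv_header):
        return u'timesketch'
    return None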


# Check if Timesketch supported format
timesketch_fields = {u'message', u'datetime', u'timestamp_desc'}
timesketch_intersection = timesketch_fields.intersection(
set(csv_header))
if len(timesketch_fields) == len(timesketch_intersection):
return u'timesketch'

Collaborator commented:
Same comment as above


return None


def read_and_validate_csv(path, delimiter):
"""Generator for reading a CSV or TSV file.

reader = csv.DictReader(fh, delimiter=delimiter.decode('string_escape'))
csv_header = reader.fieldnames
missing_fields = []
# Validate the CSV header
for field in mandatory_fields:
if field not in csv_header:
missing_fields.append(field)
if missing_fields:
raise RuntimeError(
u'Missing fields in CSV header: {0:s}'.format(missing_fields))
for row in reader:
if u'timestamp' not in csv_header and u'datetime' in csv_header:
try:
parsed_datetime = parser.parse(row[u'datetime'])
row[u'timestamp'] = str(
int(time.mktime(parsed_datetime.timetuple())))
except ValueError:
continue

yield row

def read_and_validate_redline(path):
"""Generator for reading a Redline CSV file.
Args:
path: Path to the file
delimiter: character used as a field separator
"""
# Columns that must be present in the CSV file

# check if it is the right redline format
mandatory_fields = [u'Alert', u'Tag', u'Timestamp', u'Field', u'Summary']
Returns:
Generator of event rows

Collaborator commented:
Would also be useful to know what type event rows are.
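
One way to spell that out in the docstring (a sketch only; in the code above each row is a plain dict keyed by column name, with an integer microsecond timestamp added where one can be derived):

    Returns:
        Generator of event rows, where each row is a dict mapping field
        names (message, datetime, timestamp_desc, ...) to values, plus an
        integer 'timestamp' in microseconds when it can be derived.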


Raises:
RuntimeError if CSV format is unknown
"""
with open(path, 'rb') as fh:
csv.register_dialect('myDialect',
delimiter=',',
quoting=csv.QUOTE_ALL,
skipinitialspace=True)
reader = csv.DictReader(fh, delimiter=',', dialect='myDialect')

reader = csv.DictReader(fh, delimiter=delimiter.decode('string_escape'))
csv_header = reader.fieldnames
missing_fields = []
# Validate the CSV header
for field in mandatory_fields:
if field not in csv_header:
missing_fields.append(field)
if missing_fields:
raise RuntimeError(
u'Missing fields in CSV header: {0:s}'.format(missing_fields))
for row in reader:

dt = parser.parse(row['Timestamp'])
timestamp = int(time.mktime(dt.timetuple())) * 1000
dt_iso_format = dt.isoformat()
timestamp_desc = row['Field']

summary = row['Summary']
alert = row['Alert']
tag = row['Tag']

row_to_yield = {}
row_to_yield["message"] = summary
row_to_yield["timestamp"] = timestamp
row_to_yield["datetime"] = dt_iso_format
row_to_yield["timestamp_desc"] = timestamp_desc
row_to_yield["alert"] = alert #extra field
tags = [tag]
row_to_yield["tag"] = tags # extra field

yield row_to_yield
csv_dialect = get_csv_dialect(csv_header)

if u'redline' in csv_dialect:

Collaborator commented:
it seems that get_csv_dialect returns a string, not an array of strings. I think you want to use csv_dialect == 'redline'. Also, this will fail if csv_dialect is None.

for row in reader:
parsed_datetime = parser.parse(row[u'Timestamp'])
timestamp = int(
time.mktime(parsed_datetime.timetuple())) * 1000000
parsed_datetime_iso_format = parsed_datetime.isoformat()
row = dict(
message=row[u'Summary'],
timestamp=timestamp,
datetime=parsed_datetime_iso_format,
timestamp_desc=row[u'Field'],
alert=row[u'Alert'],
tag=[row[u'Tag']]
)
yield row

elif u'timesketch' in csv_dialect:
for row in reader:
if u'timestamp' not in csv_header and u'datetime' in csv_header:
try:
parsed_datetime = parser.parse(row[u'datetime'])
row[u'timestamp'] = int(
time.mktime(parsed_datetime.timetuple())) * 1000000
except ValueError:
continue
yield row
else:
raise RuntimeError(u'Unknown CSV format')

Collaborator commented:
You can turn this into a guard statement before the other checks

if csv_dialect is None:
    raise RuntimeError()

if csv_dialect == 'timesketch':
    ...

if csv_dialect == 'redline':
    ...

Collaborator commented:
I like the proposal
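
Putting the two threads together, a sketch of how the tail of read_and_validate_csv could read (a fragment only, assuming the reader/csv_header setup above is unchanged and that get_csv_dialect returns u'redline', u'timesketch', or None):

        csv_dialect = get_csv_dialect(csv_header)
        # Guard first: fail fast on headers that match neither format.
        if csv_dialect is None:
            raise RuntimeError(u'Unknown CSV format')

        if csv_dialect == u'redline':
            for row in reader:
                parsed_datetime = parser.parse(row[u'Timestamp'])
                timestamp = int(
                    time.mktime(parsed_datetime.timetuple())) * 1000000
                yield dict(
                    message=row[u'Summary'],
                    timestamp=timestamp,
                    datetime=parsed_datetime.isoformat(),
                    timestamp_desc=row[u'Field'],
                    alert=row[u'Alert'],  # extra field
                    tag=[row[u'Tag']])  # extra field
        else:  # csv_dialect == u'timesketch'
            for row in reader:
                if u'timestamp' not in csv_header and u'datetime' in csv_header:
                    try:
                        parsed_datetime = parser.parse(row[u'datetime'])
                        row[u'timestamp'] = int(
                            time.mktime(parsed_datetime.timetuple())) * 1000000
                    except ValueError:
                        continue
                yield row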



def read_and_validate_jsonl(path, _):
"""Generator for reading a JSONL (json lines) file.
@@ -127,7 +122,6 @@ def read_and_validate_jsonl(path, _):
# Fields that must be present in each entry of the JSONL file.
mandatory_fields = [u'message', u'datetime', u'timestamp_desc']
with open(path, 'rb') as fh:

lineno = 0
for line in fh:
lineno += 1
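For context, both dialects are consumed through the same generator; a usage sketch (the file path is hypothetical, and this assumes the final version validates Redline headers via get_csv_dialect rather than the Timesketch mandatory-fields list):

from timesketch.lib.utils import read_and_validate_csv

# A Redline export and a native Timesketch CSV take the same code path;
# the dialect is detected from the header row.
for event in read_and_validate_csv(u'/tmp/redline_export.csv', ','):
    print(event[u'message'])
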
45 changes: 1 addition & 44 deletions tsctl
@@ -33,7 +33,6 @@ from sqlalchemy.exc import IntegrityError
from timesketch import create_app
from timesketch.lib.datastores.elastic import ElasticsearchDataStore
from timesketch.lib.utils import read_and_validate_csv
from timesketch.lib.utils import read_and_validate_redline
from timesketch.lib.utils import read_and_validate_jsonl
from timesketch.lib.experimental.similarity import SimilarityScorer
from timesketch.models import db_session
@@ -390,49 +389,9 @@ class CreateTimelineFromCsv(CreateTimelineBase):
# Import the remaining events in the queue
total_events = es.import_event(
index_name, event_type, flush_interval=flush_interval)
sys.stdout.write(u'\nTotal events: {0:d}\n'.format(total_events))
sys.stdout.write(u'Total events: {0:d}\n'.format(total_events))
self.create_searchindex(timeline_name, index_name)

class CreateTimelineFromRedline(CreateTimelineBase):
"""Create a new Timesketch timeline from a Redline csv file."""

def __init__(self):
super(CreateTimelineFromRedline, self).__init__()

def run(self, timeline_name, index_name, file_path, event_type,
flush_interval, delimiter):
"""Create the timeline from a Redline file.

Args:
timeline_name: The name of the timeline in Timesketch
index_name: Name of the index in Elasticsearch
file_path: Path to the file to process
event_type: Type of event (e.g. plaso_event)
flush_interval: Number of events to queue up before bulk insert
delimiter: Character used as a field separator

"""
timeline_name = unicode(timeline_name.decode(encoding=u'utf-8'))
index_name = unicode(index_name.decode(encoding=u'utf-8'))
es = ElasticsearchDataStore(
host=current_app.config[u'ELASTIC_HOST'],
port=current_app.config[u'ELASTIC_PORT'])

es.create_index(index_name=index_name, doc_type=event_type)
for event in read_and_validate_redline(file_path):
event_counter = es.import_event(
index_name, event_type, event, flush_interval=flush_interval)
if event_counter % int(flush_interval) == 0:
sys.stdout.write(
u'Indexing progress: {0:d} events\r'.format(event_counter))
sys.stdout.flush()

# Import the remaining events in the queue
total_events = es.import_event(
index_name, event_type, flush_interval=flush_interval)
sys.stdout.write(u'\nTotal events: {0:d}\n'.format(total_events))
self.create_searchindex(timeline_name, index_name)


class CreateTimelineFromJSONL(CreateTimelineBase):
"""Create a new Timesketch timeline from a JSONL file."""
@@ -473,7 +432,6 @@ class CreateTimelineFromJSONL(CreateTimelineBase):
self.create_searchindex(timeline_name, index_name)



class PurgeTimeline(Command):
"""Delete timeline permanently from Timesketch and Elasticsearch."""
option_list = (Option(
@@ -624,7 +582,6 @@ if __name__ == '__main__':
shell_manager.add_command(u'manage_group', GroupManager())
shell_manager.add_command(u'add_index', AddSearchIndex())
shell_manager.add_command(u'csv2ts', CreateTimelineFromCsv())
shell_manager.add_command(u'redline2ts', CreateTimelineFromRedline())
shell_manager.add_command(u'jsonl2ts', CreateTimelineFromJSONL())
shell_manager.add_command(u'db', MigrateCommand)
shell_manager.add_command(u'drop_db', DropDataBaseTables())
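With CreateTimelineFromRedline and the redline2ts command removed, Redline CSVs are expected to go through the existing csv2ts path. A sketch of that import loop (assuming CreateTimelineFromCsv.run keeps the same shape as the removed Redline variant above, only calling read_and_validate_csv):

        es.create_index(index_name=index_name, doc_type=event_type)
        for event in read_and_validate_csv(file_path, delimiter):
            event_counter = es.import_event(
                index_name, event_type, event, flush_interval=flush_interval)
            if event_counter % int(flush_interval) == 0:
                sys.stdout.write(
                    u'Indexing progress: {0:d} events\r'.format(event_counter))
                sys.stdout.flush()

        # Import the remaining events in the queue.
        total_events = es.import_event(
            index_name, event_type, flush_interval=flush_interval)
        sys.stdout.write(u'Total events: {0:d}\n'.format(total_events))
        self.create_searchindex(timeline_name, index_name)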