
add log checking to refactored integration check #700

Merged · 17 commits · Apr 2, 2021
Changes from 2 commits
3 changes: 0 additions & 3 deletions src/api-service/__app__/onefuzzlib/agent_events.py
```diff
@@ -214,9 +214,6 @@ def on_worker_event_done(machine_id: UUID, event: WorkerDoneEvent) -> Result[None]:
             node.debug_keep_node = True
             node.save()
     else:
-        logging.error(
-            "task failed. %s:%s status:%s", task.job_id, task.task_id, event.exit_status
-        )
         task.mark_failed(
             Error(
                 code=ErrorCode.TASK_FAILED,
```

Contributor Author: Note: this is now handled by `task.mark_failed`. Since we don't record errors on tasks already marked as failures, this moves the error logging to model our state handling.
2 changes: 1 addition & 1 deletion src/api-service/__app__/onefuzzlib/azure/vm.py
```diff
@@ -136,7 +136,7 @@ def get_extension(vm_name: str, extension_name: str) -> Optional[Any]:
             resource_group, vm_name, extension_name
         )
     except (ResourceNotFoundError, CloudError) as err:
-        logging.error("extension does not exist %s", err)
+        logging.info("extension does not exist %s", err)
         return None
```

Contributor Author: this is commonplace when we're deploying VMs; it shouldn't be an error.
7 changes: 5 additions & 2 deletions src/api-service/__app__/onefuzzlib/notifications/main.py
```diff
@@ -108,9 +108,12 @@ def get_queue_tasks() -> Sequence[Tuple[Task, Sequence[str]]]:


 def new_files(container: Container, filename: str) -> None:
-    report = get_report_or_regression(container, filename)
-
     notifications = get_notifications(container)
+
+    report = get_report_or_regression(
+        container, filename, expect_reports=bool(notifications)
+    )
+
     if notifications:
         logging.info("notifications for %s %s %s", container, filename, notifications)
     done = []
```

Contributor Author: since we're going to try to inspect every new file as a potential report, this sets up the parsing (seen later in this PR) to only log errors when we actually expect the file to be a report, rather than just optimistically treating it as one.
41 changes: 25 additions & 16 deletions src/api-service/__app__/onefuzzlib/reports.py
```diff
@@ -17,24 +17,28 @@


 def parse_report_or_regression(
-    content: Union[str, bytes], file_path: Optional[str] = None
+    content: Union[str, bytes],
+    file_path: Optional[str] = None,
+    expect_reports: bool = False,
 ) -> Optional[Union[Report, RegressionReport]]:
     if isinstance(content, bytes):
         try:
             content = content.decode()
         except UnicodeDecodeError as err:
-            logging.error(
-                f"unable to parse report ({file_path}): "
-                f"unicode decode of report failed - {err}"
-            )
+            if expect_reports:
+                logging.error(
+                    f"unable to parse report ({file_path}): "
+                    f"unicode decode of report failed - {err}"
+                )
             return None

     try:
         data = json.loads(content)
     except json.decoder.JSONDecodeError as err:
-        logging.error(
-            f"unable to parse report ({file_path}): json decoding failed - {err}"
-        )
+        if expect_reports:
+            logging.error(
+                f"unable to parse report ({file_path}): json decoding failed - {err}"
+            )
         return None

     regression_err = None
@@ -46,29 +50,34 @@ def parse_report_or_regression(
     try:
         return Report.parse_obj(data)
     except ValidationError as err:
-        logging.error(
-            f"unable to parse report ({file_path}) as a report or regression. "
-            f"regression error: {regression_err} report error: {err}"
-        )
+        if expect_reports:
+            logging.error(
+                f"unable to parse report ({file_path}) as a report or regression. "
+                f"regression error: {regression_err} report error: {err}"
+            )
         return None


 # cache the last 1000 reports
 @cached(max_size=1000)
 def get_report_or_regression(
-    container: Container, filename: str
+    container: Container, filename: str, *, expect_reports: bool = False
 ) -> Optional[Union[Report, RegressionReport]]:
     file_path = "/".join([container, filename])
     if not filename.endswith(".json"):
-        logging.error("get_report invalid extension: %s", file_path)
+        if expect_reports:
+            logging.error("get_report invalid extension: %s", file_path)
         return None

     blob = get_blob(container, filename, StorageType.corpus)
     if blob is None:
-        logging.error("get_report invalid blob: %s", file_path)
+        if expect_reports:
+            logging.error("get_report invalid blob: %s", file_path)
         return None

-    return parse_report_or_regression(blob, file_path=file_path)
+    return parse_report_or_regression(
+        blob, file_path=file_path, expect_reports=expect_reports
+    )


 def get_report(container: Container, filename: str) -> Optional[Report]:
```
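A caller-side sketch of the two modes this enables (`container` and `filename` here are placeholders, not values from this PR):

```python
# hypothetical call sites for the two modes of get_report_or_regression

# opportunistic: any new file might be a report; stay quiet if it is not
maybe_report = get_report_or_regression(container, filename)

# expected: a notification is configured for this container, so a file
# that fails to parse is worth an error log
report = get_report_or_regression(container, filename, expect_reports=True)
```

One side note: since `expect_reports` is an argument to a `@cached` function, it presumably becomes part of the memoization key, so the same blob requested in both modes would be parsed and cached separately.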
2 changes: 2 additions & 0 deletions src/api-service/__app__/onefuzzlib/tasks/main.py
```diff
@@ -205,6 +205,8 @@ def mark_failed(
         )
         return

+    logging.error("task failed %s:%s - %s", self.job_id, self.task_id, error)
+
     self.error = error
     self.set_state(TaskState.stopping)
```

Contributor Author: See previous comment about `mark_failed`.
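For context, the early `return` above is the guard this PR leans on: a task that is already failed keeps its first error, so the new `logging.error` fires at most once per task. An abbreviated sketch of the resulting method (paraphrased from the visible diff context, not the verbatim model code):

```python
# abbreviated sketch of Task.mark_failed after this change; the guard is
# paraphrased, and the real signature may take additional parameters
def mark_failed(self, error: Error) -> None:
    if self.error is not None:
        # already marked as failed: keep the original error and skip
        # duplicate logging
        return

    logging.error("task failed %s:%s - %s", self.job_id, self.task_id, error)

    self.error = error
    self.set_state(TaskState.stopping)
```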
96 changes: 96 additions & 0 deletions src/integration-tests/integration-test.py
```diff
@@ -17,6 +17,8 @@
 # checks on each of the created items for the stage. This batch processing
 # allows testing multiple components concurrently.

+import datetime
+import json
 import logging
 import os
 import re
@@ -26,6 +28,7 @@
 from typing import Dict, List, Optional, Set, Tuple
 from uuid import UUID, uuid4

+import requests
 from onefuzz.api import Command, Onefuzz
 from onefuzz.backend import ContainerWrapper, wait
 from onefuzz.cli import execute_api
@@ -207,6 +210,8 @@ def __init__(self, onefuzz: Onefuzz, logger: logging.Logger, test_id: UUID) -> None:
         self.pools: Dict[OS, Pool] = {}
         self.test_id = test_id
         self.project = f"test-{self.test_id}"
+        self.start_log_marker = f"integration-test-injection-error-start-{self.test_id}"
+        self.stop_log_marker = f"integration-test-injection-error-stop-{self.test_id}"
@@ -215,6 +220,7 @@ def setup(
         self,
         pool_size: int,
         os_list: List[OS],
     ) -> None:
+        self.inject_log(self.start_log_marker)
         for entry in os_list:
             name = PoolName(f"testpool-{entry.name}-{self.test_id}")
             self.logger.info("creating pool: %s:%s", entry.name, name)
@@ -686,6 +692,88 @@ def cleanup(self) -> None:
         if errors:
             raise Exception("cleanup failed")

+    def inject_log(self, message: str) -> None:
+        # This is an *extremely* minimal implementation of the Application Insights rest
+        # API, as discussed here:
+        #
+        # https://apmtips.com/posts/2017-10-27-send-metric-to-application-insights/
+
+        key = self.of.info.get().insights_instrumentation_key
+        assert key is not None, "instrumentation key required for integration testing"
+
+        data = {
+            "data": {
+                "baseData": {
+                    "message": message,
+                    "severityLevel": "Information",
+                    "ver": 2,
+                },
+                "baseType": "MessageData",
+            },
+            "iKey": key,
+            "name": "Microsoft.ApplicationInsights.Message",
+            "time": datetime.datetime.now(datetime.timezone.utc)
+            .astimezone()
+            .isoformat(),
+        }
+
+        requests.post(
+            "https://dc.services.visualstudio.com/v2/track", json=data
+        ).raise_for_status()
+
+    def check_log_end_marker(
+        self,
+    ) -> Tuple[bool, str, bool]:
+        logs = self.of.debug.logs.keyword(
+            self.stop_log_marker, limit=1, timespan="PT1H"
+        )
+        return (
+            len(logs) > 0,
+            "waiting for application insight logs to flush",
+            True,
+        )
```
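Taken together with `setup()`, the two markers bracket the whole test run, and `wait()` polls `check_log_end_marker` until the stop marker shows up in a query, confirming that Application Insights has flushed. A condensed sketch of the flow, assuming `tester` is a constructed `TestOnefuzz` instance:

```python
# condensed marker-bracketing flow, using the methods defined above;
# `tester` is assumed to be a TestOnefuzz instance
tester.inject_log(tester.start_log_marker)  # emitted in setup(), before any stage

# ... fuzzing stages run and emit their own telemetry ...

tester.inject_log(tester.stop_log_marker)   # emitted in check_logs_for_errors()
wait(tester.check_log_end_marker, frequency=5.0)  # block until the marker is queryable
```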

```diff
+    def check_logs_for_errors(self) -> None:
+        # only check for errors that exist between the start and stop markers.
+        # also, only check the most recent 100,000 entries within the last 3
+        # hours
+
+        self.inject_log(self.stop_log_marker)
+        wait(self.check_log_end_marker, frequency=5.0)
+        self.logger.info("application insights log flushed")
+
+        logs = self.of.debug.logs.keyword("error", limit=100000, timespan="PT3H")
+
+        seen_errors = False
+        seen_stop = False
+        for entry in logs:
+            entry_as_str = json.dumps(entry, sort_keys=True)
+            if not seen_stop:
+                if self.stop_log_marker in entry_as_str:
+                    seen_stop = True
+                continue
+
+            if self.start_log_marker in entry_as_str:
+                break
+
+            # ignore logging.info coming from Azure Functions
+            if entry.get("customDimensions", {}).get("LogLevel") == "Information":
+                continue
+
+            # ignore warnings coming from the rust code; only be concerned
+            # about errors
+            if (
+                entry.get("severityLevel") == 2
+                and entry.get("sdkVersion") == "rust:0.1.5"
+            ):
+                continue
+
+            seen_errors = True
+
+            self.logger.error("error log: %s", entry.get("message"))
+
+        if seen_errors:
+            raise Exception("logs included errors")
```
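The filters above depend on the shape of the entries returned by the logs query. As a point of reference, `severityLevel` 2 corresponds to Warning in the Application Insights schema; a sketch of the kind of entry the Rust-agent filter skips, where the message text and custom dimensions are purely illustrative:

```python
# illustrative log entry; only the fields the filters inspect are shown,
# and the message text is hypothetical
entry = {
    "message": "agent retrying transient storage error",
    "severityLevel": 2,          # 2 == Warning in the App Insights schema
    "sdkVersion": "rust:0.1.5",  # telemetry emitted by the Rust agent
    "customDimensions": {"LogLevel": "Warning"},
}
```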


```diff
 class Run(Command):
@@ -739,6 +827,11 @@ def cleanup(self, test_id: UUID, *, endpoint: Optional[str]) -> None:
         tester = TestOnefuzz(self.onefuzz, self.logger, test_id=test_id)
         tester.cleanup()

+    def check_logs(self, test_id: UUID, *, endpoint: Optional[str]) -> None:
+        self.onefuzz.__setup__(endpoint=endpoint)
+        tester = TestOnefuzz(self.onefuzz, self.logger, test_id=test_id)
+        tester.check_logs_for_errors()
+
     def test(
         self,
         samples: Directory,
@@ -774,6 +867,9 @@ def test(
                 self.logger.warning("not testing crash repro")
             else:
                 self.check_repros(test_id, endpoint=endpoint)
+
+            self.check_logs(test_id, endpoint=endpoint)
+
         except Exception as e:
             self.logger.error("testing failed: %s", repr(e))
             error = e
```
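Because `Run`'s methods are exposed as subcommands via `execute_api`, the new check should also be runnable on its own against a completed test. A hypothetical standalone driver, with the `Command` constructor shape and the placeholder test id both assumed rather than taken from this PR:

```python
# hypothetical standalone driver for just the log check; the
# Command(onefuzz, logger) constructor shape is assumed, and the UUID is
# a placeholder for the test_id printed at the start of a run
import logging
from uuid import UUID

from onefuzz.api import Onefuzz

run = Run(Onefuzz(), logging.getLogger("integration"))
run.check_logs(UUID("00000000-0000-0000-0000-000000000000"), endpoint=None)
```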