Skip to content

Commit

Permalink
feature(monitoring): take screenshots with grafana image renderer
Browse files Browse the repository at this point in the history
- switch to grafana image renderer
- drop all of the webdriver-related implementation
- remove grafana snapshots - they have not been in use for quite some time
  • Loading branch information
fruch committed May 29, 2024
1 parent 2da25fc commit 89a7f2f
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 408 deletions.
23 changes: 3 additions & 20 deletions sdcm/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,6 @@
check_schema_version, check_nulls_in_peers, check_schema_agreement_in_gossip_and_peers, \
check_group0_tokenring_consistency, CHECK_NODE_HEALTH_RETRIES, CHECK_NODE_HEALTH_RETRY_DELAY
from sdcm.utils.decorators import NoValue, retrying, log_run_info, optional_cached_property
from sdcm.utils.remotewebbrowser import WebDriverContainerMixin
from sdcm.test_config import TestConfig
from sdcm.utils.sstable.sstable_utils import SstableUtils
from sdcm.utils.version_utils import (
Expand All @@ -127,7 +126,7 @@
from sdcm.sct_events.filters import EventsSeverityChangerFilter
from sdcm.utils.auto_ssh import AutoSshContainerMixin
from sdcm.monitorstack.ui import AlternatorDashboard
from sdcm.logcollector import GrafanaSnapshot, GrafanaScreenShot, PrometheusSnapshots, upload_archive_to_s3, \
from sdcm.logcollector import GrafanaScreenShot, PrometheusSnapshots, upload_archive_to_s3, \
save_kallsyms_map, collect_diagnostic_data
from sdcm.utils.ldap import LDAP_SSH_TUNNEL_LOCAL_PORT, LDAP_BASE_OBJECT, LDAP_PASSWORD, LDAP_USERS, \
LDAP_PORT, DEFAULT_PWD_SUFFIX
Expand Down Expand Up @@ -229,7 +228,7 @@ def destroy(self):
pass


class BaseNode(AutoSshContainerMixin, WebDriverContainerMixin): # pylint: disable=too-many-instance-attributes,too-many-public-methods
class BaseNode(AutoSshContainerMixin): # pylint: disable=too-many-instance-attributes,too-many-public-methods
CQL_PORT = 9042
CQL_SSL_PORT = 9142
MANAGER_AGENT_PORT = 10001
Expand Down Expand Up @@ -5686,12 +5685,10 @@ def get_grafana_screenshot_and_snapshot(self, test_start_time: Optional[int] = N
return {}

screenshot_links = []
snapshot_links = []
for node in self.nodes:
screenshot_links.extend(self.get_grafana_screenshots(node, test_start_time))
snapshot_links.extend(self.get_grafana_snapshots(node, test_start_time))

return {'screenshots': screenshot_links, 'snapshots': snapshot_links}
return {'screenshots': screenshot_links}

def get_grafana_screenshots(self, node: BaseNode, test_start_time: float) -> list[str]:
screenshot_links = []
Expand All @@ -5710,20 +5707,6 @@ def get_grafana_screenshots(self, node: BaseNode, test_start_time: float) -> lis

return screenshot_links

def get_grafana_snapshots(self, node: BaseNode, test_start_time: float) -> list[str]:
snapshot_links = []
grafana_extra_dashboards = []
if 'alternator_port' in self.params:
grafana_extra_dashboards = [AlternatorDashboard()]

snapshots_collector = GrafanaSnapshot(name="grafana-snapshot",
test_start_time=test_start_time,
extra_entities=grafana_extra_dashboards)
snapshots_data = snapshots_collector.collect(node, self.logdir)
snapshot_links.extend(snapshots_data.get('links', []))

return snapshot_links

def upload_annotations_to_s3(self):
annotations_url = ''
if not self.nodes:
Expand Down
103 changes: 9 additions & 94 deletions sdcm/logcollector.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,20 +54,18 @@
get_sct_root_path,
normalize_ipv6_url, create_remote_storage_dir,
)
from sdcm.utils.auto_ssh import AutoSshContainerMixin
from sdcm.utils.context_managers import environment
from sdcm.utils.decorators import retrying
from sdcm.utils.docker_utils import get_docker_bridge_gateway
from sdcm.utils.get_username import get_username
from sdcm.utils.k8s import KubernetesOps
from sdcm.utils.remotewebbrowser import RemoteBrowser, WebDriverContainerMixin
from sdcm.utils.s3_remote_uploader import upload_remote_files_directly_to_s3
from sdcm.utils.gce_utils import gce_public_addresses, gce_private_addresses

LOGGER = logging.getLogger(__name__)


class CollectingNode(AutoSshContainerMixin, WebDriverContainerMixin):
class CollectingNode:
# pylint: disable=too-few-public-methods,too-many-instance-attributes
logdir = None

Expand Down Expand Up @@ -440,7 +438,7 @@ class GrafanaEntity(BaseMonitoringEntity): # pylint: disable=too-few-public-met
]

grafana_port = 3000
grafana_entity_url_tmpl = "http://{node_ip}:{grafana_port}{path}?from={st}&to=now&refresh=1d"
grafana_entity_url_tmpl = "http://{node_ip}:{grafana_port}/render{path}?from={st}&to=now&refresh=1d"
sct_base_path = get_sct_root_path()

def __init__(self, *args, **kwargs):
Expand All @@ -450,18 +448,8 @@ def __init__(self, *args, **kwargs):
test_start_time = time.time() - (6 * 3600)
self.start_time = str(test_start_time).split('.', maxsplit=1)[0] + '000'
self.grafana_dashboards = self.base_grafana_dashboards + kwargs.pop("extra_entities", [])
self.remote_browser = None
super().__init__(*args, **kwargs)

def close_browser(self):
if self.remote_browser:
LOGGER.info('Grafana - browser quit')
self.remote_browser.quit()

def destory_webdriver_container(self):
if self.remote_browser:
self.remote_browser.destroy_containers()

def get_version(self, node):
_, _, version = self.get_monitoring_version(node)
if version:
Expand All @@ -480,7 +468,6 @@ class GrafanaScreenShot(GrafanaEntity):
GrafanaEntity
"""

@retrying(n=5)
def get_grafana_screenshot(self, node, local_dst):
"""
Take screenshot of the Grafana per-server-metrics dashboard and upload to S3
Expand All @@ -491,7 +478,6 @@ def get_grafana_screenshot(self, node, local_dst):
return screenshots

try:
self.remote_browser = RemoteBrowser(node)
for dashboard in self.grafana_dashboards:
try:
dashboard_metadata = MonitoringStack.get_dashboard_by_title(
Expand All @@ -512,11 +498,14 @@ def get_grafana_screenshot(self, node, local_dst):
dashboard.name,
datetime.datetime.now().strftime("%Y%m%d_%H%M%S"),
node.name))
self.remote_browser.open(grafana_url, dashboard.resolution)
dashboard.scroll_to_bottom(self.remote_browser.browser)
dashboard.wait_panels_loading(self.remote_browser.browser)
LOGGER.debug("Get screenshot for url %s, save to %s", grafana_url, screenshot_path)
self.remote_browser.get_screenshot(grafana_url, screenshot_path)
with requests.get(grafana_url, stream=True,
params=dict(width=dashboard.resolution[0],
height=dashboard.resolution[1])) as response:
response.raise_for_status()
with open(screenshot_path, 'wb') as output_file:
for chunk in response.iter_content(chunk_size=8192):
output_file.write(chunk)
screenshots.append(screenshot_path)
except Exception as details: # pylint: disable=broad-except
LOGGER.error("Error get screenshot %s: %s", dashboard.name, details, exc_info=True)
Expand All @@ -526,86 +515,13 @@ def get_grafana_screenshot(self, node, local_dst):
except Exception as details: # pylint: disable=broad-except
LOGGER.error("Error taking monitor screenshot: %s, traceback: %s", details, traceback.format_exc())
return []
finally:
self.close_browser()

def collect(self, node, local_dst, remote_dst=None, local_search_path=None):
node.logdir = local_dst
os.makedirs(local_dst, exist_ok=True)
return self.get_grafana_screenshot(node, local_dst)


class GrafanaSnapshot(GrafanaEntity):
"""Grafana snapshot
Collect Grafana snapshot
Extends:
GrafanaEntity
"""
@retrying(n=5)
def get_grafana_snapshot(self, node):
"""
Take snapshot of the Grafana per-server-metrics dashboard and upload to S3
"""
snapshots = []
version = self.get_version(node)
if not version:
return snapshots
try:
self.remote_browser = RemoteBrowser(node)
monitoring_ui.Login(self.remote_browser.browser,
ip=normalize_ipv6_url(node.grafana_address),
port=self.grafana_port).use_default_creds()
for dashboard in self.grafana_dashboards:
try:
dashboard_metadata = MonitoringStack.get_dashboard_by_title(
grafana_ip=normalize_ipv6_url(node.grafana_address),
port=self.grafana_port,
title=dashboard.title)
if not dashboard_metadata:
LOGGER.error("Dashboard '%s' was not found", dashboard.title)
continue

grafana_url = self.grafana_entity_url_tmpl.format(
node_ip=normalize_ipv6_url(node.grafana_address),
grafana_port=self.grafana_port,
path=dashboard_metadata["url"],
st=self.start_time)
LOGGER.info("Get snapshot link for url %s", grafana_url)
self.remote_browser.open(grafana_url, dashboard.resolution)
dashboard.scroll_to_bottom(self.remote_browser.browser)
dashboard.wait_panels_loading(self.remote_browser.browser)

snapshots.append(dashboard.get_snapshot(self.remote_browser.browser))
except Exception as details: # pylint: disable=broad-except
LOGGER.error("Error get snapshot %s: %s, traceback: %s",
dashboard.name, details, traceback.format_exc())

LOGGER.info(snapshots)
return snapshots

except Exception as details: # pylint: disable=broad-except
LOGGER.error("Error taking monitor snapshot: %s, traceback: %s", details, traceback.format_exc())
return []
finally:
self.close_browser()

def collect(self, node, local_dst, remote_dst=None, local_search_path=None):
node.logdir = local_dst
os.makedirs(local_dst, exist_ok=True)
snapshots = self.get_grafana_snapshot(node)
snapshots_file = os.path.join(local_dst, "grafana_snapshots")
with open(snapshots_file, "w", encoding="utf-8") as f: # pylint: disable=invalid-name
for snapshot in snapshots:
f.write(snapshot + '\n')

return {'links': snapshots, 'file': snapshots_file}

def __del__(self):
self.destory_webdriver_container()


class LogCollector:
"""Base class for LogCollector types
Expand Down Expand Up @@ -950,7 +866,6 @@ class MonitorLogCollector(LogCollector):
PrometheusSnapshots(name='prometheus_data'),
MonitoringStack(name='monitoring-stack'),
GrafanaScreenShot(name='grafana-screenshot'),
GrafanaSnapshot(name='grafana-snapshot')
]
cluster_log_type = "monitor-set"
cluster_dir_prefix = "monitor-set"
Expand Down
Loading

0 comments on commit 89a7f2f

Please sign in to comment.