Skip to content

Commit

Permalink
Merge branch 'develop' into wip/cleanupInstancesDevelop
Browse files Browse the repository at this point in the history
  • Loading branch information
dreambeyondorange authored Mar 20, 2024
2 parents 6543393 + c23948a commit fe86341
Show file tree
Hide file tree
Showing 17 changed files with 257 additions and 437 deletions.
20 changes: 0 additions & 20 deletions tests/integration-tests/configs/develop.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -248,13 +248,6 @@ test-suites:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["centos7"]
schedulers: ["slurm"]
disable_hyperthreading:
test_disable_hyperthreading.py::test_hit_disable_hyperthreading:
dimensions:
- regions: ["us-west-1"]
instances: ["c5.xlarge"]
oss: ["ubuntu2204"]
schedulers: ["slurm"]
dns:
test_dns.py::test_hit_no_cluster_dns_mpi:
dimensions:
Expand Down Expand Up @@ -341,26 +334,13 @@ test-suites:
instances: ["c5.18xlarge"]
oss: ["centos7"]
schedulers: ["slurm"]
log_rotation:
test_log_rotation.py::test_log_rotation:
dimensions:
- regions: ["il-central-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2004"]
schedulers: ["slurm"]
monitoring:
test_monitoring.py::test_monitoring:
dimensions:
- regions: ["ap-northeast-2"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
schedulers: ["slurm"]
oss: ["rocky9"]
test_structured_log_events.py::test_custom_compute_action_failure:
dimensions:
- regions: ["af-south-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2204"]
schedulers: ["slurm"]
multiple_nics:
test_multiple_nics.py::test_multiple_nics:
dimensions:
Expand Down
14 changes: 0 additions & 14 deletions tests/integration-tests/configs/isolated_regions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -135,13 +135,6 @@ test-suites:
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
disable_hyperthreading:
test_disable_hyperthreading.py::test_hit_disable_hyperthreading:
dimensions:
- regions: {{ REGIONS }}
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
dns:
test_dns.py::test_hit_no_cluster_dns_mpi:
dimensions:
Expand Down Expand Up @@ -531,13 +524,6 @@ test-suites:
# - regions: {{ REGIONS }}
# oss: {{ OSS }}
# schedulers: {{ SCHEDULERS }}
log_rotation:
test_log_rotation.py::test_log_rotation:
dimensions:
- regions: {{ REGIONS }}
instances: {{ INSTANCES }}
oss: {{ OSS }}
schedulers: {{ SCHEDULERS }}
health_checks:
test_gpu_health_checks.py::test_cluster_with_gpu_health_checks:
dimensions:
Expand Down
7 changes: 0 additions & 7 deletions tests/integration-tests/configs/new_os.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,13 +87,6 @@ test-suites:
instances: ["g5g.2xlarge"]
oss: {{ NEW_OS }}
schedulers: ["slurm"]
disable_hyperthreading:
test_disable_hyperthreading.py::test_hit_disable_hyperthreading:
dimensions:
- regions: ["us-west-1"]
instances: ["m4.xlarge"]
oss: {{ NEW_OS }}
schedulers: ["slurm"]
dns:
test_dns.py::test_hit_no_cluster_dns_mpi:
dimensions:
Expand Down
20 changes: 0 additions & 20 deletions tests/integration-tests/configs/released.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,13 +118,6 @@ test-suites:
instances: ["g4dn.2xlarge"]
oss: ["rhel8"]
schedulers: ["slurm"]
disable_hyperthreading:
test_disable_hyperthreading.py::test_hit_disable_hyperthreading:
dimensions:
- regions: ["us-west-1"]
instances: ["c5.xlarge"]
oss: ["ubuntu2204"]
schedulers: ["slurm"]
dns:
test_dns.py::test_existing_hosted_zone:
dimensions:
Expand All @@ -146,26 +139,13 @@ test-suites:
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["rhel8"]
schedulers: ["slurm"]
log_rotation:
test_log_rotation.py::test_log_rotation:
dimensions:
- regions: ["il-central-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2004"]
schedulers: ["slurm"]
monitoring:
test_monitoring.py::test_monitoring:
dimensions:
- regions: ["ap-northeast-2"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
schedulers: ["slurm"]
oss: ["rocky9"]
test_structured_log_events.py::test_custom_compute_action_failure:
dimensions:
- regions: ["af-south-1"]
instances: {{ common.INSTANCES_DEFAULT_X86 }}
oss: ["ubuntu2204"]
schedulers: ["slurm"]
networking:
test_cluster_networking.py::test_existing_eip:
dimensions:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,52 +12,7 @@
import logging
import re

import pytest
from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor

from tests.common.assertions import assert_no_errors_in_logs
from tests.common.utils import fetch_instance_slots, run_system_analyzer


@pytest.mark.usefixtures("os")
def test_hit_disable_hyperthreading(
    region,
    scheduler,
    instance,
    pcluster_config_reader,
    clusters_factory,
    default_threads_per_core,
    scheduler_commands_factory,
    request,
):
    """Check hyperthreading settings on the HT-enabled and HT-disabled partitions of a HIT cluster."""
    slots_per_instance = fetch_instance_slots(region, instance)
    cluster = clusters_factory(pcluster_config_reader())
    executor = RemoteCommandExecutor(cluster)
    commands = scheduler_commands_factory(executor)

    # Run the same verification twice: first on the partition that keeps
    # hyperthreading on, then on the one that turns it off.
    for partition_name, ht_disabled in (("ht-enabled", False), ("ht-disabled", True)):
        _test_disable_hyperthreading_settings(
            executor,
            commands,
            slots_per_instance,
            scheduler,
            hyperthreading_disabled=ht_disabled,
            partition=partition_name,
            default_threads_per_core=default_threads_per_core,
        )

    assert_no_errors_in_logs(executor, scheduler)
    run_system_analyzer(cluster, scheduler_commands_factory, request)


def _test_disable_hyperthreading_settings(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,146 +9,11 @@
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging

import pytest
from assertpy import assert_that
from remote_command_executor import RemoteCommandExecutor
from retrying import retry
from time_utils import minutes, seconds
from utils import is_dcv_supported

from tests.cloudwatch_logging.cloudwatch_logging_boto3_utils import get_cluster_log_groups_from_boto3, get_log_events


@pytest.mark.usefixtures("instance", "os", "scheduler")
def test_log_rotation(
    region, pcluster_config_reader, s3_bucket_factory, clusters_factory, test_datadir, scheduler_commands_factory, os
):
    """Exercise the ParallelCluster log rotation setup on the head node and on a compute node."""

    def log_entry(name, path, **flags):
        """Build one log descriptor; flags carry the optional "existence"/"trigger_new_entries" keys."""
        return {"log_name": name, "log_path": path, **flags}

    dcv_enabled = is_dcv_supported(region)
    cluster = clusters_factory(pcluster_config_reader(dcv_enabled=dcv_enabled))

    remote_command_executor = RemoteCommandExecutor(cluster)
    slurm_commands = scheduler_commands_factory(remote_command_executor)
    compute_instances = cluster.describe_cluster_instances(node_type="Compute")
    compute_node_ip = compute_instances[0].get("privateIpAddress")
    headnode_instances = cluster.describe_cluster_instances(node_type="HeadNode")
    headnode_ip = headnode_instances[0].get("privateIpAddress")
    log_groups = get_cluster_log_groups_from_boto3(f"/aws/parallelcluster/{cluster.name}")
    log_group_name = log_groups[0].get("logGroupName")

    logging.info("Verifying ParallelCluster log rotation configuration.")

    # Logs passed to both the head node check and the compute node check.
    common_logs = [
        log_entry("cloud-init", "/var/log/cloud-init.log", existence=True),
        log_entry("supervisord", "/var/log/supervisord.log", existence=True),
        log_entry("bootstrap_error_msg", "/var/log/parallelcluster/bootstrap_error_msg"),
    ]

    # Logs passed only to the head node check.
    headnode_specified_logs = [
        log_entry("clustermgtd", "/var/log/parallelcluster/clustermgtd", existence=True, trigger_new_entries=True),
        log_entry(
            "clusterstatusmgtd",
            "/var/log/parallelcluster/clusterstatusmgtd",
            existence=True,
            trigger_new_entries=True,
        ),
        log_entry("cfn-init", "/var/log/cfn-init.log", existence=True),
        log_entry("slurmdbd", "/var/log/slurmdbd.log"),
        log_entry("slurmctld", "/var/log/slurmctld.log", existence=True, trigger_new_entries=True),
        log_entry("compute_console_output", "/var/log/parallelcluster/compute_console_output.log", existence=True),
        log_entry(
            "slurm_fleet_status_manager",
            "/var/log/parallelcluster/slurm_fleet_status_manager.log",
            existence=True,
        ),
        log_entry(
            "slurm_suspend",
            "/var/log/parallelcluster/slurm_suspend.log",
            existence=True,
            trigger_new_entries=True,
        ),
        log_entry(
            "slurm_resume",
            "/var/log/parallelcluster/slurm_resume.log",
            existence=True,
            trigger_new_entries=True,
        ),
        log_entry("chef-client", "/var/log/chef-client.log", existence=True),
        log_entry("clustermgtd_events", "/var/log/parallelcluster/clustermgtd.events", existence=True),
        log_entry("slurm_resume_events", "/var/log/parallelcluster/slurm_resume.events", existence=True),
    ]

    # DCV logs are only added when is_dcv_supported(region) returned a truthy value.
    if dcv_enabled:
        headnode_specified_logs += [
            log_entry("dcv-agent", "/var/log/dcv/agent.*.log"),
            log_entry("dcv-session-launcher", "/var/log/dcv/sessionlauncher.log", existence=False),
            log_entry("Xdcv", "/var/log/dcv/Xdcv.*.log"),
            log_entry("dcv-server", "/var/log/dcv/server.log", existence=True, trigger_new_entries=False),
            log_entry("dcv-xsession", "/var/log/dcv/dcv-xsession.*.log"),
        ]

    # Logs passed only to the compute node check.
    compute_specified_logs = [
        log_entry("cloud-init-output", "/var/log/cloud-init-output.log", existence=True),
        log_entry("computemgtd", "/var/log/parallelcluster/computemgtd", existence=True, trigger_new_entries=True),
        log_entry("slurmd", "/var/log/slurmd.log", existence=True),
    ]

    pre_rotation_message = "test message before log rotation."
    post_rotation_message = "test message after log rotation."

    _test_headnode_log_rotation(
        os,
        headnode_specified_logs,
        common_logs,
        remote_command_executor,
        pre_rotation_message,
        post_rotation_message,
        slurm_commands,
        cluster,
        headnode_ip,
        log_group_name,
    )
    _test_compute_log_rotation(
        os,
        compute_specified_logs,
        common_logs,
        remote_command_executor,
        pre_rotation_message,
        post_rotation_message,
        cluster,
        compute_node_ip,
        log_group_name,
    )
from tests.cloudwatch_logging.cloudwatch_logging_boto3_utils import get_log_events


@retry(wait_fixed=seconds(20), stop_max_delay=minutes(9))
Expand Down
Loading

0 comments on commit fe86341

Please sign in to comment.