Skip to content

Commit

Permalink
* Fixed a bug in Get->server_host() where it would error out trying t…
Browse files Browse the repository at this point in the history
…o check what servers were running on the peer when the peer was offline.

* Fixed a bug in ScanCore where the '-v...' switches were not being honoured properly.
* Updated ScanCore's thermal "cool down" timers to count both thermal shutdowns and load sheds in the previous six hours.
* Fixed a bug where, if the peer was offline and the local node was in thermal warning, it would try to load shed.
* Upped the verbosity of logging in ScanCore's 'avoid_duplicate_run' function after a (so far unreproducible) bug where two copies of ScanCore managed to run at the same time.
* Updated scan-apc-ups's warning when communication is lost to a UPS and kick-apc-ups is in use to warn of the potential hard shut down.

Signed-off-by: Digimer <digimer@alteeve.ca>
  • Loading branch information
Digimer committed May 24, 2017
1 parent 4bf5b7a commit 20f2589
Show file tree
Hide file tree
Showing 6 changed files with 60 additions and 38 deletions.
11 changes: 7 additions & 4 deletions AN/Tools/Get.pm
Original file line number Diff line number Diff line change
Expand Up @@ -3940,15 +3940,18 @@ sub server_host
foreach my $node_key ("node1", "node2")
{
next if $host;
my $online = $an->data->{sys}{anvil}{$node_key}{online};
my $target = $an->data->{sys}{anvil}{$node_key}{use_ip};
my $port = $an->data->{sys}{anvil}{$node_key}{use_port};
my $password = $an->data->{sys}{anvil}{$node_key}{password};
my $shell_call = $an->data->{path}{virsh}." list --all";
$an->Log->entry({log_level => 2, message_key => "an_variables_0003", message_variables => {
name1 => "target", value1 => $target,
name2 => "port", value2 => $port,
name3 => "shell_call", value3 => $shell_call,
$an->Log->entry({log_level => 2, message_key => "an_variables_0004", message_variables => {
name1 => "online", value1 => $online,
name2 => "target", value2 => $target,
name3 => "port", value3 => $port,
name4 => "shell_call", value4 => $shell_call,
}, file => $THIS_FILE, line => __LINE__});
next if not $online;
my ($error, $ssh_fh, $return) = $an->Remote->remote_call({
target => $target,
port => $port,
Expand Down
62 changes: 33 additions & 29 deletions ScanCore/ScanCore
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ $an->Storage->read_words({file => $an->data->{path}{striker_strings}});

# See if the user is forcing the issue
$an->Get->switches();
$an->Log->adjust_log_level({key => $THIS_FILE});
$an->Log->adjust_log_level({key => "scancore"});

# Help?
if (($an->data->{switches}{h}) or
Expand Down Expand Up @@ -323,18 +323,14 @@ while(1)
$an->Storage->read_conf ({file => $an->data->{path}{striker_config}});
$an->Storage->read_words ({file => $an->data->{path}{scancore_strings}});
$an->Storage->read_words ({file => $an->data->{path}{striker_strings}});
$an->Log->adjust_log_level({key => $THIS_FILE});
$an->Log->adjust_log_level({key => "scancore"});
$an->Log->level ($an->data->{scancore}{log_level});
$an->data->{scancore}{enabled} = 1 if $an->data->{switches}{force};
load_agent_strings($an);

# Set the log level in case it was changed in the config file.
$an->Log->level($an->data->{scancore}{log_level});

# TODO: Calc the md5s of ScanCore's core files and if they change, exit.
# Calculate the sum and see if it has changed.
#calculate_scancore_sum($an);

# Did the user disable ScanCore while it was sleeping?
exit_if_disabled($an);

Expand Down Expand Up @@ -1937,19 +1933,17 @@ AND
my $last_shutdown = 0;
my $query = "
SELECT
round(extract(epoch from modified_date))
round(extract(epoch from modified_date))
FROM
history.hosts
WHERE
host_name = ".$an->data->{sys}{use_db_fh}->quote($node)."
host_name = ".$an->data->{sys}{use_db_fh}->quote($node)."
AND
host_emergency_stop = TRUE
(host_stop_reason = 'temperature' OR host_stop_reason = 'shed_load')
AND
host_stop_reason = 'temperature'
AND
modified_date > (now() - interval '6h')
modified_date > (now() - interval '6h')
ORDER BY
modified_date ASC
modified_date DESC
;";
# There should always be at least one.
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
Expand Down Expand Up @@ -2852,12 +2846,20 @@ AND
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "both_nodes_online", value1 => $both_nodes_online,
}, file => $THIS_FILE, line => __LINE__});
$an->Log->entry({log_level => 2, message_key => "an_variables_0002", message_variables => {
name1 => "sys::anvil::node1::online", value1 => $an->data->{sys}{anvil}{node1}{online},
name2 => "sys::anvil::node2::online", value2 => $an->data->{sys}{anvil}{node2}{online},
}, file => $THIS_FILE, line => __LINE__});
if ((not $an->data->{sys}{anvil}{node1}{online}) or (not $an->data->{sys}{anvil}{node2}{online}))
{
# Disable load shedding because our peer is dead.
$both_nodes_online = 0;
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "both_nodes_online", value1 => $both_nodes_online,
$both_nodes_online = 0;
$evaluate_load_shed = 0;
$shed_load = 0;
$an->Log->entry({log_level => 2, message_key => "an_variables_0003", message_variables => {
name1 => "both_nodes_online", value1 => $both_nodes_online,
name2 => "evaluate_load_shed", value2 => $evaluate_load_shed,
name3 => "shed_load", value3 => $shed_load,
}, file => $THIS_FILE, line => __LINE__});
}

Expand Down Expand Up @@ -2902,10 +2904,12 @@ AND
}, file => $THIS_FILE, line => __LINE__});
next if $an->data->{sys}{anvil}{$node_key}{host_uuid} eq $an->data->{sys}{host_uuid};

$peer_host_uuid = $an->data->{sys}{anvil}{$node_key}{host_uuid};

$peer_host_name = $an->data->{sys}{anvil}{$node_key}{name};
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "peer_host_uuid", value1 => $peer_host_uuid,
$peer_host_uuid = $an->data->{sys}{anvil}{$node_key}{host_uuid};
$an->Log->entry({log_level => 2, message_key => "an_variables_0002", message_variables => {
name1 => "peer_host_name", value1 => $peer_host_name,
name2 => "peer_host_uuid", value2 => $peer_host_uuid,
}, file => $THIS_FILE, line => __LINE__});
last;
}
Expand Down Expand Up @@ -11348,46 +11352,46 @@ sub scan_directory
sub avoid_duplicate_run
{
my ($an) = @_;
$an->Log->entry({log_level => 3, title_key => "tools_log_0001", title_variables => { function => "avoid_duplicate_run" }, message_key => "tools_log_0002", file => $THIS_FILE, line => __LINE__});
$an->Log->entry({log_level => 2, title_key => "tools_log_0001", title_variables => { function => "avoid_duplicate_run" }, message_key => "tools_log_0002", file => $THIS_FILE, line => __LINE__});

my $my_pid = $$;
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "my_pid", value1 => $my_pid,
}, file => $THIS_FILE, line => __LINE__});

# See if there is a pending lock. If there is, we won't do anything because the other instance might
# be waiting on the lock. We don't worry about lock age because any other node or dashboard will reap
# the lock if it gets too old.
my $lock = $an->DB->locking({check => 1});
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "lock", value1 => $lock,
}, file => $THIS_FILE, line => __LINE__});

my $already_running = 0;
my $other_pids = [];
my $shell_call = $an->data->{path}{ps}." aux | ".$an->data->{path}{'grep'}." -v grep | ".$an->data->{path}{'grep'}." -v tail | ".$an->data->{path}{'grep'}." -v '\/bin\/sh' | ".$an->data->{path}{'grep'}." $THIS_FILE";
$an->Log->entry({log_level => 3, message_key => "scancore_log_0007", message_variables => { shell_call => $shell_call }, file => $THIS_FILE, line => __LINE__});
$an->Log->entry({log_level => 2, message_key => "scancore_log_0007", message_variables => { shell_call => $shell_call }, file => $THIS_FILE, line => __LINE__});
open (my $file_handle, "$shell_call 2>&1 |") or $an->Alert->error({title_key => "an_0003", message_key => "error_message_0070", message_variables => { shell_call => $shell_call, error => $! }, code => 2, file => $THIS_FILE, line => __LINE__});
while (<$file_handle>)
{
chomp;
my $line = $_;
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "line", value1 => $line,
}, file => $THIS_FILE, line => __LINE__});

if ($line =~ /^.*?\s+(\d+)\s/)
{
my $pid = $1;
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "pid", value1 => $pid,
}, file => $THIS_FILE, line => __LINE__});

if ($pid ne $my_pid)
{
# Another copy is running.
$already_running = 1;
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "already_running", value1 => $already_running,
}, file => $THIS_FILE, line => __LINE__});

Expand All @@ -11405,7 +11409,7 @@ sub avoid_duplicate_run
my $current_time = time;
my $difference = $current_time - $last_updated;
### NOTE: Customer requested, move to 2 before v2.0 release
$an->Log->entry({log_level => 3, message_key => "an_variables_0004", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0004", message_variables => {
name1 => "last_updated", value1 => $last_updated,
name2 => "current_time", value2 => $current_time,
name3 => "difference", value3 => $difference,
Expand All @@ -11427,15 +11431,15 @@ sub avoid_duplicate_run

# Kill it
my $shell_call = $an->data->{path}{'kill'}." $pid";
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "shell_call", value1 => $shell_call,
}, file => $THIS_FILE, line => __LINE__});
open (my $file_handle, "$shell_call 2>&1 |") or $an->Alert->error({title_key => "an_0003", message_key => "error_title_0014", message_variables => { shell_call => $shell_call, error => $! }, code => 2, file => $THIS_FILE, line => __LINE__});
while(<$file_handle>)
{
chomp;
my $line = $_;
$an->Log->entry({log_level => 3, message_key => "an_variables_0001", message_variables => {
$an->Log->entry({log_level => 2, message_key => "an_variables_0001", message_variables => {
name1 => "line", value1 => $line,
}, file => $THIS_FILE, line => __LINE__});
}
Expand Down
8 changes: 4 additions & 4 deletions ScanCore/ScanCore.xml
Original file line number Diff line number Diff line change
Expand Up @@ -136,10 +136,10 @@ If you need any assistance, please feel free to contact #!string!scancore_brand_
<key name="scancore_log_0074">#!free!#</key>
<key name="scancore_log_0075">#!free!#</key>
<key name="scancore_log_0076">#!free!#</key>
<key name="scancore_log_0077">The power feeding this machine has entered a warning state.</key>
<key name="scancore_log_0078">The temperature of this machine has entered a warning state!</key>
<key name="scancore_log_0079">The power feeding this machine has entered a critical state.</key>
<key name="scancore_log_0080">The temperature of this machine has entered a critical state!</key>
<key name="scancore_log_0077">The power feeding this machine is in a warning state.</key>
<key name="scancore_log_0078">The temperature of this machine is in a warning state!</key>
<key name="scancore_log_0079">The power feeding this machine is in a critical state.</key>
<key name="scancore_log_0080">The temperature of this machine is in a critical state!</key>
<key name="scancore_log_0081">The power feeding this machine has returned to a healthy state.</key>
<key name="scancore_log_0082">The temperature of this machine has returned to a healthy state!</key>
<key name="scancore_log_0083">#!string!scancore_brand_0001!# was invoked with '--one-shot' and the run is finished, exiting.</key>
Expand Down
8 changes: 7 additions & 1 deletion ScanCore/agents/scan-apc-ups/scan-apc-ups
Original file line number Diff line number Diff line change
Expand Up @@ -4943,11 +4943,17 @@ sub gather_ups_data
{
# This is a new loss of comms.
$an->Log->entry({log_level => 1, message_key => "scan_apc_ups_message_0008", message_variables => { ups_name => $ups_name }, file => $THIS_FILE, line => __LINE__});
my $message_key = "scan_apc_ups_note_0059";
if ($an->data->{tools}{'anvil-kick-apc-ups'}{enabled})
{
# The UPS watchdog feature is enabled...
$message_key = "scan_apc_ups_note_0062";
}
$an->Alert->register_alert({
alert_level => "warning",
alert_agent_name => $THIS_FILE,
alert_title_key => "an_alert_title_0004",
alert_message_key => "scan_apc_ups_note_0059",
alert_message_key => $message_key,
alert_message_variables => {
ups_name => $ups_name,
},
Expand Down
7 changes: 7 additions & 0 deletions ScanCore/agents/scan-apc-ups/scan-apc-ups.xml
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,13 @@ Communication with the UPS: [#!variable!ups_name!#] has been restored.
The UPS: [#!variable!ups_name!#] battery health has changed:
- [#!variable!old_value!#] -> [#!variable!new_value!#]
</key>
<key name="scan_apc_ups_note_0062">
Communication with the UPS: [#!variable!ups_name!#] has been lost!
Warning: The UPS watchdog feature is enabled! If the UPS was counting down
when the connection was lost, it might be about to power cycle.
If communication is lost to both/all UPSes, all power could soon be
lost and the full Anvil! stack will hard power cycle!
</key>

<!-- Errors -->
<key name="scan_apc_ups_error_0001">The variable: [#!variable!name!#] should have been an integer, but it appears it was not. Read: [#!variable!value!#].</key>
Expand Down
2 changes: 2 additions & 0 deletions striker.conf
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,8 @@ tools::anvil-kick-apc-ups::enabled = 0
#
# If the timer runs out and the UPS shuts down, this controls how long the UPS
# "sleeps" for before turning back on. The default is '300' (5 minutes).
# NOTE: Values under 300 seem to cause display issues on some SMT1500-model
# UPSes.
#sys::apc::ups::sleep_time = 300
#
# The kick script is started once a minute. If you want to reduce the frequency
Expand Down

0 comments on commit 20f2589

Please sign in to comment.