Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SMART self-test results to zpool status -c #7178

Merged
merged 1 commit into from
Feb 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 12 additions & 2 deletions cmd/zpool/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,12 @@ dist_zpoolexec_SCRIPTS = \
zpool.d/nvme_err \
zpool.d/pwr_cyc \
zpool.d/upath \
zpool.d/vendor
zpool.d/vendor \
zpool.d/smart_test \
zpool.d/test_type \
zpool.d/test_status \
zpool.d/test_progress \
zpool.d/test_ended

zpoolconfdefaults = \
enc \
Expand Down Expand Up @@ -101,7 +106,12 @@ zpoolconfdefaults = \
nvme_err \
pwr_cyc \
upath \
vendor
vendor \
smart_test \
test_type \
test_status \
test_progress \
test_ended

install-data-hook:
$(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)"
Expand Down
132 changes: 116 additions & 16 deletions cmd/zpool/zpool.d/smart
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,44 @@ ata_err: Show SMART ATA errors (ATA).
pwr_cyc: Show SMART power cycle count (ATA).
serial: Show disk serial number.
nvme_err: Show SMART NVMe errors (NVMe).
smart_test: Show SMART self-test results summary.
test_type: Show SMART self-test type (short, long... ).
test_status: Show SMART self-test status.
test_progress: Show SMART self-test percentage done.
test_ended: Show when the last SMART self-test ended (if supported).
"

# Hack for developer testing
#
# If you set $samples to a directory containing smartctl output text files,
# we will use them instead of running smartctl on the vdevs. This can be
# useful if you want to test a bunch of different smartctl outputs. Also, if
# $samples is set, and additional 'file' column is added to the zpool output
# showing the filename.
samples=

# get_filename_from_dir DIR
#
# Look in directory DIR and return a filename from it. The filename returned
# is chosen quasi-sequentially (based off our PID). This allows us to return
# a different filename every time this script is invoked (which we do for each
# vdev), without having to maintain state.
get_filename_from_dir()
{
dir=$1
pid="$$"
num_files=$(find "$dir" -maxdepth 1 -type f | wc -l)
mod=$((pid % num_files))
i=0
find "$dir" -type f -printf "%f\n" | while read -r file ; do
if [ "$mod" = "$i" ] ; then
echo "$file"
break
fi
i=$((i+1))
done
}

script=$(basename "$0")

if [ "$1" = "-h" ] ; then
Expand All @@ -35,8 +71,16 @@ fi

smartctl_path=$(which smartctl)

if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then
if [ -n "$samples" ] ; then
# cat a smartctl output text file instead of running smartctl
# on a vdev (only used for developer testing).
file=$(get_filename_from_dir $samples)
echo "file=$file"
raw_out=$(cat "$samples/$file")
else
raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH")
fi

# What kind of drive are we? Look for the right line in smartctl:
#
Expand All @@ -49,7 +93,6 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
# NVMe:
# SMART/Health Information (NVMe Log 0xnn, NSID 0xnn)
#
type=$(echo "$raw_out" | grep -m 1 -Eo '^ATA|NVMe|SAS$')
out=$(echo "$raw_out" | awk '
# SAS specific
/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8}
Expand All @@ -58,10 +101,11 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
/Elements in grown defect list/{print "defect="$6}

# SAS common
/SAS/{type="sas"}
/Drive Temperature:/{print "temp="$4}
# Status can be a long string, substitute spaces for '_'
/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i}
/number of hours powered up/{print "hours_on="$7}
/number of hours powered up/{print "hours_on="$7; hours_on=int($7)}
/Serial number:/{print "serial="$3}

# SATA specific
Expand All @@ -74,13 +118,16 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
/Power_Cycle_Count/{print "pwr_cyc="$10}

# SATA common
/SATA/{type="sata"}
/Temperature_Celsius/{print "temp="$10}
/Airflow_Temperature_Cel/{print "temp="$10}
/Current Temperature:/{print "temp="$3}
/SMART overall-health self-assessment test result:/{print "health="$6}
/Power_On_Hours/{print "hours_on="$10}
/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)}
/Serial Number:/{print "serial="$3}

# NVMe common
/NVMe/{type="nvme"}
/Temperature:/{print "temp="$2}
/SMART overall-health self-assessment test result:/{print "health="$6}
/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4}
Expand All @@ -90,39 +137,92 @@ if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ]; then
# NVMe specific
/Media and Data Integrity Errors:/{print "nvme_err="$6}

END {ORS="\n"; print ""}
# SMART self-test info
/Self-test execution status:/{progress=tolower($4)} # SAS
/SMART Self-test log/{test_seen=1} # SAS
/SMART Extended Self-test Log/{test_seen=1} # SATA
/# 1/{
test_type=tolower($3"_"$4);
# Status could be one word ("Completed") or multiple ("Completed: read
# failure"). Look for the ":" to see if we need to grab more words.

if ($5 ~ ":")
status=tolower($5""$6"_"$7)
else
status=tolower($5)
if (status=="self")
status="running";

if (type == "sas") {
hours=int($(NF-4))
} else {
hours=int($(NF-1))
# SATA reports percent remaining, rather than percent done
# Convert it to percent done.
progress=(100-int($(NF-2)))"%"
}
# When we int()-ify "hours", it converts stuff like "NOW" and "-" into
# 0. In those cases, set it to hours_on, so they will cancel out in
# the "hours_ago" calculation later on.
if (hours == 0)
hours=hours_on

if (test_seen) {
print "test="hours_on
print "test_type="test_type
print "test_status="status
print "test_progress="progress
}
# Not all drives report hours_on
if (hours_on && hours) {
total_hours_ago=(hours_on-hours)
days_ago=int(total_hours_ago/24)
hours_ago=(total_hours_ago % 24)
if (days_ago != 0)
ago_str=days_ago"d"
if (hours_ago !=0)
ago_str=ago_str""hours_ago"h"
print "test_ended="ago_str
}
}

END {print "type="type; ORS="\n"; print ""}
');
fi
type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2)

# if type is not set by now, either we don't have a block device
# or smartctl failed. Either way, default to ATA and set out to
# nothing
# If type is not set by now, either we don't have a block device
# or smartctl failed. Either way, default to ATA and set $out to
# nothing.
if [ -z "$type" ]; then
type="ATA"
type="sata"
out=
fi

case $script in
smart)
# Print temperature plus common predictors of drive failure
if [ "$type" = "SAS" ] ; then
if [ "$type" = "sas" ] ; then
scripts="temp|health|r_ucor|w_ucor"
elif [ "$type" = "ATA" ] ; then
elif [ "$type" = "sata" ] ; then
scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor"
elif [ "$type" = "NVMe" ] ; then
elif [ "$type" = "nvme" ] ; then
scripts="temp|health|nvme_err"
fi
;;
smartx)
# Print some other interesting stats
if [ "$type" = "SAS" ] ; then
if [ "$type" = "sas" ] ; then
scripts="hours_on|defect|nonmed|r_proc|w_proc"
elif [ "$type" = "ATA" ] ; then
elif [ "$type" = "sata" ] ; then
scripts="hours_on|pwr_cyc"
elif [ "$type" = "NVMe" ] ; then
elif [ "$type" = "nvme" ] ; then
scripts="hours_on|pwr_cyc"
fi
;;
smart_test)
scripts="test_type|test_status|test_progress|test_ended"
;;
*)
scripts="$script"
esac
Expand Down
1 change: 1 addition & 0 deletions cmd/zpool/zpool.d/smart_test
1 change: 1 addition & 0 deletions cmd/zpool/zpool.d/test_ended
1 change: 1 addition & 0 deletions cmd/zpool/zpool.d/test_progress
1 change: 1 addition & 0 deletions cmd/zpool/zpool.d/test_status
1 change: 1 addition & 0 deletions cmd/zpool/zpool.d/test_type