Skip to content

Commit

Permalink
zed: Control NVMe fault LEDs
Browse files Browse the repository at this point in the history
The ZED code currently can only turn on the fault LED for
a faulted disk in a JBOD enclosure.  This extends support
for faulted NVMe disks as well.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tony Hutter <hutter2@llnl.gov>
Closes #12648
Closes #12695
  • Loading branch information
tonyhutter authored Nov 10, 2021
1 parent e39fe05 commit ae70d62
Show file tree
Hide file tree
Showing 5 changed files with 229 additions and 11 deletions.
73 changes: 68 additions & 5 deletions cmd/zed/zed.d/statechange-led.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@
[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
. "${ZED_ZEDLET_DIR}/zed-functions.sh"

if [ ! -d /sys/class/enclosure ] ; then
if [ ! -d /sys/class/enclosure ] && [ ! -d /sys/bus/pci/slots ] ; then
# No JBOD enclosure or NVMe slots
exit 1
fi

Expand Down Expand Up @@ -92,6 +93,29 @@ check_and_set_led()
done
}

# Fault LEDs for JBODs and NVMe drives are handled a little differently.
#
# On JBODs the fault LED is called 'fault' and on a path like this:
#
# /sys/class/enclosure/0:0:1:0/SLOT 10/fault
#
# On NVMe it's called 'attention' and on a path like this:
#
# /sys/bus/pci/slot/0/attention
#
# This function returns the full path to the fault LED file for a given
# enclosure/slot directory.
#
path_to_led()
{
dir=$1
if [ -f "$dir/fault" ] ; then
echo "$dir/fault"
elif [ -f "$dir/attention" ] ; then
echo "$dir/attention"
fi
}

state_to_val()
{
state="$1"
Expand All @@ -105,6 +129,38 @@ state_to_val()
esac
}

#
# Given a nvme name like 'nvme0n1', pass back its slot directory
# like "/sys/bus/pci/slots/0"
#
nvme_dev_to_slot()
{
dev="$1"

# Get the address "0000:01:00.0"
address=$(cat "/sys/class/block/$dev/device/address")

# For each /sys/bus/pci/slots subdir that is an actual number
# (rather than weird directories like "1-3/").
# shellcheck disable=SC2010
for i in $(ls /sys/bus/pci/slots/ | grep -E "^[0-9]+$") ; do
this_address=$(cat "/sys/bus/pci/slots/$i/address")

# The format of address is a little different between
# /sys/class/block/$dev/device/address and
# /sys/bus/pci/slots/
#
# address= "0000:01:00.0"
# this_address = "0000:01:00"
#
if echo "$address" | grep -Eq ^"$this_address" ; then
echo "/sys/bus/pci/slots/$i"
break
fi
done
}


# process_pool (pool)
#
# Iterate through a pool and set the vdevs' enclosure slot LEDs to
Expand Down Expand Up @@ -134,6 +190,11 @@ process_pool()
# Get dev name (like 'sda')
dev=$(basename "$(echo "$therest" | awk '{print $(NF-1)}')")
vdev_enc_sysfs_path=$(realpath "/sys/class/block/$dev/device/enclosure_device"*)
if [ ! -d "$vdev_enc_sysfs_path" ] ; then
# This is not a JBOD disk, but it could be a PCI NVMe drive
vdev_enc_sysfs_path=$(nvme_dev_to_slot "$dev")
fi

current_val=$(echo "$therest" | awk '{print $NF}')

if [ "$current_val" != "0" ] ; then
Expand All @@ -145,9 +206,10 @@ process_pool()
continue
fi

if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then
led_path=$(path_to_led "$vdev_enc_sysfs_path")
if [ ! -e "$led_path" ] ; then
rc=3
zed_log_msg "vdev $vdev '$file/fault' doesn't exist"
zed_log_msg "vdev $vdev '$led_path' doesn't exist"
continue
fi

Expand All @@ -158,7 +220,7 @@ process_pool()
continue
fi

if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then
if ! check_and_set_led "$led_path" "$val"; then
rc=3
fi
done
Expand All @@ -169,7 +231,8 @@ if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; the
# Got a statechange for an individual vdev
val=$(state_to_val "$ZEVENT_VDEV_STATE_STR")
vdev=$(basename "$ZEVENT_VDEV_PATH")
check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val"
ledpath=$(path_to_led "$ZEVENT_VDEV_ENC_SYSFS_PATH")
check_and_set_led "$ledpath" "$val"
else
# Process the entire pool
poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID")
Expand Down
4 changes: 2 additions & 2 deletions cmd/zed/zed.d/zed.rc
Original file line number Diff line number Diff line change
Expand Up @@ -106,8 +106,8 @@

##
# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for
# device mapper and multipath devices as well. Your enclosure must be
# supported by the Linux SES driver for this to work.
# device mapper and multipath devices as well. This works with JBOD enclosures
# and NVMe PCI drives (assuming they're supported by Linux in sysfs).
#
ZED_USE_ENCLOSURE_LEDS=1

Expand Down
8 changes: 7 additions & 1 deletion cmd/zpool/zpool.d/ses
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,13 @@ for i in $scripts ; do
val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null)
;;
fault_led)
val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null)
# JBODs fault LED is called 'fault', NVMe fault LED is called
# 'attention'.
if [ -f "$VDEV_ENC_SYSFS_PATH/fault" ] ; then
val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null)
elif [ -f "$VDEV_ENC_SYSFS_PATH/attention" ] ; then
val=$(cat "$VDEV_ENC_SYSFS_PATH/attention" 2>/dev/null)
fi
;;
locate_led)
val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null)
Expand Down
146 changes: 143 additions & 3 deletions lib/libzutil/os/linux/zutil_device_path_os.c
Original file line number Diff line number Diff line change
Expand Up @@ -154,18 +154,148 @@ zfs_strip_path(char *path)
return (strrchr(path, '/') + 1);
}

/*
* Read the contents of a sysfs file into an allocated buffer and remove the
* last newline.
*
* This is useful for reading sysfs files that return a single string. Return
* an allocated string pointer on success, NULL otherwise. Returned buffer
* must be freed by the user.
*/
static char *
zfs_read_sysfs_file(char *filepath)
{
char buf[4096]; /* all sysfs files report 4k size */
char *str = NULL;

FILE *fp = fopen(filepath, "r");
if (fp == NULL) {
return (NULL);
}
if (fgets(buf, sizeof (buf), fp) == buf) {
/* success */

/* Remove the last newline (if any) */
size_t len = strlen(buf);
if (buf[len - 1] == '\n') {
buf[len - 1] = '\0';
}
str = strdup(buf);
}

fclose(fp);

return (str);
}

/*
* Given a dev name like "nvme0n1", return the full PCI slot sysfs path to
* the drive (in /sys/bus/pci/slots).
*
* For example:
* dev: "nvme0n1"
* returns: "/sys/bus/pci/slots/0"
*
* 'dev' must be an NVMe device.
*
* Returned string must be freed. Returns NULL on error or no sysfs path.
*/
static char *
zfs_get_pci_slots_sys_path(const char *dev_name)
{
DIR *dp = NULL;
struct dirent *ep;
char *address1 = NULL;
char *address2 = NULL;
char *path = NULL;
char buf[MAXPATHLEN];
char *tmp;

/* If they preface 'dev' with a path (like "/dev") then strip it off */
tmp = strrchr(dev_name, '/');
if (tmp != NULL)
dev_name = tmp + 1; /* +1 since we want the chr after '/' */

if (strncmp("nvme", dev_name, 4) != 0)
return (NULL);

(void) snprintf(buf, sizeof (buf), "/sys/block/%s/device/address",
dev_name);

address1 = zfs_read_sysfs_file(buf);
if (!address1)
return (NULL);

/*
* /sys/block/nvme0n1/device/address format will
* be "0000:01:00.0" while /sys/bus/pci/slots/0/address will be
* "0000:01:00". Just NULL terminate at the '.' so they match.
*/
tmp = strrchr(address1, '.');
if (tmp != NULL)
*tmp = '\0';

dp = opendir("/sys/bus/pci/slots/");
if (dp == NULL) {
free(address1);
return (NULL);
}

/*
* Look through all the /sys/bus/pci/slots/ subdirs
*/
while ((ep = readdir(dp))) {
/*
* We only care about directory names that are a single number.
* Sometimes there's other directories like
* "/sys/bus/pci/slots/0-3/" in there - skip those.
*/
if (!zfs_isnumber(ep->d_name))
continue;

(void) snprintf(buf, sizeof (buf),
"/sys/bus/pci/slots/%s/address", ep->d_name);

address2 = zfs_read_sysfs_file(buf);
if (!address2)
continue;

if (strcmp(address1, address2) == 0) {
/* Addresses match, we're all done */
free(address2);
if (asprintf(&path, "/sys/bus/pci/slots/%s",
ep->d_name) == -1) {
free(tmp);
continue;
}
break;
}
free(address2);
}

closedir(dp);
free(address1);

return (path);
}

/*
* Given a dev name like "sda", return the full enclosure sysfs path to
* the disk. You can also pass in the name with "/dev" prepended
* to it (like /dev/sda).
* to it (like /dev/sda). This works for both JBODs and NVMe PCI devices.
*
* For example, disk "sda" in enclosure slot 1:
* dev: "sda"
* dev_name: "sda"
* returns: "/sys/class/enclosure/1:0:3:0/Slot 1"
*
* Or:
*
* dev_name: "nvme0n1"
* returns: "/sys/bus/pci/slots/0"
*
* 'dev' must be a non-devicemapper device.
*
* Returned string must be freed.
* Returned string must be freed. Returns NULL on error.
*/
char *
zfs_get_enclosure_sysfs_path(const char *dev_name)
Expand Down Expand Up @@ -252,6 +382,16 @@ zfs_get_enclosure_sysfs_path(const char *dev_name)
if (dp != NULL)
closedir(dp);

if (!path) {
/*
* This particular disk isn't in a JBOD. It could be an NVMe
* drive. If so, look up the NVMe device's path in
* /sys/bus/pci/slots/. Within that directory is a 'attention'
* file which controls the NVMe fault LED.
*/
path = zfs_get_pci_slots_sys_path(dev_name);
}

return (path);
}

Expand Down
9 changes: 9 additions & 0 deletions lib/libzutil/zutil_nicenum.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <math.h>
#include <stdio.h>
#include <libzutil.h>
#include <string.h>

/*
* Return B_TRUE if "str" is a number string, B_FALSE otherwise.
Expand All @@ -42,6 +43,14 @@ zfs_isnumber(const char *str)
if (!(isdigit(*str) || (*str == '.')))
return (B_FALSE);

/*
* Numbers should not end with a period ("." ".." or "5." are
* not valid)
*/
if (str[strlen(str) - 1] == '.') {
return (B_FALSE);
}

return (B_TRUE);
}

Expand Down

0 comments on commit ae70d62

Please sign in to comment.