diff --git a/.github/suppressions.txt b/.github/suppressions.txt index f9508a24b4ad..b28514e674b5 100644 --- a/.github/suppressions.txt +++ b/.github/suppressions.txt @@ -1,3 +1,6 @@ preprocessorErrorDirective:./module/zfs/vdev_raidz_math_avx512f.c:243 preprocessorErrorDirective:./module/zfs/vdev_raidz_math_sse2.c:266 - +uninitvar:module/os/freebsd/zfs/vdev_geom.c +uninitvar:module/os/freebsd/zfs/zfs_vfsops.c +uninitvar:module/os/freebsd/spl/spl_zone.c +uninitvar:lib/libzutil/os/freebsd/zutil_import_os.c diff --git a/.gitignore b/.gitignore index 57867bfc6eab..056bbb8f08c9 100644 --- a/.gitignore +++ b/.gitignore @@ -62,4 +62,9 @@ cscope.* *.patch *.orig *.log +*.tmp venv + +*.so +*.so.debug +*.so.full diff --git a/Makefile.am b/Makefile.am index 4c0b541ccd4d..101b38ac335b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -104,8 +104,9 @@ commitcheck: fi cstyle: - @find ${top_srcdir} -name build -prune -o -name '*.[hc]' \ - ! -name 'zfs_config.*' ! -name '*.mod.c' -type f \ + @find ${top_srcdir} -name build -prune -o -type f -name '*.[hc]' \ + ! -name 'zfs_config.*' ! -name '*.mod.c' \ + ! -name 'opt_global.h' ! -name '*_if*.h' \ -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} \+ filter_executable = -exec test -x '{}' \; -print diff --git a/README.md b/README.md index ff8a0e851305..9c6ed7523362 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,6 @@ This repository contains the code for running OpenZFS on Linux and FreeBSD. Full documentation for installing OpenZFS on your favorite Linux distribution can be found at the [ZoL Site](https://zfsonlinux.org/). -FreeBSD support is a work in progress. See the [PR](https://github.com/openzfs/zfs/pull/8987). - # Contribute & Develop We have a separate document with [contribution guidelines](./.github/CONTRIBUTING.md). @@ -34,3 +32,4 @@ For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197` # Supported Kernels * The `META` file contains the officially recognized supported Linux kernel versions. 
+ * Supported FreeBSD versions are 12-STABLE and 13-CURRENT. diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 2078bc13b3c1..6b152e848e2e 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -1,10 +1,10 @@ SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest -SUBDIRS += fsck_zfs vdev_id raidz_test zgenhostid +SUBDIRS += fsck_zfs vdev_id raidz_test if USING_PYTHON SUBDIRS += arcstat arc_summary dbufstat endif if BUILD_LINUX -SUBDIRS += mount_zfs zed zvol_id zvol_wait +SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait endif diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am index b9e221c1f7cc..7b25726f498e 100644 --- a/cmd/zpool/Makefile.am +++ b/cmd/zpool/Makefile.am @@ -11,6 +11,10 @@ zpool_SOURCES = \ zpool_util.h \ zpool_vdev.c +if BUILD_FREEBSD +zpool_SOURCES += os/freebsd/zpool_vdev_os.c +endif + if BUILD_LINUX zpool_SOURCES += os/linux/zpool_vdev_os.c endif @@ -20,6 +24,9 @@ zpool_LDADD = \ $(top_builddir)/lib/libuutil/libuutil.la \ $(top_builddir)/lib/libzfs/libzfs.la +if BUILD_FREEBSD +zpool_LDADD += -L/usr/local/lib -lintl -lgeom +endif zpool_LDADD += -lm $(LIBBLKID) zpoolconfdir = $(sysconfdir)/zfs/zpool.d diff --git a/cmd/zpool/os/freebsd/zpool_vdev_os.c b/cmd/zpool/os/freebsd/zpool_vdev_os.c new file mode 100644 index 000000000000..4a8d9272d309 --- /dev/null +++ b/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov . + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Using libdiskmgt, makes sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zpool_util.h" +#include + +int +check_device(const char *name, boolean_t force, boolean_t isspare, + boolean_t iswholedisk) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0) + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof (path)); + + return (check_file(path, force, isspare)); +} + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + return (0); +} + +void +zpool_vdev_enable_file(struct stat64 *statbuf, boolean_t *wholedisk) +{ + if (S_ISCHR(statbuf->st_mode)) { + statbuf->st_mode &= ~S_IFCHR; + statbuf->st_mode |= S_IFBLK; + *wholedisk = B_FALSE; + } +} diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index bb49211dc8e9..a11fdd33c566 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -941,6 +941,10 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) if (fd == -1) { if (errno == EBUSY) is_exclusive = 1; +#ifdef __FreeBSD__ + if (errno == EPERM) + is_exclusive = 1; +#endif } else { (void) close(fd); } diff --git a/config/Rules.am b/config/Rules.am index 83fbf4ca0520..168cecea2cfc 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -14,26 +14,48 @@ DEFAULT_INCLUDES += \ -I$(top_srcdir)/lib/libspl/include/os/linux endif +if BUILD_FREEBSD +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/lib/libspl/include/os/freebsd +endif + AM_LIBTOOLFLAGS = --silent AM_CFLAGS = -std=gnu99 -Wall -Wstrict-prototypes -fno-strict-aliasing AM_CFLAGS += $(NO_OMIT_FRAME_POINTER) AM_CFLAGS += $(DEBUG_CFLAGS) AM_CFLAGS += $(ASAN_CFLAGS) -AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) +AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) $(NO_FORMAT_ZERO_LENGTH) +if BUILD_FREEBSD +AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion +AM_CFLAGS += -include 
$(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h +AM_CFLAGS += -I/usr/include -I/usr/local/include +AM_CFLAGS += -D_MACHINE_ENDIAN_H_ +endif AM_CPPFLAGS = -D_GNU_SOURCE AM_CPPFLAGS += -D_REENTRANT AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 AM_CPPFLAGS += -D_LARGEFILE64_SOURCE AM_CPPFLAGS += -DHAVE_LARGE_STACKS=1 -AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-linux-user\" AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) +if BUILD_LINUX +AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-linux-user\" +endif +if BUILD_FREEBSD +AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-freebsd-user\" +endif AM_LDFLAGS = $(DEBUG_LDFLAGS) AM_LDFLAGS += $(ASAN_LDFLAGS) + +if BUILD_FREEBSD +AM_LDFLAGS += -fstack-protector-strong -shared +AM_LDFLAGS += -Wl,-x -Wl,--fatal-warnings -Wl,--warn-shared-textrel +AM_LDFLAGS += -lm +endif diff --git a/config/always-arch.m4 b/config/always-arch.m4 index eb8839b97d7b..25e8c963a4b4 100644 --- a/config/always-arch.m4 +++ b/config/always-arch.m4 @@ -17,7 +17,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [ i?86) TARGET_CPU=i386 ;; - x86_64) + amd64|x86_64) TARGET_CPU=x86_64 ;; powerpc*) diff --git a/config/always-compiler-options.m4 b/config/always-compiler-options.m4 index ca8b6bfccd1d..a84123317989 100644 --- a/config/always-compiler-options.m4 +++ b/config/always-compiler-options.m4 @@ -87,6 +87,27 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION], [ AC_SUBST([NO_FORMAT_TRUNCATION]) ]) +dnl # +dnl # Check if gcc supports -Wno-format-zero-length option. 
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH], [ + AC_MSG_CHECKING([whether $CC supports -Wno-format-zero-length]) + + saved_flags="$CFLAGS" + CFLAGS="$CFLAGS -Werror -Wno-format-zero-length" + + AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [ + NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length + AC_MSG_RESULT([yes]) + ], [ + NO_FORMAT_ZERO_LENGTH= + AC_MSG_RESULT([no]) + ]) + + CFLAGS="$saved_flags" + AC_SUBST([NO_FORMAT_ZERO_LENGTH]) +]) + dnl # dnl # Check if gcc supports -Wno-bool-compare option. diff --git a/config/kernel.m4 b/config/kernel.m4 index c29de349418e..8cbf4aee9899 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -2,29 +2,31 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ - dnl # Setup the kernel build environment. - ZFS_AC_KERNEL - ZFS_AC_QAT - - dnl # Sanity checks for module building and CONFIG_* defines - ZFS_AC_KERNEL_TEST_MODULE - ZFS_AC_KERNEL_CONFIG_DEFINED - - dnl # Sequential ZFS_LINUX_TRY_COMPILE tests - ZFS_AC_KERNEL_FPU_HEADER - ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T - ZFS_AC_KERNEL_MISC_MINOR - ZFS_AC_KERNEL_DECLARE_EVENT_CLASS - - dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests - ZFS_AC_KERNEL_TEST_SRC - ZFS_AC_KERNEL_TEST_RESULT - - AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ - KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" - ]) + AM_COND_IF([BUILD_LINUX], [ + dnl # Setup the kernel build environment. 
+ ZFS_AC_KERNEL + ZFS_AC_QAT + + dnl # Sanity checks for module building and CONFIG_* defines + ZFS_AC_KERNEL_TEST_MODULE + ZFS_AC_KERNEL_CONFIG_DEFINED + + dnl # Sequential ZFS_LINUX_TRY_COMPILE tests + ZFS_AC_KERNEL_FPU_HEADER + ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T + ZFS_AC_KERNEL_MISC_MINOR + ZFS_AC_KERNEL_DECLARE_EVENT_CLASS + + dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests + ZFS_AC_KERNEL_TEST_SRC + ZFS_AC_KERNEL_TEST_RESULT + + AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ + KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" + ]) - AC_SUBST(KERNEL_MAKE) + AC_SUBST(KERNEL_MAKE) + ]) ]) dnl # diff --git a/config/toolchain-simd.m4 b/config/toolchain-simd.m4 index e86eb7f17a0d..1153cd6941a8 100644 --- a/config/toolchain-simd.m4 +++ b/config/toolchain-simd.m4 @@ -3,7 +3,7 @@ dnl # Checks if host toolchain supports SIMD instructions dnl # AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ case "$host_cpu" in - x86_64 | x86 | i686) + amd64 | x86_64 | x86 | i686) ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2 ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3 diff --git a/config/user.m4 b/config/user.m4 index 3d97e9a418c3..b69412fda1e2 100644 --- a/config/user.m4 +++ b/config/user.m4 @@ -4,14 +4,16 @@ dnl # AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_GETTEXT ZFS_AC_CONFIG_USER_MOUNT_HELPER - ZFS_AC_CONFIG_USER_UDEV - ZFS_AC_CONFIG_USER_SYSTEMD ZFS_AC_CONFIG_USER_SYSVINIT ZFS_AC_CONFIG_USER_DRACUT ZFS_AC_CONFIG_USER_ZLIB - ZFS_AC_CONFIG_USER_LIBUUID + AM_COND_IF([BUILD_LINUX], [ + ZFS_AC_CONFIG_USER_UDEV + ZFS_AC_CONFIG_USER_SYSTEMD + ZFS_AC_CONFIG_USER_LIBUUID + ZFS_AC_CONFIG_USER_LIBBLKID + ]) ZFS_AC_CONFIG_USER_LIBTIRPC - ZFS_AC_CONFIG_USER_LIBBLKID ZFS_AC_CONFIG_USER_LIBUDEV ZFS_AC_CONFIG_USER_LIBSSL ZFS_AC_CONFIG_USER_LIBAIO @@ -19,10 +21,9 @@ AC_DEFUN([ZFS_AC_CONFIG_USER], [ ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV ZFS_AC_CONFIG_USER_ZFSEXEC - ZFS_AC_TEST_FRAMEWORK - AC_CHECK_FUNCS([mlockall strlcat 
strlcpy]) + AC_CHECK_FUNCS([issetugid mlockall strlcat strlcpy]) ]) dnl # diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 index 2ee9b8eb9420..016c0fc09537 100644 --- a/config/zfs-build.m4 +++ b/config/zfs-build.m4 @@ -157,6 +157,7 @@ AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION + ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA ZFS_AC_CONFIG_ALWAYS_CC_ASAN @@ -173,13 +174,6 @@ AC_DEFUN([ZFS_AC_CONFIG], [ dnl # Remove the previous build test directory. rm -Rf build - AC_ARG_VAR([TEST_JOBS], - [simultaneous jobs during configure (defaults to $(nproc))]) - if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then - TEST_JOBS=$(nproc) - fi - AC_SUBST(TEST_JOBS) - ZFS_CONFIG=all AC_ARG_WITH([config], AS_HELP_STRING([--with-config=CONFIG], @@ -197,6 +191,16 @@ AC_DEFUN([ZFS_AC_CONFIG], [ ZFS_AC_CONFIG_ALWAYS + + AM_COND_IF([BUILD_LINUX], [ + AC_ARG_VAR([TEST_JOBS], + [simultaneous jobs during configure (defaults to $(nproc))]) + if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then + TEST_JOBS=$(nproc) + fi + AC_SUBST(TEST_JOBS) + ]) + case "$ZFS_CONFIG" in kernel) ZFS_AC_CONFIG_KERNEL ;; user) ZFS_AC_CONFIG_USER ;; @@ -405,7 +409,7 @@ dnl # Using the VENDOR tag from config.guess set the default dnl # package type for 'make pkg': (rpm | deb | tgz) dnl # AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ - AC_MSG_CHECKING([linux distribution]) + AC_MSG_CHECKING([os distribution]) if test -f /etc/toss-release ; then VENDOR=toss ; elif test -f /etc/fedora-release ; then @@ -428,6 +432,8 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ VENDOR=debian ; elif test -f /etc/alpine-release ; then VENDOR=alpine ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; else VENDOR= ; fi @@ -447,13 +453,17 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ lunar) DEFAULT_PACKAGE=tgz ;; ubuntu) DEFAULT_PACKAGE=deb ;; 
debian) DEFAULT_PACKAGE=deb ;; + freebsd) DEFAULT_PACKAGE=pkg ;; *) DEFAULT_PACKAGE=rpm ;; esac AC_MSG_RESULT([$DEFAULT_PACKAGE]) AC_SUBST(DEFAULT_PACKAGE) - DEFAULT_INIT_DIR=$sysconfdir/init.d AC_MSG_CHECKING([default init directory]) + case "$VENDOR" in + freebsd) DEFAULT_INIT_DIR=$sysconfdir/rc.d ;; + *) DEFAULT_INIT_DIR=$sysconfdir/init.d;; + esac AC_MSG_RESULT([$DEFAULT_INIT_DIR]) AC_SUBST(DEFAULT_INIT_DIR) @@ -470,6 +480,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ lunar) DEFAULT_INIT_SCRIPT=lunar ;; ubuntu) DEFAULT_INIT_SCRIPT=lsb ;; debian) DEFAULT_INIT_SCRIPT=lsb ;; + freebsd) DEFAULT_INIT_SCRIPT=freebsd;; *) DEFAULT_INIT_SCRIPT=lsb ;; esac AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT]) @@ -485,6 +496,7 @@ AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ sles) DEFAULT_INITCONF_DIR=/etc/sysconfig ;; ubuntu) DEFAULT_INITCONF_DIR=/etc/default ;; debian) DEFAULT_INITCONF_DIR=/etc/default ;; + freebsd) DEFAULT_INITCONF_DIR=$sysconfdir/rc.conf.d;; *) DEFAULT_INITCONF_DIR=/etc/default ;; esac AC_MSG_RESULT([$DEFAULT_INITCONF_DIR]) @@ -506,7 +518,9 @@ dnl # Default ZFS package configuration dnl # AC_DEFUN([ZFS_AC_PACKAGE], [ ZFS_AC_DEFAULT_PACKAGE - ZFS_AC_RPM - ZFS_AC_DPKG - ZFS_AC_ALIEN + AS_IF([test x$VENDOR != xfreebsd], [ + ZFS_AC_RPM + ZFS_AC_DPKG + ZFS_AC_ALIEN + ]) ]) diff --git a/configure.ac b/configure.ac index 7522940d2c32..902108f3649d 100644 --- a/configure.ac +++ b/configure.ac @@ -109,6 +109,14 @@ AC_CONFIG_FILES([ etc/zfs/Makefile include/Makefile include/os/Makefile + include/os/freebsd/Makefile + include/os/freebsd/linux/Makefile + include/os/freebsd/spl/Makefile + include/os/freebsd/spl/acl/Makefile + include/os/freebsd/spl/rpc/Makefile + include/os/freebsd/spl/sys/Makefile + include/os/freebsd/zfs/Makefile + include/os/freebsd/zfs/sys/Makefile include/os/linux/Makefile include/os/linux/kernel/Makefile include/os/linux/kernel/linux/Makefile @@ -138,6 +146,8 @@ AC_CONFIG_FILES([ lib/libspl/include/ia32/Makefile lib/libspl/include/ia32/sys/Makefile 
lib/libspl/include/os/Makefile + lib/libspl/include/os/freebsd/Makefile + lib/libspl/include/os/freebsd/sys/Makefile lib/libspl/include/os/linux/Makefile lib/libspl/include/os/linux/sys/Makefile lib/libspl/include/rpc/Makefile diff --git a/contrib/Makefile.am b/contrib/Makefile.am index 9f34fd8354d8..1486b28d3cda 100644 --- a/contrib/Makefile.am +++ b/contrib/Makefile.am @@ -1,2 +1,5 @@ -SUBDIRS = bash_completion.d bpftrace dracut initramfs pyzfs zcp +SUBDIRS = bash_completion.d pyzfs zcp +if BUILD_LINUX +SUBDIRS += bpftrace dracut initramfs +endif DIST_SUBDIRS = bash_completion.d bpftrace dracut initramfs pyzfs zcp diff --git a/etc/Makefile.am b/etc/Makefile.am index 67ef94a2017b..ac71da9445d8 100644 --- a/etc/Makefile.am +++ b/etc/Makefile.am @@ -1,2 +1,5 @@ -SUBDIRS = default zfs sudoers.d $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) +SUBDIRS = zfs sudoers.d +if BUILD_LINUX +SUBDIRS += default $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) +endif DIST_SUBDIRS = default init.d zfs systemd modules-load.d sudoers.d diff --git a/include/os/Makefile.am b/include/os/Makefile.am index 09c0beec4757..7eab1abde984 100644 --- a/include/os/Makefile.am +++ b/include/os/Makefile.am @@ -1,3 +1,6 @@ if BUILD_LINUX SUBDIRS = linux endif +if BUILD_FREEBSD +SUBDIRS = freebsd +endif diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am new file mode 100644 index 000000000000..3c87d4a0e791 --- /dev/null +++ b/include/os/freebsd/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = linux spl zfs diff --git a/include/os/freebsd/linux/Makefile.am b/include/os/freebsd/linux/Makefile.am new file mode 100644 index 000000000000..936cf21319be --- /dev/null +++ b/include/os/freebsd/linux/Makefile.am @@ -0,0 +1,5 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/freebsd/linux/compiler.h \ + $(top_srcdir)/include/os/freebsd/linux/types.h + +EXTRA_DIST = $(KERNEL_H) diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h new file mode 100644 
index 000000000000..d76050378e83 --- /dev/null +++ b/include/os/freebsd/linux/compiler.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iXsystems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. + * Copyright (c) 2015 François Tigeot + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _LINUX_COMPILER_H_ +#define _LINUX_COMPILER_H_ + +#include + +#define __user +#define __kernel +#define __safe +#define __force +#define __nocast +#define __iomem +#define __chk_user_ptr(x) ((void)0) +#define __chk_io_ptr(x) ((void)0) +#define __builtin_warning(x, y...) 
(1) +#define __acquires(x) +#define __releases(x) +#define __acquire(x) do { } while (0) +#define __release(x) do { } while (0) +#define __cond_lock(x, c) (c) +#define __bitwise +#define __devinitdata +#define __deprecated +#define __init +#define __initconst +#define __devinit +#define __devexit +#define __exit +#define __rcu +#define __percpu +#define __weak __weak_symbol +#define __malloc +#define ___stringify(...) #__VA_ARGS__ +#define __stringify(...) ___stringify(__VA_ARGS__) +#define __attribute_const__ __attribute__((__const__)) +#undef __always_inline +#define __always_inline inline +#define noinline __noinline +#define ____cacheline_aligned __aligned(CACHE_LINE_SIZE) + +#ifndef _KERNEL +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#define typeof(x) __typeof(x) + +#define uninitialized_var(x) x = x +#define __maybe_unused __unused +#define __always_unused __unused +#define __must_check __result_use_check + +#define __printf(a, b) __printflike(a, b) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define smp_rmb() rmb() +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + +#define ACCESS_ONCE(x) (*(volatile __typeof(x) *)&(x)) + +#define WRITE_ONCE(x, v) do { \ + barrier(); \ + ACCESS_ONCE(x) = (v); \ + barrier(); \ +} while (0) + +#define lockless_dereference(p) READ_ONCE(p) + +#define _AT(T, X) ((T)(X)) + +#endif /* _LINUX_COMPILER_H_ */ diff --git a/include/os/freebsd/linux/types.h b/include/os/freebsd/linux/types.h new file mode 100644 index 000000000000..301163c034f5 --- /dev/null +++ b/include/os/freebsd/linux/types.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iXsystems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +#ifndef _LINUX_TYPES_H_ +#define _LINUX_TYPES_H_ + +#include +#include +#include +#include + + +#ifndef __bitwise__ +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#endif + +typedef uint16_t __le16; +typedef uint16_t __be16; +typedef uint32_t __le32; +typedef uint32_t __be32; +typedef uint64_t __le64; +typedef uint64_t __be64; + +typedef unsigned gfp_t; +typedef uint64_t loff_t; +typedef vm_paddr_t resource_size_t; +typedef uint16_t __bitwise__ __sum16; +typedef unsigned long pgoff_t; +typedef unsigned __poll_t; + +typedef uint64_t u64; +typedef u64 phys_addr_t; + +typedef size_t __kernel_size_t; + +#define DECLARE_BITMAP(n, bits) \ + unsigned long n[howmany(bits, sizeof (long) * 8)] + +typedef unsigned long irq_hw_number_t; + +struct rcu_head { + void *raw[2]; +} __aligned(sizeof (void *)); + +typedef void (*rcu_callback_t)(struct rcu_head *head); +typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); +typedef int linux_task_fn_t(void *data); + +#endif /* _LINUX_TYPES_H_ */ diff --git a/include/os/freebsd/spl/Makefile.am b/include/os/freebsd/spl/Makefile.am new file mode 100644 index 000000000000..b321825cb77e --- /dev/null +++ b/include/os/freebsd/spl/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = acl rpc sys diff --git a/include/os/freebsd/spl/acl/Makefile.am b/include/os/freebsd/spl/acl/Makefile.am new file mode 100644 index 000000000000..65a03ea1789c --- /dev/null +++ b/include/os/freebsd/spl/acl/Makefile.am @@ -0,0 +1,4 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/freebsd/spl/acl/acl_common.h + +EXTRA_DIST = $(KERNEL_H) diff --git a/include/os/freebsd/spl/acl/acl_common.h b/include/os/freebsd/spl/acl/acl_common.h new file mode 100644 index 000000000000..00a2a9dfe73a --- /dev/null +++ b/include/os/freebsd/spl/acl/acl_common.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and 
Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _ACL_COMMON_H +#define _ACL_COMMON_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; + +extern int acltrivial(const char *); +extern void adjust_ace_pair(ace_t *pair, mode_t mode); +extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t); +extern int ace_trivial(ace_t *acep, int aclcnt); +extern int ace_trivial_common(void *, int, + uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, + uint32_t *mask)); +#if !defined(_KERNEL) +extern acl_t *acl_alloc(acl_type_t); +extern void acl_free(acl_t *aclp); +extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, + uid_t owner, gid_t group); +#endif /* !_KERNEL */ +int cmp2acls(void *a, void *b); +int 
acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); +void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); + +#ifdef __cplusplus +} +#endif + +#endif /* _ACL_COMMON_H */ diff --git a/include/os/freebsd/spl/rpc/Makefile.am b/include/os/freebsd/spl/rpc/Makefile.am new file mode 100644 index 000000000000..266a3b759ca4 --- /dev/null +++ b/include/os/freebsd/spl/rpc/Makefile.am @@ -0,0 +1,8 @@ +COMMON_H = + +KERNEL_H = \ + $(top_srcdir)/include/os/freebsd/spl/rpc/xdr.h + +USER_H = + +EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h new file mode 100644 index 000000000000..b4df2c1ea04a --- /dev/null +++ b/include/os/freebsd/spl/rpc/xdr.h @@ -0,0 +1,71 @@ +/* + * Sun RPC is a product of Sun Microsystems, Inc. and is provided for + * unrestricted use provided that this legend is included on all tape + * media and as a part of the software program in whole or part. Users + * may copy or modify Sun RPC without charge, but are not authorized + * to license or distribute it to anyone else except as part of a product or + * program developed by the user. + * + * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE + * WARRANTIES OF DESIGN, MERCHANTIBILITY AND FITNESS FOR A PARTICULAR + * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. + * + * Sun RPC is provided with no support and without any obligation on the + * part of Sun Microsystems, Inc. to assist in its use, correction, + * modification or enhancement. + * + * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE + * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC + * OR ANY PART THEREOF. + * + * In no event will Sun Microsystems, Inc. be liable for any lost revenue + * or profits or other special, indirect and consequential damages, even if + * Sun has been advised of the possibility of such damages. 
+ * + * Sun Microsystems, Inc. + * 2550 Garcia Avenue + * Mountain View, California 94043 + */ + +#ifndef _OPENSOLARIS_RPC_XDR_H_ +#define _OPENSOLARIS_RPC_XDR_H_ + +#include +#include_next + +#ifndef _KERNEL + +#include + +/* + * Taken from sys/xdr/xdr_mem.c. + * + * FreeBSD's userland XDR doesn't implement control method (only the kernel), + * but OpenSolaris nvpair still depends on it, so we have to implement it here. + */ +static __inline bool_t +xdrmem_control(XDR *xdrs, int request, void *info) +{ + xdr_bytesrec *xptr; + + switch (request) { + case XDR_GET_BYTES_AVAIL: + xptr = (xdr_bytesrec *)info; + xptr->xc_is_last_record = TRUE; + xptr->xc_num_avail = xdrs->x_handy; + return (TRUE); + default: + assert(!"unexpected request"); + } + return (FALSE); +} + +#undef XDR_CONTROL +#define XDR_CONTROL(xdrs, req, op) \ + (((xdrs)->x_ops->x_control == NULL) ? \ + xdrmem_control((xdrs), (req), (op)) : \ + (*(xdrs)->x_ops->x_control)(xdrs, req, op)) + +#endif /* !_KERNEL */ + +#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/freebsd/spl/sys/Makefile.am b/include/os/freebsd/spl/sys/Makefile.am new file mode 100644 index 000000000000..29a39cacf4d1 --- /dev/null +++ b/include/os/freebsd/spl/sys/Makefile.am @@ -0,0 +1,72 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/freebsd/spl/sys/acl_impl.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/acl.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/atomic.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/byteorder.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/callb.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/cmn_err.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/condvar.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/console.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/cred.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/ctype.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/debug.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/dirent.h \ + 
$(top_srcdir)/include/os/freebsd/spl/sys/disp.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/dkio.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/endian.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/extdirent.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/file.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/freebsd_rwlock.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/inttypes.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/isa_defs.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/kmem_cache.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/kmem.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/kstat.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/list_impl.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/list.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/lock.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/Makefile.am \ + $(top_srcdir)/include/os/freebsd/spl/sys/misc.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/mod_os.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/mode.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/mount.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/mutex.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/param.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/policy.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/proc.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/processor.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/procfs_list.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/random.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/rwlock.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/sdt.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/sid.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/sig.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/simd_x86.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/simd.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/spl_condvar.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/string.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/strings.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/sunddi.h \ + 
$(top_srcdir)/include/os/freebsd/spl/sys/sysmacros.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/systeminfo.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/systm.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/taskq.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/thread.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/time.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/timer.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/trace_zfs.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/trace.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/types.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/types32.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/uio.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/uuid.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/vfs.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/vm.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/vmsystm.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/vnode_impl.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/vnode.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/zmod.h \ + $(top_srcdir)/include/os/freebsd/spl/sys/zone.h + +EXTRA_DIST = $(KERNEL_H) diff --git a/include/os/freebsd/spl/sys/acl.h b/include/os/freebsd/spl/sys/acl.h new file mode 100644 index 000000000000..ee50b0a18368 --- /dev/null +++ b/include/os/freebsd/spl/sys/acl.h @@ -0,0 +1,216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Garrett D'Amore + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_ACL_H +#define _SYS_ACL_H + +#include +#include + +/* + * When compiling OpenSolaris kernel code, this file is included instead of the + * FreeBSD one. Include the original sys/acl.h as well. + */ +#undef _SYS_ACL_H +#include_next +#define _SYS_ACL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ +typedef struct { + int a_type; /* the type of ACL entry */ + uid_t a_id; /* the entry in -uid or gid */ + o_mode_t a_perm; /* the permission field */ +} aclent_t; + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +/* + * The following are Defined types for an aclent_t. 
+ */ +#define USER_OBJ (0x01) /* object owner */ +#define USER (0x02) /* additional users */ +#define GROUP_OBJ (0x04) /* owning group of the object */ +#define GROUP (0x08) /* additional groups */ +#define CLASS_OBJ (0x10) /* file group class and mask entry */ +#define OTHER_OBJ (0x20) /* other entry for the object */ +#define ACL_DEFAULT (0x1000) /* default flag */ +/* default object owner */ +#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) +/* default additional users */ +#define DEF_USER (ACL_DEFAULT | USER) +/* default owning group */ +#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) +/* default additional groups */ +#define DEF_GROUP (ACL_DEFAULT | GROUP) +/* default mask entry */ +#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) +/* default other entry */ +#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) + +/* + * The following are defined for ace_t. + */ +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define 
ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ + ACL_DEFAULTED) + +/* + * These are only applicable in a CIFS context. + */ +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + +#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ + ACE_READ_NAMED_ATTRS) + +#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS) + +#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) +/* + * The following flags are supported by both NFSv4 ACLs and ace_t. + */ +#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | \ + ACE_INHERIT_ONLY_ACE | \ + ACE_INHERITED_ACE | \ + ACE_IDENTIFIER_GROUP) + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ + ACE_IDENTIFIER_GROUP) +#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACE_INHERITED_ACE| \ + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + +/* cmd args to acl(2) for aclent_t */ +#define GETACL 1 +#define SETACL 2 +#define GETACLCNT 3 + +/* cmd's to manipulate ace acls. 
*/ +#define ACE_GETACL 4 +#define ACE_SETACL 5 +#define ACE_GETACLCNT 6 + +/* minimal acl entries from GETACLCNT */ +#define MIN_ACL_ENTRIES 4 + +extern void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp); +extern int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries); +extern void ksort(caddr_t, int, int, int (*)(void *, void *)); +extern int cmp2acls(void *, void *); + +extern int acl(const char *path, int cmd, int cnt, void *buf); +extern int facl(int fd, int cmd, int cnt, void *buf); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_H */ diff --git a/include/os/freebsd/spl/sys/acl_impl.h b/include/os/freebsd/spl/sys/acl_impl.h new file mode 100644 index 000000000000..8718f5bcf63f --- /dev/null +++ b/include/os/freebsd/spl/sys/acl_impl.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ACL_IMPL_H +#define _SYS_ACL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * acl flags + * + * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED + * flags can also be stored in this field. + */ +#define ACL_IS_TRIVIAL 0x10000 +#define ACL_IS_DIR 0x20000 + +typedef enum acl_type { + ACLENT_T = 0, + ACE_T = 1 +} zfs_acl_type_t; + +struct acl_info { + zfs_acl_type_t acl_type; /* style of acl */ + int acl_cnt; /* number of acl entries */ + int acl_entry_size; /* sizeof acl entry */ + int acl_flags; /* special flags about acl */ + void *acl_aclp; /* the acl */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h new file mode 100644 index 000000000000..e283c6c0e3ff --- /dev/null +++ b/include/os/freebsd/spl/sys/atomic.h @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_ATOMIC_H_ +#define _OPENSOLARIS_SYS_ATOMIC_H_ + +#include +#include + +#define casptr(_a, _b, _c) \ + atomic_cmpset_ptr((volatile uintptr_t *)(_a), \ + (uintptr_t)(_b), \ + (uintptr_t)(_c)) +#define cas32 atomic_cmpset_32 +#define atomic_sub_64 atomic_subtract_64 + +#if defined(__i386__) || defined(KLD_MODULE) +#define I386_HAVE_ATOMIC64 +#endif + +#if !defined(__LP64__) && !defined(__mips_n32) && \ + !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) +extern void atomic_add_64(volatile uint64_t *target, int64_t delta); +extern void atomic_dec_64(volatile uint64_t *target); +#endif +#ifndef __sparc64__ +#if defined(__LP64__) || defined(__mips_n32) || \ + defined(ARM_HAVE_ATOMIC64) || defined(I386_HAVE_ATOMIC64) + +#define membar_producer() wmb() + +static __inline uint64_t +atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) +{ + +#ifdef __i386__ + atomic_fcmpset_64(target, &cmp, newval); +#else + atomic_fcmpset_long(target, &cmp, newval); +#endif + return (cmp); +} + +static __inline uint32_t +atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval) +{ + + atomic_fcmpset_int(target, &cmp, newval); + return (cmp); +} + +static __inline uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + uint64_t prev; + + prev = atomic_fetchadd_long(target, delta); + + return (prev + delta); +} + +#else +extern uint32_t atomic_cas_32(volatile 
uint32_t *target, uint32_t cmp, + uint32_t newval); +extern uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp, + uint64_t newval); +extern uint64_t atomic_add_64_nv(volatile uint64_t *target, int64_t delta); +extern void membar_producer(void); +#endif +#endif +extern uint8_t atomic_or_8_nv(volatile uint8_t *target, uint8_t value); + +#if defined(__sparc64__) || defined(__powerpc__) || defined(__arm__) || \ + defined(__mips__) || defined(__aarch64__) || defined(__riscv) +extern void atomic_or_8(volatile uint8_t *target, uint8_t value); +#else +static __inline void +atomic_or_8(volatile uint8_t *target, uint8_t value) +{ + atomic_set_8(target, value); +} +#endif + +static __inline uint32_t +atomic_add_32_nv(volatile uint32_t *target, int32_t delta) +{ + return (atomic_fetchadd_32(target, delta) + delta); +} + +static __inline uint32_t +atomic_add_int_nv(volatile uint32_t *target, int delta) +{ + return (atomic_add_32_nv(target, delta)); +} + +static __inline void +atomic_dec_32(volatile uint32_t *target) +{ + atomic_subtract_32(target, 1); +} + +static __inline uint32_t +atomic_dec_32_nv(volatile uint32_t *target) +{ + return (atomic_fetchadd_32(target, -1) - 1); +} + +#if defined(__LP64__) || defined(__mips_n32) || \ + defined(ARM_HAVE_ATOMIC64) || defined(I386_HAVE_ATOMIC64) +static __inline void +atomic_dec_64(volatile uint64_t *target) +{ + atomic_subtract_64(target, 1); +} +#endif + +static __inline void +atomic_inc_32(volatile uint32_t *target) +{ + atomic_add_32(target, 1); +} + +static __inline uint32_t +atomic_inc_32_nv(volatile uint32_t *target) +{ + return (atomic_add_32_nv(target, 1)); +} + +static __inline void +atomic_inc_64(volatile uint64_t *target) +{ + atomic_add_64(target, 1); +} + +static __inline uint64_t +atomic_inc_64_nv(volatile uint64_t *target) +{ + return (atomic_add_64_nv(target, 1)); +} + +static __inline uint64_t +atomic_dec_64_nv(volatile uint64_t *target) +{ + return (atomic_add_64_nv(target, -1)); +} + +#if 
!defined(COMPAT_32BIT) && defined(__LP64__) +static __inline void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_64((volatile uint64_t *)target, + (uint64_t)cmp, (uint64_t)newval)); +} +#else +static __inline void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_32((volatile uint32_t *)target, + (uint32_t)cmp, (uint32_t)newval)); +} +#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ + +#endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */ diff --git a/include/os/freebsd/spl/sys/byteorder.h b/include/os/freebsd/spl/sys/byteorder.h new file mode 100644 index 000000000000..79ae848c7259 --- /dev/null +++ b/include/os/freebsd/spl/sys/byteorder.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _OPENSOLARIS_SYS_BYTEORDER_H_ +#define _OPENSOLARIS_SYS_BYTEORDER_H_ + +#include + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#if BYTE_ORDER == BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#if BYTE_ORDER == BIG_ENDIAN +#define htonll(x) BMASK_64(x) +#define ntohll(x) BMASK_64(x) +#else +#define htonll(x) BSWAP_64(x) +#define ntohll(x) BSWAP_64(x) +#endif + +#define BE_IN32(xa) htonl(*((uint32_t *)(void *)(xa))) + +#endif /* _OPENSOLARIS_SYS_BYTEORDER_H_ */ diff --git a/include/os/freebsd/spl/sys/callb.h b/include/os/freebsd/spl/sys/callb.h new file mode 100644 index 000000000000..ed9ed8cd88a8 --- /dev/null +++ b/include/os/freebsd/spl/sys/callb.h @@ -0,0 +1,213 @@ 
+/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CALLB_H +#define _SYS_CALLB_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * definitions of callback classes (c_class) + * + * Callbacks belong in the same class if (1) their callback routines + * do the same kind of processing (ideally, using the same callback function) + * and (2) they can/should be executed at the same time in a cpr + * suspend/resume operation. + * + * Note: The DAEMON class, in particular, is for stopping kernel threads + * and nothing else. The CALLB_* macros below should be used to deal + * with kernel threads, and the callback function should be callb_generic_cpr. + * Another idiosyncrasy of the DAEMON class is that if a suspend operation + * fails, some of the callback functions may be called with the RESUME + * code which were never called with SUSPEND. Not a problem currently, + * but see bug 4201851. 
+ */ +#define CB_CL_CPR_DAEMON 0 +#define CB_CL_CPR_VM 1 +#define CB_CL_CPR_CALLOUT 2 +#define CB_CL_CPR_OBP 3 +#define CB_CL_CPR_FB 4 +#define CB_CL_PANIC 5 +#define CB_CL_CPR_RPC 6 +#define CB_CL_CPR_PROMPRINTF 7 +#define CB_CL_UADMIN 8 +#define CB_CL_CPR_PM 9 +#define CB_CL_HALT 10 +#define CB_CL_CPR_DMA 11 +#define CB_CL_CPR_POST_USER 12 +#define CB_CL_UADMIN_PRE_VFS 13 +#define CB_CL_MDBOOT CB_CL_UADMIN +#define CB_CL_ENTER_DEBUGGER 14 +#define CB_CL_CPR_POST_KERNEL 15 +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ + +/* + * CB_CL_CPR_DAEMON class specific definitions are given below: + */ + +/* + * code for CPR callb_execute_class + */ +#define CB_CODE_CPR_CHKPT 0 +#define CB_CODE_CPR_RESUME 1 + +typedef void * callb_id_t; +/* + * Per kernel thread structure for CPR daemon callbacks. + * Must be protected by either an existing lock in the daemon or + * a new lock created for such a purpose. + */ +typedef struct callb_cpr { + kmutex_t *cc_lockp; /* lock to protect this struct */ + char cc_events; /* various events for CPR */ + callb_id_t cc_id; /* callb id address */ + kcondvar_t cc_callb_cv; /* cv for callback waiting */ + kcondvar_t cc_stop_cv; /* cv to checkpoint block */ +} callb_cpr_t; + +/* + * cc_events definitions + */ +#define CALLB_CPR_START 1 /* a checkpoint request's started */ +#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */ +#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */ + +/* + * Used when checking that all kernel threads are stopped. + */ +#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */ +#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */ +#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */ + /* due to pwr mgmt of disks, make -- */ + /* big enough for worst spinup time */ + +/* + * + * CALLB_CPR_INIT macro is used by kernel threads to add their entry to + * the callback table and perform other initialization. 
It automatically + * adds the thread as being in the callback class CB_CL_CPR_DAEMON. + * + * cp - ptr to the callb_cpr_t structure for this kernel thread + * + * lockp - pointer to mutex protecting the callb_cpr_t struct + * + * func - pointer to the callback function for this kernel thread. + * It has the prototype boolean_t (void *arg, int code) + * where: arg - ptr to the callb_cpr_t structure + * code - not used for this type of callback + * returns: B_TRUE if successful; B_FALSE if unsuccessful. + * + * name - a string giving the name of the kernel thread + * + * Note: lockp is the lock to protect the callb_cpr_t (cp) structure + * later on. No lock held is needed for this initialization. + */ +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + strlcpy(curthread->td_name, (name), \ + sizeof (curthread->td_name)); \ + bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \ + (cp)->cc_lockp = lockp; \ + (cp)->cc_id = callb_add(func, (void *)(cp), \ + CB_CL_CPR_DAEMON, name); \ + cv_init(&(cp)->cc_callb_cv, NULL, CV_DEFAULT, NULL); \ + cv_init(&(cp)->cc_stop_cv, NULL, CV_DEFAULT, NULL); \ + } + +#ifndef __lock_lint +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); +#else +#define CALLB_CPR_ASSERT(cp) +#endif +/* + * Some threads (like the idle threads) do not adhere to the callback + * protocol and are always considered safe. Such threads must never exit. + * They register their presence by calling this macro during their + * initialization. + * + * Args: + * t - thread pointer of the client kernel thread + * name - a string giving the name of the kernel thread + */ +#define CALLB_CPR_INIT_SAFE(t, name) { \ + (void) callb_add_thread(callb_generic_cpr_safe, \ + (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \ + name, t); \ + } +/* + * The lock to protect cp's content must be held before + * calling the following two macros. + * + * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END + * is safe for checkpoint/resume. 
+ */ +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + } +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp) \ + while ((cp)->cc_events & CALLB_CPR_START) \ + cv_wait(&(cp)->cc_stop_cv, lockp); \ + (cp)->cc_events &= ~CALLB_CPR_SAFE; \ + } +/* + * cv_destroy is nop right now but may be needed in the future. + */ +#define CALLB_CPR_EXIT(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + mutex_exit((cp)->cc_lockp); \ + (void) callb_delete((cp)->cc_id); \ + cv_destroy(&(cp)->cc_callb_cv); \ + cv_destroy(&(cp)->cc_stop_cv); \ + } + +extern callb_cpr_t callb_cprinfo_safe; +extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *); +extern callb_id_t callb_add_thread(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); +extern int callb_delete(callb_id_t); +extern void callb_execute(callb_id_t, int); +extern void *callb_execute_class(int, int); +extern boolean_t callb_generic_cpr(void *, int); +extern boolean_t callb_generic_cpr_safe(void *, int); +extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *); +extern void callb_lock_table(void); +extern void callb_unlock_table(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CALLB_H */ diff --git a/include/os/freebsd/spl/sys/ccompile.h b/include/os/freebsd/spl/sys/ccompile.h new file mode 100644 index 000000000000..0bb0e637cddc --- /dev/null +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -0,0 +1,372 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CCOMPILE_H +#define _SYS_CCOMPILE_H + +/* + * This file contains definitions designed to enable different compilers + * to be used harmoniously on Solaris systems. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Allow for version tests for compiler bugs and features. + */ +#if defined(__GNUC__) +#define __GNUC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#else +#define __GNUC_VERSION 0 +#endif + +#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__) + +/* + * analogous to lint's PRINTFLIKEn + */ +#define __sun_attr___PRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, (__n)+1))) +#define __sun_attr___VPRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, 0))) + +/* + * Handle the kernel printf routines that can take '%b' too + */ +#if __GNUC_VERSION < 30402 +/* + * XX64 at least this doesn't work correctly yet with 3.4.1 anyway! 
+ */ +#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__ +#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__ +#else +#define __sun_attr___KPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, (__n)+1))) +#define __sun_attr___KVPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, 0))) +#endif + +/* + * This one's pretty obvious -- the function never returns + */ +#define __sun_attr___noreturn__ __attribute__((__noreturn__)) + + +/* + * This is an appropriate label for functions that do not + * modify their arguments, e.g. strlen() + */ +#define __sun_attr___pure__ __attribute__((__pure__)) + +/* + * This is a stronger form of __pure__. Can be used for functions + * that do not modify their arguments and don't depend on global + * memory. + */ +#define __sun_attr___const__ __attribute__((__const__)) + +/* + * structure packing like #pragma pack(1) + */ +#define __sun_attr___packed__ __attribute__((__packed__)) + +#define ___sun_attr_inner(__a) __sun_attr_##__a +#define __sun_attr__(__a) ___sun_attr_inner __a + +#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +#define __sun_attr__(__a) + +#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +/* + * Shorthand versions for readability + */ + +#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n))) +#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) +#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) +#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) +#ifdef _KERNEL +#define __NORETURN __sun_attr__((__noreturn__)) +#endif +#define __CONST __sun_attr__((__const__)) +#define __PURE __sun_attr__((__pure__)) + +#if (defined(ZFS_DEBUG) || !defined(NDEBUG))&& !defined(DEBUG) +#define DEBUG +#endif +#define EXPORT_SYMBOL(x) +#define MODULE_AUTHOR(s) +#define MODULE_DESCRIPTION(s) +#define MODULE_LICENSE(s) +#define module_param(a, b, c) +#define module_param_call(a, b, c, d, e) +#define module_param_named(a, b, c, d) +#define 
MODULE_PARM_DESC(a, b) +#define asm __asm +#ifdef ZFS_DEBUG +#undef NDEBUG +#endif + +#ifndef EINTEGRITY +#define EINTEGRITY 97 /* EINTEGRITY is new in 13 */ +#endif + +/* + * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD + * equivalents. This gives us more useful error messages from strerror(3). + */ +#define ECKSUM EINTEGRITY +#define EFRAGS ENOSPC + +/* Similar for ENOTACTIVE */ +#define ENOTACTIVE ECANCELED + +#define EREMOTEIO EREMOTE +#define ECHRNG ENXIO +#define ETIME ETIMEDOUT + +#define O_LARGEFILE 0 +#define O_RSYNC 0 +#define O_DSYNC 0 + +#define KMALLOC_MAX_SIZE MAXPHYS + +#ifdef _KERNEL +typedef unsigned long long u_longlong_t; +typedef long long longlong_t; + +#include +typedef void zfs_kernel_param_t; +#define param_set_charp(a, b) (0) +#define ATTR_UID AT_UID +#define ATTR_GID AT_GID +#define ATTR_MODE AT_MODE +#define ATTR_XVATTR AT_XVATTR +#define ATTR_CTIME AT_CTIME +#define ATTR_MTIME AT_MTIME +#define ATTR_ATIME AT_ATIME +#define vmem_free zfs_kmem_free +#define vmem_zalloc(size, flags) zfs_kmem_alloc(size, flags | M_ZERO) +#define vmem_alloc zfs_kmem_alloc +#define MUTEX_NOLOCKDEP 0 +#define RW_NOLOCKDEP 0 + + +#if __FreeBSD_version < 1300051 +#define vm_page_valid(m) (m)->valid = VM_PAGE_BITS_ALL +#define vm_page_do_sunbusy(m) +#define vm_page_none_valid(m) ((m)->valid == 0) +#else +#define vm_page_do_sunbusy(m) vm_page_sunbusy(m) +#endif + +#if __FreeBSD_version < 1300074 +#define VOP_UNLOCK1(x) VOP_UNLOCK(x, 0) +#else +#define VOP_UNLOCK1(x) VOP_UNLOCK(x) +#endif + +#if __FreeBSD_version < 1300064 +#define VN_IS_DOOMED(vp) ((vp)->v_iflag & VI_DOOMED) +#endif + +#if __FreeBSD_version < 1300068 +#define VFS_VOP_VECTOR_REGISTER(x) +#endif + +#if __FreeBSD_version >= 1300076 +#define getnewvnode_reserve_() getnewvnode_reserve() +#else +#define getnewvnode_reserve_() getnewvnode_reserve(1) +#endif + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + 
+typedef struct { + volatile int counter; +} atomic_t; + + /* BEGIN CSTYLED */ +#define hlist_for_each(p, head) \ + for ((p) = (head)->first; (p); (p) = (p)->next) + +#define hlist_entry(ptr, type, field) container_of(ptr, type, field) + +#define container_of(ptr, type, member) \ +({ \ + const __typeof(((type *)0)->member) *__p = (ptr); \ + (type *)((uintptr_t)__p - offsetof(type, member)); \ +}) + /* END CSTYLED */ + +static inline void +hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + n->next = h->first; + if (h->first != NULL) + h->first->pprev = &n->next; + WRITE_ONCE(h->first, n); + n->pprev = &h->first; +} + +static inline void +hlist_del(struct hlist_node *n) +{ + WRITE_ONCE(*(n->pprev), n->next); + if (n->next != NULL) + n->next->pprev = n->pprev; +} + /* BEGIN CSTYLED */ +#define READ_ONCE(x) ({ \ + __typeof(x) __var = ({ \ + barrier(); \ + ACCESS_ONCE(x); \ + }); \ + barrier(); \ + __var; \ +}) + +#define HLIST_HEAD_INIT { } +#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT +#define INIT_HLIST_HEAD(head) ((head)->first = NULL) /* parenthesized: safe inside a larger expression */ + +#define INIT_HLIST_NODE(node) \ + do { \ + (node)->next = NULL; \ + (node)->pprev = NULL; \ + } while (0) + +/* END CSTYLED */ +static inline int +atomic_read(const atomic_t *v) +{ + return (READ_ONCE(v->counter)); +} + +static inline int +atomic_inc(atomic_t *v) +{ + return (atomic_fetchadd_int(&v->counter, 1) + 1); +} + +static inline int +atomic_dec(atomic_t *v) +{ + return (atomic_fetchadd_int(&v->counter, -1) - 1); +} + +#else +typedef long loff_t; +typedef long rlim64_t; +typedef int bool_t; +typedef int enum_t; +#define __init +#define __exit +#define FALSE 0 +#define TRUE 1 + /* + * XXX We really need to consolidate on standard + * error codes in the common code + */ +#define ENOSTR ENOTCONN +#define ENODATA EINVAL + + +#define __XSI_VISIBLE 1000 +#define __BSD_VISIBLE 1 +#define __POSIX_VISIBLE 201808 +#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0])) /* (a)[0]: correct for non-trivial array expressions */ +#define open64 open +#define pwrite64 pwrite
+#define ftruncate64 ftruncate +#define lseek64 lseek +#define pread64 pread +#define stat64 stat +#define lstat64 lstat +#define statfs64 statfs +#define readdir64 readdir +#define dirent64 dirent +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + ((((type)(x) - 1) | ((type)(align) - 1)) + 1) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define RLIM64_INFINITY RLIM_INFINITY +#define ERESTART EAGAIN +#define ABS(a) 
((a) < 0 ? -(a) : (a)) + +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CCOMPILE_H */ diff --git a/include/os/freebsd/spl/sys/cmn_err.h b/include/os/freebsd/spl/sys/cmn_err.h new file mode 100644 index 000000000000..a75471f647eb --- /dev/null +++ b/include/os/freebsd/spl/sys/cmn_err.h @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CMN_ERR_H +#define _SYS_CMN_ERR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(_ASM) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Common error handling severity levels */ + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifndef _ASM + +/*PRINTFLIKE2*/ +extern void cmn_err(int, const char *, ...) 
+ __KPRINTFLIKE(2); +#pragma rarely_called(cmn_err) + +extern void vzcmn_err(zoneid_t, int, const char *, __va_list) + __KVPRINTFLIKE(3); +#pragma rarely_called(vzcmn_err) + +extern void vcmn_err(int, const char *, __va_list) + __KVPRINTFLIKE(2); +#pragma rarely_called(vcmn_err) + +/*PRINTFLIKE3*/ +extern void zcmn_err(zoneid_t, int, const char *, ...) + __KPRINTFLIKE(3); +#pragma rarely_called(zcmn_err) + +extern void vzprintf(zoneid_t, const char *, __va_list) + __KVPRINTFLIKE(2); +#pragma rarely_called(vzprintf) + +/*PRINTFLIKE2*/ +extern void zprintf(zoneid_t, const char *, ...) + __KPRINTFLIKE(2); +#pragma rarely_called(zprintf) + +extern void vuprintf(const char *, __va_list) + __KVPRINTFLIKE(1); +#pragma rarely_called(vuprintf) + +/*PRINTFLIKE1*/ +extern void panic(const char *, ...) + __KPRINTFLIKE(1) __NORETURN; +#pragma rarely_called(panic) + +extern void vpanic(const char *, __va_list) + __KVPRINTFLIKE(1) __NORETURN; +#pragma rarely_called(vpanic) + +#endif /* !_ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CMN_ERR_H */ diff --git a/include/os/freebsd/spl/sys/condvar.h b/include/os/freebsd/spl/sys/condvar.h new file mode 100644 index 000000000000..b21940166c5d --- /dev/null +++ b/include/os/freebsd/spl/sys/condvar.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * Copyright (c) 2013 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_CONDVAR_H_ +#define _OPENSOLARIS_SYS_CONDVAR_H_ + +#include + +#include +#include +#include +#include + +static __inline sbintime_t +zfs_nstosbt(int64_t _ns) +{ + sbintime_t sb = 0; + +#ifdef KASSERT + KASSERT(_ns >= 0, ("Negative values illegal for nstosbt: %jd", (intmax_t)_ns)); +#endif + if (_ns >= 1000000000) { /* split off whole seconds so the 64-bit multiply below cannot overflow */ + sb = (_ns / 1000000000) * SBT_1S; + _ns = _ns % 1000000000; + } + /* 9223372037 = ceil(2^63 / 1000000000) */ + sb += ((_ns * 9223372037ull) + 0x7fffffff) >> 31; + return (sb); +} + + +typedef struct cv kcondvar_t; +#define CALLOUT_FLAG_ABSOLUTE C_ABSOLUTE + +typedef enum { + CV_DEFAULT, + CV_DRIVER +} kcv_type_t; + +#define zfs_cv_init(cv, name, type, arg) do { \ + const char *_name; \ + ASSERT((type) == CV_DEFAULT); \ + for (_name = #cv; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name <= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #cv; \ + cv_init((cv), _name); \ +} while (0) +#define cv_init(cv, name, type, arg) zfs_cv_init(cv, name, type, arg) + + +static inline int +cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + + return (_cv_wait_sig(cvp, &(mp)->lock_object) == 0); +} + +static inline int +cv_timedwait(kcondvar_t
*cvp, kmutex_t *mp, clock_t timo) +{ + int rc; + + timo -= ddi_get_lbolt(); + if (timo <= 0) + return (-1); + rc = _cv_timedwait_sbt((cvp), &(mp)->lock_object, \ + tick_sbt * (timo), 0, C_HARDCLOCK); + if (rc == EWOULDBLOCK) + return (-1); + return (1); +} + +static inline int +cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) +{ + int rc; + + timo -= ddi_get_lbolt(); + if (timo <= 0) + return (-1); + rc = _cv_timedwait_sig_sbt(cvp, &(mp)->lock_object, \ + tick_sbt * (timo), 0, C_HARDCLOCK); + if (rc == EWOULDBLOCK) + return (-1); + if (rc == EINTR || rc == ERESTART) + return (0); + + return (1); +} + +#define cv_timedwait_io cv_timedwait +#define cv_timedwait_sig_io cv_timedwait_sig + +static inline clock_t +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + hrtime_t hrtime; + int rc; + + ASSERT(tim >= res); + + hrtime = gethrtime(); + if (flag == 0) + tim += hrtime; + + if (hrtime >= tim) + return (tim - hrtime); + rc = cv_timedwait_sbt(cvp, mp, zfs_nstosbt(tim), + zfs_nstosbt(res), C_ABSOLUTE); + + KASSERT(rc == EWOULDBLOCK || rc == 0, ("unexpected rc value %d", rc)); + return (tim - gethrtime()); +} + +static inline clock_t +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + sbintime_t sbt; + hrtime_t hrtime; + int rc; + + ASSERT(tim >= res); + + hrtime = gethrtime(); + if (flag == 0) + tim += hrtime; + + if (hrtime >= tim) + return (tim - hrtime); + + sbt = zfs_nstosbt(tim); + rc = cv_timedwait_sig_sbt(cvp, mp, sbt, zfs_nstosbt(res), C_ABSOLUTE); + + KASSERT(rc == EWOULDBLOCK || rc == EINTR || rc == ERESTART || + rc == 0, ("unexpected rc value %d", rc)); + return (tim - gethrtime()); +} + +#endif /* _OPENSOLARIS_SYS_CONDVAR_H_ */ diff --git a/include/os/freebsd/spl/sys/console.h b/include/os/freebsd/spl/sys/console.h new file mode 100644 index 000000000000..abf3db756767 --- /dev/null +++ b/include/os/freebsd/spl/sys/console.h @@ -0,0 +1,48 @@ +/* + * 
Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_CONSOLE_H +#define _SPL_CONSOLE_H + +static inline void +console_vprintf(const char *fmt, va_list args) +{ + vprintf(fmt, args); +} + +static inline void +console_printf(const char *fmt, ...) 
+{ + va_list args; + + va_start(args, fmt); + console_vprintf(fmt, args); + va_end(args); +} + +#endif /* _SPL_CONSOLE_H */ diff --git a/include/os/freebsd/spl/sys/cred.h b/include/os/freebsd/spl/sys/cred.h new file mode 100644 index 000000000000..e32910e0efab --- /dev/null +++ b/include/os/freebsd/spl/sys/cred.h @@ -0,0 +1,188 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _SYS_CRED_H +#define _SYS_CRED_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The credential is an opaque kernel private data structure defined in + * . 
+ */ + +typedef struct ucred cred_t; + +#define CRED() curthread->td_ucred +#define kcred (thread0.td_ucred) + +#define KUID_TO_SUID(x) (x) +#define KGID_TO_SGID(x) (x) +#define crgetuid(cred) ((cred)->cr_uid) +#define crgetruid(cred) ((cred)->cr_ruid) +#define crgetgid(cred) ((cred)->cr_gid) +#define crgetgroups(cred) ((cred)->cr_groups) +#define crgetngroups(cred) ((cred)->cr_ngroups) +#define crgetsid(cred, i) (NULL) + +struct proc; /* cred.h is included in proc.h */ +struct prcred; +struct ksid; +struct ksidlist; +struct credklpd; +struct credgrp; + +struct auditinfo_addr; /* cred.h is included in audit.h */ + +extern int ngroups_max; +/* + * kcred is used when you need all privileges. + */ + +extern void cred_init(void); +extern void crfree(cred_t *); +extern cred_t *cralloc(void); /* all but ref uninitialized */ +extern cred_t *cralloc_ksid(void); /* cralloc() + ksid alloc'ed */ +extern cred_t *crget(void); /* initialized */ +extern void crcopy_to(cred_t *, cred_t *); +extern cred_t *crdup(cred_t *); +extern void crdup_to(cred_t *, cred_t *); +extern cred_t *crgetcred(void); +extern void crset(struct proc *, cred_t *); +extern void crset_zone_privall(cred_t *); +extern int supgroupmember(gid_t, const cred_t *); +extern int hasprocperm(const cred_t *, const cred_t *); +extern int prochasprocperm(struct proc *, struct proc *, const cred_t *); +extern int crcmp(const cred_t *, const cred_t *); +extern cred_t *zone_kcred(void); + +extern gid_t crgetrgid(const cred_t *); +extern gid_t crgetsgid(const cred_t *); + +#define crgetzoneid(x) (0) +extern projid_t crgetprojid(const cred_t *); + +extern cred_t *crgetmapped(const cred_t *); + + +extern const struct auditinfo_addr *crgetauinfo(const cred_t *); +extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); + +extern uint_t crgetref(const cred_t *); + +extern const gid_t *crgetggroups(const struct credgrp *); + + +/* + * Sets real, effective and/or saved uid/gid; + * -1 argument accepted as "no change". 
+ */ +extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t); +extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); + +/* + * Sets real, effective and saved uids/gids all to the same + * values. Both values must be non-negative and <= MAXUID + */ +extern int crsetugid(cred_t *, uid_t, gid_t); + +/* + * Functions to handle the supplemental group list. + */ +extern struct credgrp *crgrpcopyin(int, gid_t *); +extern void crgrprele(struct credgrp *); +extern void crsetcredgrp(cred_t *, struct credgrp *); + +/* + * Private interface for setting zone association of credential. + */ +struct zone; +extern void crsetzone(cred_t *, struct zone *); +extern struct zone *crgetzone(const cred_t *); + +/* + * Private interface for setting project id in credential. + */ +extern void crsetprojid(cred_t *, projid_t); + +/* + * Private interface for nfs. + */ +extern cred_t *crnetadjust(cred_t *); + +/* + * Private interface for procfs. + */ +extern void cred2prcred(const cred_t *, struct prcred *); + +/* + * Private interfaces for Rampart Trusted Solaris. + */ +struct ts_label_s; +extern struct ts_label_s *crgetlabel(const cred_t *); +extern boolean_t crisremote(const cred_t *); + +/* + * Private interfaces for ephemeral uids. 
+ */ +#define VALID_UID(id, zn) \ + ((id) <= MAXUID || valid_ephemeral_uid((zn), (id))) + +#define VALID_GID(id, zn) \ + ((id) <= MAXUID || valid_ephemeral_gid((zn), (id))) + +extern boolean_t valid_ephemeral_uid(struct zone *, uid_t); +extern boolean_t valid_ephemeral_gid(struct zone *, gid_t); + +extern int eph_uid_alloc(struct zone *, int, uid_t *, int); +extern int eph_gid_alloc(struct zone *, int, gid_t *, int); + +extern void crsetsid(cred_t *, struct ksid *, int); +extern void crsetsidlist(cred_t *, struct ksidlist *); + +extern struct ksidlist *crgetsidlist(const cred_t *); + +extern int crsetpriv(cred_t *, ...); + +extern struct credklpd *crgetcrklpd(const cred_t *); +extern void crsetcrklpd(cred_t *, struct credklpd *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRED_H */ diff --git a/include/os/freebsd/spl/sys/ctype.h b/include/os/freebsd/spl/sys/ctype.h new file mode 100644 index 000000000000..f225858072ab --- /dev/null +++ b/include/os/freebsd/spl/sys/ctype.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _SPL_SYS_CTYPE_H_ +#define _SPL_SYS_CTYPE_H_ +#include_next + +#define isalnum(ch) (isalpha(ch) || isdigit(ch)) +#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f) +#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E) +/* BEGIN CSTYLED */ +#define ispunct(C) \ + (((C) >= 0x21 && (C) <= 0x2F) || \ + ((C) >= 0x3A && (C) <= 0x40) || \ + ((C) >= 0x5B && (C) <= 0x60) || \ + ((C) >= 0x7B && (C) <= 0x7E)) +/* END CSTYLED */ + +#endif diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h new file mode 100644 index 000000000000..2751f57801f7 --- /dev/null +++ b/include/os/freebsd/spl/sys/debug.h @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + + +/* + * Common DEBUG functionality. 
+ */ +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + boolean_t _verify3_left = (boolean_t)(LEFT); \ + boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ + } while (0) + +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + int64_t _verify3_left = (int64_t)(LEFT); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + uint64_t _verify3_left = (uint64_t)(LEFT); \ + uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ +
"VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%p " #OP " %p)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + int64_t _verify3_left = (int64_t)(0); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left == _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) +#define CTASSERT_GLOBAL(x) CTASSERT(x) + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define ASSERT(x) ((void)0) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + +#endif /* _SPL_DEBUG_H */ diff --git a/include/os/freebsd/spl/sys/dirent.h b/include/os/freebsd/spl/sys/dirent.h new file mode 100644 index 000000000000..2403766a427d --- /dev/null +++ b/include/os/freebsd/spl/sys/dirent.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_DIRENT_H_ +#define _OPENSOLARIS_SYS_DIRENT_H_ + +#include + +#include_next + +typedef struct dirent dirent64_t; +typedef ino_t ino64_t; + +#define dirent64 dirent + +#define d_ino d_fileno + +#define DIRENT64_RECLEN(len) _GENERIC_DIRLEN(len) + +#endif /* !_OPENSOLARIS_SYS_DIRENT_H_ */ diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h new file mode 100644 index 000000000000..2be1b76e4334 --- /dev/null +++ b/include/os/freebsd/spl/sys/disp.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2013 Andriy Gapon + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_DISP_H_ +#define _OPENSOLARIS_SYS_DISP_H_ + +#include + +#define kpreempt(x) kern_yield(PRI_USER) + +#endif /* _OPENSOLARIS_SYS_DISP_H_ */ diff --git a/include/os/freebsd/spl/sys/dkio.h b/include/os/freebsd/spl/sys/dkio.h new file mode 100644 index 000000000000..4e9ded4a9788 --- /dev/null +++ b/include/os/freebsd/spl/sys/dkio.h @@ -0,0 +1,495 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _OPENSOLARIS_SYS_DKIO_H_ +#define _OPENSOLARIS_SYS_DKIO_H_ + +#include /* Needed for NDKMAP define */ + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_SUNOS_VTOC_16) +#define NDKMAP 16 /* # of logical partitions */ +#define DK_LABEL_LOC 1 /* location of disk label */ +#elif defined(_SUNOS_VTOC_8) +#define NDKMAP 8 /* # of logical partitions */ +#define DK_LABEL_LOC 0 /* location of disk label */ +#else +#error "No VTOC format defined." +#endif + +/* + * Structures and definitions for disk io control commands + */ + +/* + * Structures used as data by ioctl calls. + */ + +#define DK_DEVLEN 16 /* device name max length, including */ + /* unit # & NULL (ie - "xyc1") */ + +/* + * Used for controller info + */ +struct dk_cinfo { + char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */ + ushort_t dki_ctype; /* controller type */ + ushort_t dki_flags; /* flags */ + ushort_t dki_cnum; /* controller number */ + uint_t dki_addr; /* controller address */ + uint_t dki_space; /* controller bus type */ + uint_t dki_prio; /* interrupt priority */ + uint_t dki_vec; /* interrupt vector */ + char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ + uint_t dki_unit; /* unit number */ + uint_t dki_slave; /* slave number */ + ushort_t dki_partition; /* partition number */ + ushort_t dki_maxtransfer; /* max. 
transfer size in DEV_BSIZE */ +}; + +/* + * Controller types + */ +#define DKC_UNKNOWN 0 +#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */ +#define DKC_WDC2880 2 +#define DKC_XXX_0 3 /* unassigned */ +#define DKC_XXX_1 4 /* unassigned */ +#define DKC_DSD5215 5 +#define DKC_ACB4000 7 +#define DKC_MD21 8 +#define DKC_XXX_2 9 /* unassigned */ +#define DKC_NCRFLOPPY 10 +#define DKC_SMSFLOPPY 12 +#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */ +#define DKC_INTEL82072 14 /* native floppy chip */ +#define DKC_MD 16 /* meta-disk (virtual-disk) driver */ +#define DKC_INTEL82077 19 /* 82077 floppy disk controller */ +#define DKC_DIRECT 20 /* Intel direct attached device i.e. IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ +#define DKC_VBD 23 /* virtual block device */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * partition headers: section 1 + * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I)) + */ +struct dk_map { + uint64_t dkl_cylno; /* starting cylinder */ + uint64_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data 
cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * supported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. 
+ * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. + * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. + * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. + * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. + * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +/* bit flag definitions for dkc_flag */ +#define FLUSH_VOLATILE 0x1 /* Bit 0: if set, only flush */ + /* volatile cache; otherwise, flush */ + /* volatile and non-volatile cache */ + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. 
Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. + */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + uint32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + uint64_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. 
+ */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; /* implicit values: DKIO_NONE is 0 through DKIO_DEV_GONE is 3 */ + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. + */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. + */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 /* NOTE(review): presumably a dkt_flags bit -- confirm */ +#define DKT_INVALID_TEMP 0xFFFF /* NOTE(review): 65535 does not fit in the short dkt_cur_temp/dkt_ref_temp fields (assuming 16-bit short), so a direct == test against them cannot match without a cast -- confirm callers */ + + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. + */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. 
+ * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. + */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. + * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. 
+ */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. 
+ */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ diff --git a/include/os/freebsd/spl/sys/endian.h b/include/os/freebsd/spl/sys/endian.h new file mode 100644 index 000000000000..4de4b8829c54 --- /dev/null +++ b/include/os/freebsd/spl/sys/endian.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _SPL_SYS_ENDIAN_H_ +#define _SPL_SYS_ENDIAN_H_ + +#undef _MACHINE_ENDIAN_H_ /* NOTE(review): presumably clears the machine header's include guard so the wrapped header is processed again -- confirm */ +#include_next /* NOTE(review): header name is missing after #include_next (likely lost in extraction); presumably <sys/endian.h> -- confirm against upstream */ + +#if BYTE_ORDER == LITTLE_ENDIAN /* little-endian build: remove _BIG_ENDIAN entirely and re-create BIG_ENDIAN as the plain constant 4321 */ +#undef _BIG_ENDIAN /* NOTE(review): presumably so that ifdef-style _BIG_ENDIAN tests elsewhere are false on little-endian hosts -- confirm */ +#undef BIG_ENDIAN +#define BIG_ENDIAN 4321 +#endif + +#endif /* _SPL_SYS_ENDIAN_H_ */ diff --git a/include/os/freebsd/spl/sys/extdirent.h b/include/os/freebsd/spl/sys/extdirent.h new file mode 100644 index 000000000000..65ba11f345d2 --- /dev/null +++ b/include/os/freebsd/spl/sys/extdirent.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_EXTDIRENT_H +#define _SYS_EXTDIRENT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include /* NOTE(review): header name is missing after #include (likely lost in extraction) -- restore from upstream */ +#include /* NOTE(review): header name is missing here as well -- restore from upstream */ + +/* + * Extended file-system independent directory entry. This style of + * dirent provides additional informational flag bits for each + * directory entry. This dirent will be returned instead of the + * standard dirent if a VOP_READDIR() requests dirent flags via + * V_RDDIR_ENTFLAGS, and if the file system supports the flags. + */ +typedef struct edirent { + ino64_t ed_ino; /* "inode number" of entry */ + off64_t ed_off; /* offset of disk directory entry */ + uint32_t ed_eflags; /* per-entry flags (ED_* bits defined below) */ + unsigned short ed_reclen; /* length of this record */ + char ed_name[1]; /* name of file; EDIRENT_RECLEN sizes the record by name length, so the name may extend past this 1-byte array */ +} edirent_t; + +#define EDIRENT_RECLEN(namelen) \ + ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7) /* offset of ed_name, plus namelen, plus one more byte, rounded up to a multiple of 8 */ +#define EDIRENT_NAMELEN(reclen) \ + ((reclen) - (offsetof(edirent_t, ed_name[0]))) /* bytes from ed_name[0] to the end of a reclen-byte record */ + +/* + * Extended entry flags + * Extended entries include a bitfield of extra information + * regarding that entry. + */ +#define ED_CASE_CONFLICT 0x10 /* Ignoring case, entry name is not unique */ + +/* + * Extended flags accessor macro (nonzero when ED_CASE_CONFLICT is set) + */ +#define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT) +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EXTDIRENT_H */ diff --git a/include/os/freebsd/spl/sys/file.h b/include/os/freebsd/spl/sys/file.h new file mode 100644 index 000000000000..10a82c204859 --- /dev/null +++ b/include/os/freebsd/spl/sys/file.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FILE_H_ +#define _OPENSOLARIS_SYS_FILE_H_ + +#include_next /* NOTE(review): header name is missing after #include_next (likely lost in extraction); presumably <sys/file.h> -- confirm against upstream */ + +#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ + +typedef struct file file_t; + +#include /* NOTE(review): header name is missing (likely lost in extraction); presumably the header that declares cap_rights_t -- confirm */ + +static __inline file_t * /* returns the struct file for fd, or NULL when fget() reports an error */ +getf_caps(int fd, cap_rights_t *rightsp) /* fd is looked up in curthread; rightsp is passed through to fget() unchanged */ +{ + struct file *fp; /* filled in through &fp by fget() */ + + if (fget(curthread, fd, rightsp, &fp) == 0) + return (fp); /* NOTE(review): fget() presumably hands back a referenced file the caller must release -- confirm */ + return (NULL); /* fget() returned nonzero */ +} + +#endif /* !_OPENSOLARIS_SYS_FILE_H_ */ diff --git a/include/os/freebsd/spl/sys/freebsd_rwlock.h b/include/os/freebsd/spl/sys/freebsd_rwlock.h new file mode 100644 index 000000000000..b760f8cf23d4 --- /dev/null +++ b/include/os/freebsd/spl/sys/freebsd_rwlock.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ +#define _OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ + +#include_next + +#endif diff --git a/include/os/freebsd/spl/sys/inttypes.h b/include/os/freebsd/spl/sys/inttypes.h new file mode 100644 index 000000000000..651685d30473 --- /dev/null +++ b/include/os/freebsd/spl/sys/inttypes.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/isa_defs.h b/include/os/freebsd/spl/sys/isa_defs.h new file mode 100644 index 000000000000..a9d1a4e1fd64 --- /dev/null +++ b/include/os/freebsd/spl/sys/isa_defs.h @@ -0,0 +1,688 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. + * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). 
+ * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. 
The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. + * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. + * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards is the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. 
+ * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. + * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_1275_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! + * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. 
+ * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + * + * _OBP + * This indicates the firmware interface is OBP. + * + * _SOFT_HOSTID + * This indicates that the implementation obtains the hostid + * from the file /etc/hostid, rather than from hardware. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. + */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__aarch64__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__riscv) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define 
_MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__arm__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__mips__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#if defined(__mips_n64) +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#else +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". 
+ */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__powerpc__) + +#if defined(__BIG_ENDIAN__) +#define _BIT_FIELDS_HTOL +#else +#define _BIT_FIELDS_LTOH +#endif + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. + */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. + */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. 
+ */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _OBP + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. 
+ */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. + */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h new file mode 100644 index 000000000000..cb61603d7dec --- /dev/null +++ b/include/os/freebsd/spl/sys/kmem.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_KMEM_H_ +#define _OPENSOLARIS_SYS_KMEM_H_ + +#include +#include +#include +#include + +#include +#include +#include + +MALLOC_DECLARE(M_SOLARIS); + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +#define KM_SLEEP M_WAITOK +#define KM_PUSHPAGE M_WAITOK +#define KM_NOSLEEP M_NOWAIT +#define KM_NODEBUG M_NODUMP +#define KM_NORMALPRI 0 +#define KMC_NODEBUG UMA_ZONE_NODUMP +#define KMC_NOTOUCH 0 + +typedef struct vmem vmem_t; + +extern char *kmem_asprintf(const char *, ...); +extern char *kmem_vasprintf(const char *fmt, va_list ap); + +typedef struct kmem_cache { + char kc_name[32]; +#if !defined(KMEM_DEBUG) + uma_zone_t kc_zone; +#else + size_t kc_size; +#endif + int (*kc_constructor)(void *, void *, int); + void (*kc_destructor)(void *, void *); + void *kc_private; +} kmem_cache_t; + +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); + +void *zfs_kmem_alloc(size_t size, int kmflags); +void zfs_kmem_free(void *buf, size_t size); +uint64_t kmem_size(void); +kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), + void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); +void kmem_cache_destroy(kmem_cache_t *cache); +void 
*kmem_cache_alloc(kmem_cache_t *cache, int flags); +void kmem_cache_free(kmem_cache_t *cache, void *buf); +boolean_t kmem_cache_reap_active(void); +void kmem_cache_reap_soon(kmem_cache_t *); +void kmem_reap(void); +int kmem_debugging(void); +void *calloc(size_t n, size_t s); + + +#define kmem_cache_reap_now kmem_cache_reap_soon +#define freemem vm_free_count() +#define minfree vm_cnt.v_free_min +#define heap_arena kernel_arena +#define zio_arena NULL +#define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) +#define kmem_zalloc(size, kmflags) \ + zfs_kmem_alloc((size), (kmflags) | M_ZERO) +#define kmem_free(buf, size) zfs_kmem_free((buf), (size)) +#define vmem_qcache_reap(ptr) ((void)0) + + +#endif /* _OPENSOLARIS_SYS_KMEM_H_ */ diff --git a/include/os/freebsd/spl/sys/kmem_cache.h b/include/os/freebsd/spl/sys/kmem_cache.h new file mode 100644 index 000000000000..d8e0349e4ca3 --- /dev/null +++ b/include/os/freebsd/spl/sys/kmem_cache.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#include + +/* kmem move callback return values */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES = 0, /* Object moved */ + KMEM_CBRC_NO = 1, /* Object not moved */ + KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ + KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ + KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ +} kmem_cbrc_t; + +extern void spl_kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); + +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) + +#endif diff --git a/include/os/freebsd/spl/sys/kstat.h b/include/os/freebsd/spl/sys/kstat.h new file mode 100644 index 000000000000..740f24b70720 --- /dev/null +++ b/include/os/freebsd/spl/sys/kstat.h @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf <behlendorf1@llnl.gov>. + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see <http://zfsonlinux.org/>. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version.
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H +#include +struct list_head {}; +#include +#include + +#define KSTAT_STRLEN 255 +#define KSTAT_RAW_MAX (128*1024) + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 5 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 +#define KSTAT_FLAG_NO_HEADERS 0x80 + +#define KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; +typedef struct kstat_s kstat_t; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +typedef struct kstat_module { + char
ksm_name[KSTAT_STRLEN+1]; /* module name */ + struct list_head ksm_module_list; /* module linkage */ + struct list_head ksm_kstat_list; /* list of kstat entries */ + struct proc_dir_entry *ksm_proc; /* proc entry */ +} kstat_module_t; + +typedef struct kstat_raw_ops { + int (*headers)(char *buf, size_t size); + int (*data)(char *buf, size_t size, void *data); + void *(*addr)(kstat_t *ksp, loff_t index); +} kstat_raw_ops_t; + +struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + int ks_instance; /* provider module instance */ + char ks_name[KSTAT_STRLEN+1]; /* kstat name */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of data records */ + size_t ks_data_size; /* size of kstat data section */ + kstat_update_t *ks_update; /* dynamic updates */ + void *ks_private; /* private data */ + kmutex_t ks_private_lock; /* kstat private data lock */ + kmutex_t *ks_lock; /* kstat data lock */ + struct list_head ks_list; /* kstat linkage */ + kstat_module_t *ks_owner; /* kstat module linkage */ + kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ + char *ks_raw_buf; /* buf used for raw ops */ + size_t ks_raw_bufsize; /* size of raw ops buffer */ + struct sysctl_ctx_list ks_sysctl_ctx; + struct sysctl_oid *ks_sysctl_root; + +}; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int */ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + 
union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len*time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +int spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, + const char *ks_name, const char *ks_class, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags); + +extern void __kstat_install(kstat_t *ksp); +extern void 
__kstat_delete(kstat_t *ksp); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); + +#define kstat_set_raw_ops(k, h, d, a) \ + __kstat_set_raw_ops(k, h, d, a) +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) + +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/freebsd/spl/sys/list.h b/include/os/freebsd/spl/sys/list.h new file mode 100644 index 000000000000..8339b6226d11 --- /dev/null +++ b/include/os/freebsd/spl/sys/list.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_LIST_H +#define _SYS_LIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct list_node list_node_t; +typedef struct list list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void *list_remove_head(list_t *); +void *list_remove_tail(list_t *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); +int list_is_empty(list_t *); + +void list_link_init(list_node_t *); +void list_link_replace(list_node_t *, list_node_t *); + +int list_link_active(list_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_H */ diff --git a/include/os/freebsd/spl/sys/list_impl.h b/include/os/freebsd/spl/sys/list_impl.h new file mode 100644 index 000000000000..9c42f8832023 --- /dev/null +++ b/include/os/freebsd/spl/sys/list_impl.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_IMPL_H +#define _SYS_LIST_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +}; + +struct list { + size_t list_size; + size_t list_offset; + struct list_node list_head; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/lock.h b/include/os/freebsd/spl/sys/lock.h new file mode 100644 index 000000000000..7d5dc26abc74 --- /dev/null +++ b/include/os/freebsd/spl/sys/lock.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_LOCK_H_ +#define _OPENSOLARIS_SYS_LOCK_H_ + +#include_next <sys/lock.h> /* chain to the next sys/lock.h on the include path */ + +#define LO_ALLMASK (LO_INITIALIZED | LO_WITNESS | LO_QUIET | \ + LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE | \ + LO_DUPOK | LO_CLASSMASK | LO_NOPROFILE) +#define LO_EXPECTED (LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE | \ + LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK | (2 << LO_CLASSSHIFT)) + +#endif /* _OPENSOLARIS_SYS_LOCK_H_ */ diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h new file mode 100644 index 000000000000..e39bb07b2f4c --- /dev/null +++ b/include/os/freebsd/spl/sys/misc.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution.
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MISC_H_ +#define _OPENSOLARIS_SYS_MISC_H_ + +#include + +#define MAXUID UID_MAX + +#define _ACL_ACLENT_ENABLED 0x1 +#define _ACL_ACE_ENABLED 0x2 + +#define _FIOFFS (INT_MIN) +#define _FIOGDIO (INT_MIN+1) +#define _FIOSDIO (INT_MIN+2) + +#define _FIO_SEEK_DATA FIOSEEKDATA +#define _FIO_SEEK_HOLE FIOSEEKHOLE + +struct opensolaris_utsname { + char *sysname; + char *nodename; + char *release; + char version[32]; + char *machine; +}; + +extern char hw_serial[11]; + +#endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h new file mode 100644 index 000000000000..d81d927fd6eb --- /dev/null +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_MOD_H +#define _SPL_MOD_H + +#include + +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) + +#define EXPORT_SYMBOL(x) +#define module_param(a, b, c) +#define MODULE_PARM_DESC(a, b) + +#define ZMOD_RW CTLFLAG_RWTUN +#define ZMOD_RD CTLFLAG_RDTUN + +/* BEGIN CSTYLED */ +#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) \ + SYSCTL_DECL(_vfs_ ## scope_prefix); \ + SYSCTL_##type(_vfs_ ## scope_prefix, OID_AUTO, name, perm, &name_prefix ## name, 0, desc) + +#define ZFS_MODULE_PARAM_ARGS SYSCTL_HANDLER_ARGS + +#define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ + SYSCTL_DECL(parent); \ + SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) + +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ + ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) + 
+#define param_set_arc_long_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_arc_long, "LU" +/* Each <handler>_args(var) macro supplies, in order, the type flag, data pointer, numeric argument, handler function and format string for one sysctl handler. */ +#define param_set_arc_int_args(var) \ + CTLTYPE_INT, &var, 0, param_set_arc_int, "I" +/* The three deadman handlers are registered with a NULL data pointer instead of &var. */ +#define param_set_deadman_failmode_args(var) \ + CTLTYPE_STRING, NULL, 0, param_set_deadman_failmode, "A" + +#define param_set_deadman_synctime_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_deadman_synctime, "LU" + +#define param_set_deadman_ziotime_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_deadman_ziotime, "LU" + +#define param_set_slop_shift_args(var) \ + CTLTYPE_INT, &var, 0, param_set_slop_shift, "I" +/* module_init(fn) defines a static wrapper wrap_<fn> that calls fn() and discards its result, then registers the wrapper with SYSINIT at SI_SUB_LAST / SI_ORDER_FIRST. */ +#include +#define module_init(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) + +/* module_exit(fn) is the teardown twin: same wrapper shape, registered with SYSUNINIT. Both macros name the wrapper wrap_<fn>, so the init and exit functions of one file must differ. */ +#define module_exit(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSUNINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) +/* END CSTYLED */ + +#endif /* _SPL_MOD_H */ diff --git a/include/os/freebsd/spl/sys/mode.h b/include/os/freebsd/spl/sys/mode.h new file mode 100644 index 000000000000..651685d30473 --- /dev/null +++ b/include/os/freebsd/spl/sys/mode.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/mount.h b/include/os/freebsd/spl/sys/mount.h new file mode 100644 index 000000000000..4732d283bb9a --- /dev/null +++ b/include/os/freebsd/spl/sys/mount.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MOUNT_H_ +#define _OPENSOLARIS_SYS_MOUNT_H_ + +#include +#include_next +#include +/* Solaris mount flag names aliased to the native MNT_FORCE and MNT_UPDATE flags. */ +#define MS_FORCE MNT_FORCE +#define MS_REMOUNT MNT_UPDATE +/* Solaris-style type name for struct fid. */ +typedef struct fid fid_t; + +#endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */ diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h new file mode 100644 index 000000000000..ef556872a2bd --- /dev/null +++ b/include/os/freebsd/spl/sys/mutex.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MUTEX_H_ +#define _OPENSOLARIS_SYS_MUTEX_H_ +/* A kmutex_t is a struct sx; every mutex_* macro in this header uses the exclusive (x) side of the sx API. */ +typedef struct sx kmutex_t; + +#include +#include +#include_next +#include_next +#include +#include +/* The only mutex type this layer defines; mutex_init() asserts that type is 0 or this value. */ +typedef enum { + MUTEX_DEFAULT = 6 /* kernel default mutex */ +} kmutex_type_t; +/* MUTEX_NOT_HELD() is also true whenever panicstr is set, whatever the lock state. */ +#define MUTEX_HELD(x) (mutex_owned(x)) +#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || panicstr) +/* SX_NOWITNESS is added to the init flags unless OPENSOLARIS_WITNESS is defined. */ +#ifndef OPENSOLARIS_WITNESS +#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS) +#else +#define MUTEX_FLAGS (SX_DUPOK | SX_NEW) +#endif +/* mutex_init(): desc and arg are ignored. The lock name is the stringified lock expression starting at its first lowercase ASCII letter, or the whole string when it has none. */ +#define mutex_init(lock, desc, type, arg) do { \ + const char *_name; \ + ASSERT((type) == 0 || (type) == MUTEX_DEFAULT); \ + KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \ + LO_EXPECTED, ("lock %s already initialized", #lock)); \ + for (_name = #lock; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name <= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #lock; \ + sx_init_flags((lock), _name, MUTEX_FLAGS); \ +} while (0) +#define
mutex_destroy(lock) sx_destroy(lock) +#define mutex_enter(lock) sx_xlock(lock) +#define mutex_enter_nested(lock, type) sx_xlock(lock) /* type is ignored */ +#define mutex_tryenter(lock) sx_try_xlock(lock) +#define mutex_exit(lock) sx_xunlock(lock) +#define mutex_owned(lock) sx_xlocked(lock) +#define mutex_owner(lock) sx_xholder(lock) +#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */ diff --git a/include/os/freebsd/spl/sys/param.h b/include/os/freebsd/spl/sys/param.h new file mode 100644 index 000000000000..01392dd67201 --- /dev/null +++ b/include/os/freebsd/spl/sys/param.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2007 John Birrell + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $FreeBSD$ + * + */ + +#ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_ +#define _COMPAT_OPENSOLARIS_SYS_PARAM_H_ + +#include_next +/* PAGESIZE aliases PAGE_SIZE; ptob() widens its argument to uint64_t before shifting it left by PAGE_SHIFT. */ +#define PAGESIZE PAGE_SIZE +#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT) + +#endif /* _COMPAT_OPENSOLARIS_SYS_PARAM_H_ */ diff --git a/include/os/freebsd/spl/sys/policy.h b/include/os/freebsd/spl/sys/policy.h new file mode 100644 index 000000000000..6637202025e1 --- /dev/null +++ b/include/os/freebsd/spl/sys/policy.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $ $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_POLICY_H_ +#define _OPENSOLARIS_SYS_POLICY_H_ + +#include +#include +#include +struct mount; +struct vattr; +/* secpolicy_* declarations: every one takes the credential (cred_t *) among its arguments. The implementations and their return-value convention are not visible in this header. */ +int secpolicy_nfs(cred_t *cr); +int secpolicy_zfs(cred_t *crd); +int secpolicy_sys_config(cred_t *cr, int checkonly); +int secpolicy_zinject(cred_t *cr); +int secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp); +int secpolicy_basic_link(vnode_t *vp, cred_t *cr); +int secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner); +int secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner); +int secpolicy_vnode_stky_modify(cred_t *cr); +int secpolicy_vnode_remove(vnode_t *vp, cred_t *cr); +int secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t accmode); +int secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t curmode, accmode_t wantmode); +int secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner); +int secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner); +int secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node); +int secpolicy_vnode_create_gid(cred_t *cr); +int secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid); +int secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, + boolean_t issuidroot); +void secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr); +int secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, cred_t *cr); +int secpolicy_fs_owner(struct mount *vfsp, cred_t *cr); +int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp); +void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp); +int secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, + vtype_t vtype); +int secpolicy_smb(cred_t *cr); + +/* spl_priv_check_cred() hides a signature difference: priv_check_cred() is called with two arguments when __FreeBSD_version >= 1300005 and with a third argument of 0 on older kernels. */ +#if __FreeBSD_version >= 1300005 +#define spl_priv_check_cred(a, b) priv_check_cred((a), (b)) +#else +#define
spl_priv_check_cred(a, b) priv_check_cred((a), (b), 0) +#endif +#endif /* _OPENSOLARIS_SYS_POLICY_H_ */ diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h new file mode 100644 index 000000000000..fca833018ea0 --- /dev/null +++ b/include/os/freebsd/spl/sys/proc.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE.
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_PROC_H_ +#define _OPENSOLARIS_SYS_PROC_H_ + +#include +#include +#include_next +#include +#include +#include +#include +#include +#include +#include +#include + +/* Solaris scheduling names mapped onto native ones; max_ncpus and boot_max_ncpus are both mp_maxid + 1. */ +#define CPU curcpu +#define minclsyspri PRIBIO +#define defclsyspri minclsyspri +#define maxclsyspri PVM +#define max_ncpus (mp_maxid + 1) +#define boot_max_ncpus (mp_maxid + 1) +/* The only thread state do_thread_create() accepts. */ +#define TS_RUN 0 + +#define p0 proc0 + +#define t_tid td_tid +/* Solaris thread and process type names; all thread aliases resolve to struct thread. */ +typedef short pri_t; +typedef struct thread _kthread; +typedef struct thread kthread_t; +typedef struct thread *kthread_id_t; +typedef struct proc proc_t; +/* Process pointer handed to kproc_kthread_add() by do_thread_create(); defined outside this header. */ +extern struct proc *zfsproc; +/* Carries the entry point and its argument from do_thread_create() to solthread_wrapper(). */ +struct thread_wrap { + void *tw_arg; + void (*tw_proc)(void*); +}; +/* Thread trampoline: runs tw_proc(tw_arg), frees the wrapper, then exits the thread. */ +static __inline void +solthread_wrapper(void *arg) +{ + struct thread_wrap *tw = arg; + + tw->tw_proc(tw->tw_arg); + free(tw, M_SOLARIS); + kthread_exit(); +} +/* Creates a kernel thread that starts in solthread_wrapper() and runs proc(arg). stk must be NULL, len 0 and state TS_RUN (asserted); pp is not used. Returns the new thread, or NULL when kproc_kthread_add() fails. */ +static __inline kthread_t * +do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg, + size_t len, proc_t *pp, int state, pri_t pri) +{ + kthread_t *td = NULL; + int error; + struct thread_wrap *tw; + + /* + * Be sure there are no surprises.
+ */ + ASSERT(stk == NULL); + ASSERT(len == 0); + ASSERT(state == TS_RUN); + tw = malloc(sizeof (*tw), M_SOLARIS, M_WAITOK); + tw->tw_proc = proc; + tw->tw_arg = arg; + /* The thread is created with RFSTOPPED, so it is given its priority and queued by hand below; the stack size is passed in pages. */ + error = kproc_kthread_add(solthread_wrapper, tw, &zfsproc, &td, + RFSTOPPED, stksize / PAGE_SIZE, "zfskern", "solthread %p", proc); + if (error == 0) { + thread_lock(td); + sched_prio(td, pri); + sched_add(td, SRQ_BORING); /* NOTE(review): the unlock below is compiled only before 1300068; presumably sched_add() drops the thread lock itself on newer kernels -- confirm */ +#if __FreeBSD_version < 1300068 + thread_unlock(td); +#endif + } else { + free(tw, M_SOLARIS); /* no thread was created, so the wrapper is released here */ + } + return (td); +} +/* thread_create() forwards all eight arguments unchanged; thread_exit() is kthread_exit(). */ +#define thread_create(stk, stksize, proc, arg, len, pp, state, pri) \ + do_thread_create(stk, stksize, proc, arg, len, pp, state, pri) +#define thread_exit() kthread_exit() +/* Declarations only; the implementations are outside this header. */ +int uread(proc_t *, void *, size_t, uintptr_t); +int uwrite(proc_t *, void *, size_t, uintptr_t); +#endif /* _OPENSOLARIS_SYS_PROC_H_ */ diff --git a/include/os/freebsd/spl/sys/processor.h b/include/os/freebsd/spl/sys/processor.h new file mode 100644 index 000000000000..53149840f21f --- /dev/null +++ b/include/os/freebsd/spl/sys/processor.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + */ + +/* + * Copyright 2014 Garrett D'Amore + * + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PROCESSOR_H +#define _SYS_PROCESSOR_H + +#include +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for p_online, processor_info & lgrp system calls. + */ + +/* + * Type for an lgrpid + */ +typedef uint16_t lgrpid_t; + +/* + * Type for processor name (CPU number). + */ +typedef int processorid_t; +typedef int chipid_t; /* chip identifier, also a plain int */ +/* getcpuid() evaluates to curcpu. */ +#define getcpuid() curcpu + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCESSOR_H */ diff --git a/include/os/freebsd/spl/sys/procfs_list.h b/include/os/freebsd/spl/sys/procfs_list.h new file mode 100644 index 000000000000..5d623c369c4c --- /dev/null +++ b/include/os/freebsd/spl/sys/procfs_list.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SPL_PROCFS_LIST_H +#define _SPL_PROCFS_LIST_H + +#include +#include + + +/* + * procfs list manipulation + */ +/* struct seq_file is an empty placeholder type in this header; seq_printf() is only declared here. */ +struct seq_file { }; +void seq_printf(struct seq_file *m, const char *fmt, ...); +/* List head: private pointer, lock, list, next-id counter and node offset. NOTE(review): pl_node_offset presumably locates the procfs_list_node_t inside each element -- confirm. */ +typedef struct procfs_list { + void *pl_private; + kmutex_t pl_lock; + list_t pl_list; + uint64_t pl_next_id; + size_t pl_node_offset; +} procfs_list_t; +/* Per-element linkage plus a 64-bit id. */ +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; +/* Install, uninstall, destroy and add entry points; the implementations are outside this header. */ +void procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); +void procfs_list_add(procfs_list_t *procfs_list, void *p); + +#endif /* _SPL_PROCFS_LIST_H */ diff --git a/include/os/freebsd/spl/sys/random.h b/include/os/freebsd/spl/sys/random.h new file mode 100644 index 000000000000..b3c9115f5305 --- /dev/null +++ b/include/os/freebsd/spl/sys/random.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_RANDOM_H_ +#define _OPENSOLARIS_SYS_RANDOM_H_ + +#include_next +/* Fills p with s bytes by calling arc4rand() with a third argument of 0 and always returns 0. The length is narrowed from size_t to int before the call. */ +static inline int +random_get_bytes(uint8_t *p, size_t s) +{ + arc4rand(p, (int)s, 0); + return (0); +} +/* Identical to random_get_bytes(): the same arc4rand() call and the same constant 0 result. */ +static inline int +random_get_pseudo_bytes(uint8_t *p, size_t s) +{ + arc4rand(p, (int)s, 0); + return (0); +} + +#endif /* !_OPENSOLARIS_SYS_RANDOM_H_ */ diff --git a/include/os/freebsd/spl/sys/rwlock.h b/include/os/freebsd/spl/sys/rwlock.h new file mode 100644 index 000000000000..10107a9bee84 --- /dev/null +++ b/include/os/freebsd/spl/sys/rwlock.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_RWLOCK_H_ +#define _OPENSOLARIS_SYS_RWLOCK_H_ + +#include +#include +#include +#include +/* The only rwlock type this layer defines; rw_init() asserts that type is 0 or this value. */ +typedef enum { + RW_DEFAULT = 4 /* kernel default rwlock */ +} krw_type_t; + +/* Lock modes passed as the how argument of rw_enter() and rw_tryenter(). */ +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; +/* A krwlock_t is a struct sx: readers take it shared, writers take it exclusive. */ +typedef struct sx krwlock_t; +/* SX_NOWITNESS is added to the init flags unless OPENSOLARIS_WITNESS is defined. */ +#ifndef OPENSOLARIS_WITNESS +#define RW_FLAGS (SX_DUPOK | SX_NOWITNESS) +#else +#define RW_FLAGS (SX_DUPOK) +#endif +/* The upper-case predicates forward to the lower-case macros. rw_init() asserts type is 0 or RW_DEFAULT, zeroes the sx, and names it after the stringified lock expression starting at its first lowercase ASCII letter (whole string if none); desc and arg are ignored. */ +#define RW_READ_HELD(x) (rw_read_held((x))) +#define RW_WRITE_HELD(x) (rw_write_held((x))) +#define RW_LOCK_HELD(x) (rw_lock_held((x))) +#define RW_ISWRITER(x) (rw_iswriter(x)) +/* BEGIN CSTYLED */ +#define rw_init(lock, desc, type, arg) do { \ + const char *_name; \ + ASSERT((type) == 0 || (type) == RW_DEFAULT); \ + KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \ + LO_EXPECTED, ("lock %s already initialized", #lock)); \ + bzero((lock), sizeof(struct sx)); \ + for (_name = #lock; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name
<= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #lock; \ + sx_init_flags((lock), _name, RW_FLAGS); \ +} while (0) +#define rw_destroy(lock) sx_destroy(lock) +#define rw_enter(lock, how) do { \ + if ((how) == RW_READER) \ + sx_slock(lock); \ + else /* if ((how) == RW_WRITER) */ \ + sx_xlock(lock); \ + } while (0) +/* rw_enter() takes the lock shared only for RW_READER and exclusive for every other how value; rw_tryenter() makes the same choice with the try variants. rw_read_held() inspects sx_lock directly: not SX_LOCK_UNLOCKED and the SX_LOCK_SHARED bit set. */ +#define rw_tryenter(lock, how) \ + ((how) == RW_READER ? sx_try_slock(lock) : sx_try_xlock(lock)) +#define rw_exit(lock) sx_unlock(lock) +#define rw_downgrade(lock) sx_downgrade(lock) +#define rw_tryupgrade(lock) sx_try_upgrade(lock) +#define rw_read_held(lock) \ + ((lock)->sx_lock != SX_LOCK_UNLOCKED && \ + ((lock)->sx_lock & SX_LOCK_SHARED)) +#define rw_write_held(lock) sx_xlocked(lock) +#define rw_lock_held(lock) (rw_read_held(lock) || rw_write_held(lock)) +#define rw_iswriter(lock) sx_xlocked(lock) +#define rw_owner(lock) sx_xholder(lock) + +/* END CSTYLED */ +#endif /* _OPENSOLARIS_SYS_RWLOCK_H_ */ diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h new file mode 100644 index 000000000000..496fc58d7c7b --- /dev/null +++ b/include/os/freebsd/spl/sys/sdt.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SDT_H_ +#define _OPENSOLARIS_SYS_SDT_H_ + +#include_next +/* BEGIN CSTYLED */ +#ifdef KDTRACE_HOOKS +SDT_PROBE_DECLARE(sdt, , , set__error); + +#define SET_ERROR(err) \ + ((sdt_sdt___set__error->id ? \ + (*sdt_probe_func)(sdt_sdt___set__error->id, \ + (uintptr_t)err, 0, 0, 0, 0) : 0), err) +#else +#define SET_ERROR(err) (err) +#endif + +#endif /* _OPENSOLARIS_SYS_SDT_H_ */ diff --git a/include/os/freebsd/spl/sys/sid.h b/include/os/freebsd/spl/sys/sid.h new file mode 100644 index 000000000000..18b6834250d5 --- /dev/null +++ b/include/os/freebsd/spl/sys/sid.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SID_H_ +#define _OPENSOLARIS_SYS_SID_H_ +/* A SID domain record: an owned copy of the domain string plus the size of that allocation. ksid_t is an opaque void type here. */ +typedef struct ksiddomain { + char *kd_name; /* Domain part of SID */ + uint_t kd_len; /* allocation size of kd_name, terminating NUL included */ +} ksiddomain_t; +typedef void ksid_t; +/* Allocates a new ksiddomain_t holding a private copy of domain; pair every call with ksiddomain_rele(). Each call allocates afresh -- nothing is looked up or shared. */ +static __inline ksiddomain_t * +ksid_lookupdomain(const char *domain) +{ + ksiddomain_t *kd; + size_t len; + + len = strlen(domain) + 1; + kd = kmem_alloc(sizeof (*kd), KM_SLEEP); + kd->kd_len = (uint_t)len; + kd->kd_name = kmem_alloc(len, KM_SLEEP); + strcpy(kd->kd_name, domain); + return (kd); +} +/* Frees kd_name using the recorded kd_len, then the structure itself. */ +static __inline void +ksiddomain_rele(ksiddomain_t *kd) +{ + + kmem_free(kd->kd_name, kd->kd_len); + kmem_free(kd, sizeof (*kd)); +} +/* The three ksid_get*() accessors are not expected to be reached: each one panics with its own name and never returns a value. */ +static __inline uint_t +ksid_getid(ksid_t *ks) +{ + + panic("%s has been unexpectedly called", __func__); +} + +static __inline const char * +ksid_getdomain(ksid_t *ks) +{ + + panic("%s has been unexpectedly called", __func__); +} + +static __inline uint_t +ksid_getrid(ksid_t *ks) +{ + + panic("%s has been unexpectedly called", __func__); +} +/* Both kidmap lookups ignore their arguments and evaluate to 1. NOTE(review): presumably a nonzero result tells callers the mapping failed -- confirm. */ +#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1) +#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1) + +#endif /* _OPENSOLARIS_SYS_SID_H_ */ diff --git
a/include/os/freebsd/spl/sys/sig.h b/include/os/freebsd/spl/sys/sig.h new file mode 100644 index 000000000000..426a9e827ecb --- /dev/null +++ b/include/os/freebsd/spl/sys/sig.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2008 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SIG_H_ +#define _OPENSOLARIS_SYS_SIG_H_ + +#include_next +#include +#include +#include +#include +#include +#include +/* The two values accepted for the why argument of issig(). */ +#define FORREAL 0 +#define JUSTLOOKING 1 +/* Reports whether the current thread has a signal to act on. Returns 0 when SIGPENDING() is false. With JUSTLOOKING a pending signal is enough to return 1. With FORREAL it returns 1 only when cursig(), called with the process lock and ps_mtx held, yields a nonzero signal number. */ +static __inline int +issig(int why) +{ + struct thread *td = curthread; + struct proc *p; + int sig; + + ASSERT(why == FORREAL || why == JUSTLOOKING); + if (SIGPENDING(td)) { + if (why == JUSTLOOKING) + return (1); + p = td->td_proc; + PROC_LOCK(p); + mtx_lock(&p->p_sigacts->ps_mtx); + sig = cursig(td); + mtx_unlock(&p->p_sigacts->ps_mtx); + PROC_UNLOCK(p); + if (sig != 0) + return (1); + } + return (0); +} +#endif /* _OPENSOLARIS_SYS_SIG_H_ */ diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h new file mode 100644 index 000000000000..53503e838912 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _FREEBSD_SIMD_H +#define _FREEBSD_SIMD_H +#if defined(__amd64__) || defined(__i386__) +#include +#else + +#define kfpu_allowed() 0 +#define kfpu_initialize(tsk) do {} while (0) +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) +#endif +#endif diff --git a/include/os/freebsd/spl/sys/simd_x86.h b/include/os/freebsd/spl/sys/simd_x86.h new file mode 100644 index 000000000000..da846fcbe6c2 --- /dev/null +++ b/include/os/freebsd/spl/sys/simd_x86.h @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include +#include +#include +#include +#include + +#define kfpu_init() (0) +#define kfpu_fini() do {} while (0) +#define kfpu_allowed() 1 +#define kfpu_initialize(tsk) do {} while (0) +/* Bracket kernel FPU use; do/while (0) makes each macro one statement. */ +#define kfpu_begin() do { \ + critical_enter(); \ + fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); \ +} while (0) + +#define kfpu_end() \ + do { \ + fpu_kern_leave(curthread, NULL); \ + critical_exit(); \ + } while (0) + +/* + * Check if OS supports AVX and AVX2 by checking XCR0 + * Only call this function if CPUID indicates that AVX feature is + * supported by the CPU, otherwise it might be an illegal instruction.
+ */ +static inline uint64_t +xgetbv(uint32_t index) +{ + uint32_t eax, edx; + /* xgetbv - instruction byte code */ + __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (index)); + + return ((((uint64_t)edx)<<32) | (uint64_t)eax); +} + + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + + has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE); + + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ + return (!!(cpu_feature & CPUID_SSE)); +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ + return (!!(cpu_feature & CPUID_SSE2)); +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSE3)); +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSSE3)); +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSE41)); +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ + return (!!(cpu_feature2 & CPUID2_SSE42)); +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + boolean_t has_avx; + + has_avx = !!(cpu_feature2 & CPUID2_AVX); + + return (has_avx && __ymm_enabled()); +} + +/* + * Check 
if AVX2 instruction set is available + */ +static inline boolean_t +zfs_avx2_available(void) +{ + boolean_t has_avx2; + + has_avx2 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX2); + + return (has_avx2 && __ymm_enabled()); +} + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + + +/* Check if AVX512F instruction set is available */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512CD instruction set is available */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512CD); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512ER instruction set is available */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512ER); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512PF instruction set is available */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512PF); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512BW instruction set is available */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +
has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512BW); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512DQ instruction set is available */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512DQ); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VL instruction set is available */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512IFMA instruction set is available */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature & CPUID_STDEXT_AVX512IFMA); + + return (has_avx512 && __zmm_enabled()); +} + +/* Check if AVX512VBMI instruction set is available */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t has_avx512; + + has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) && + !!(cpu_stdext_feature2 & CPUID_STDEXT2_AVX512VBMI); + + return (has_avx512 && __zmm_enabled()); +} diff --git a/include/os/freebsd/spl/sys/spl_condvar.h b/include/os/freebsd/spl/sys/spl_condvar.h new file mode 100644 index 000000000000..7405f647d59a --- /dev/null +++ b/include/os/freebsd/spl/sys/spl_condvar.h @@ -0,0 +1,81 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2000 Jake Burkholder . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_SYS_CONDVAR_H_ +#define _SPL_SYS_CONDVAR_H_ + +#ifndef LOCORE +#include + +struct lock_object; +struct thread; + +TAILQ_HEAD(cv_waitq, thread); + +/* + * Condition variable. The waiters count is protected by the mutex that + * protects the condition; that is, the mutex that is passed to cv_wait*() + * and is held across calls to cv_signal() and cv_broadcast(). It is an + * optimization to avoid looking up the sleep queue if there are no waiters. 
+ */ +struct cv { + const char *cv_description; + int cv_waiters; +}; + +void cv_init(struct cv *cvp, const char *desc); +void cv_destroy(struct cv *cvp); + +void _cv_wait(struct cv *cvp, struct lock_object *lock); +void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock); +int _cv_wait_sig(struct cv *cvp, struct lock_object *lock); +int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock, + sbintime_t sbt, sbintime_t pr, int flags); +int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock, + sbintime_t sbt, sbintime_t pr, int flags); + +void cv_signal(struct cv *cvp); +void cv_broadcastpri(struct cv *cvp, int pri); + +#define cv_wait(cvp, lock) \ + _cv_wait((cvp), &(lock)->lock_object) +#define cv_wait_unlock(cvp, lock) \ + _cv_wait_unlock((cvp), &(lock)->lock_object) +#define cv_timedwait_sbt(cvp, lock, sbt, pr, flags) \ + _cv_timedwait_sbt((cvp), &(lock)->lock_object, (sbt), (pr), (flags)) +#define cv_timedwait_sig_sbt(cvp, lock, sbt, pr, flags) \ + _cv_timedwait_sig_sbt((cvp), &(lock)->lock_object, (sbt), (pr), (flags)) + +#define cv_broadcast(cvp) cv_broadcastpri(cvp, 0) + +#define cv_wmesg(cvp) ((cvp)->cv_description) + +#endif /* !LOCORE */ +#endif /* _SPL_SYS_CONDVAR_H_ */ diff --git a/include/os/freebsd/spl/sys/string.h b/include/os/freebsd/spl/sys/string.h new file mode 100644 index 000000000000..859b40285a94 --- /dev/null +++ b/include/os/freebsd/spl/sys/string.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_STRING_H_ +#define _OPENSOLARIS_SYS_STRING_H_ + +#include + +char *strpbrk(const char *, const char *); +void strident_canon(char *, size_t); +void kmem_strfree(char *); +char *kmem_strdup(const char *s); + +#endif /* _OPENSOLARIS_SYS_STRING_H_ */ diff --git a/include/os/freebsd/spl/sys/strings.h b/include/os/freebsd/spl/sys/strings.h new file mode 100644 index 000000000000..651685d30473 --- /dev/null +++ b/include/os/freebsd/spl/sys/strings.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/sunddi.h b/include/os/freebsd/spl/sys/sunddi.h new file mode 100644 index 000000000000..bb76cd9641c7 --- /dev/null +++ b/include/os/freebsd/spl/sys/sunddi.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . 
+ * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include +#include +#include +#include +#include + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE (-1) + +#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*(x5) = NULL) +#define ddi_prop_free(x) ((void)0) +#define ddi_root_node() ((void)0) + +extern int ddi_strtoul(const char *, char **, int, unsigned long *); +extern int ddi_strtol(const char *, char **, int, long *); +extern int ddi_strtoull(const char *, char **, int, unsigned long long *); +extern int ddi_strtoll(const char *, char **, int, long long *); + +extern int ddi_copyin(const void *from, void *to, size_t len, int flags); +extern int ddi_copyout(const void *from, void *to, size_t len, int flags); +extern void ddi_sysevent_init(void); + + +int ddi_soft_state_init(void **statep, size_t size, size_t nitems); +void ddi_soft_state_fini(void **statep); + +void *ddi_get_soft_state(void *state, int item); +int ddi_soft_state_zalloc(void *state, int item); +void ddi_soft_state_free(void *state, int item); + +#endif /* _SPL_SUNDDI_H */ diff --git
a/include/os/freebsd/spl/sys/sysmacros.h b/include/os/freebsd/spl/sys/sysmacros.h new file mode 100644 index 000000000000..5afca10447e7 --- /dev/null +++ b/include/os/freebsd/spl/sys/sysmacros.h @@ -0,0 +1,404 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSMACROS_H +#define _SYS_SYSMACROS_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Some macros for units conversion + */ +/* + * Disk blocks (sectors) and bytes. + */ +#define dtob(DD) ((DD) << DEV_BSHIFT) +#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) +#define btodt(BB) ((BB) >> DEV_BSHIFT) +#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef SIGNOF +#define SIGNOF(a) ((a) < 0 ? 
-1 : (a) > 0) +#endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define boot_ncpus mp_ncpus +#define kpreempt_disable() critical_enter() +#define kpreempt_enable() critical_exit() +#define CPU_SEQID curcpu +#define is_system_labeled() 0 +/* + * Convert a single byte to/from binary-coded decimal (BCD). + */ +extern unsigned char byte_to_bcd[256]; +extern unsigned char bcd_to_byte[256]; + +#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff] +#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff] + +/* + * WARNING: The device number macros defined here should not be used by device + * drivers or user software. Device drivers should use the device functions + * defined in the DDI/DKI interface (see also ddi.h). Application software + * should make use of the library routines available in makedev(3). A set of + * new device macros are provided to operate on the expanded device number + * format supported in SVR4. Macro versions of the DDI device functions are + * provided for use by kernel proper routines only. Macro routines bmajor(), + * major(), minor(), emajor(), eminor(), and makedev() will be removed or + * their definitions changed at the next major release following SVR4. + */ + +#define O_BITSMAJOR 7 /* # of SVR3 major device bits */ +#define O_BITSMINOR 8 /* # of SVR3 minor device bits */ +#define O_MAXMAJ 0x7f /* SVR3 max major value */ +#define O_MAXMIN 0xff /* SVR3 max minor value */ + + +#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */ +#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */ +#define L_MAXMAJ32 0x3fff /* SVR4 max major value */ +#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. 
*/ + /* For 3b2 hardware devices the minor is */ + /* restricted to 256 (0-255) */ + +#ifdef _LP64 +#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */ +#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */ +#define L_MAXMAJ 0xfffffffful /* max major value */ +#define L_MAXMIN 0xfffffffful /* max minor value */ +#else +#define L_BITSMAJOR L_BITSMAJOR32 +#define L_BITSMINOR L_BITSMINOR32 +#define L_MAXMAJ L_MAXMAJ32 +#define L_MAXMIN L_MAXMIN32 +#endif + +/* + * These are versions of the kernel routines for compressing and + * expanding long device numbers that don't return errors. + */ +#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR) + +#define DEVCMPL(x) (x) +#define DEVEXPL(x) (x) + +#else + +#define DEVCMPL(x) \ + (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \ + ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \ + ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32))) + +#define DEVEXPL(x) \ + (((x) == NODEV32) ? NODEV : \ + makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32)) + +#endif /* L_BITSMAJOR32 ... */ + +/* convert to old (SVR3.2) dev format */ + +#define cmpdev(x) \ + (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \ + ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \ + ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN))) + +/* convert to new (SVR4) dev format */ + +#define expdev(x) \ + (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \ + ((x) & O_MAXMIN)) + +/* + * Macro for checking power of 2 address alignment. + */ +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Macros for counting and rounding. + */ +#define howmany(x, y) (((x)+((y)-1))/(y)) +#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) + +/* + * Macro to determine if value is a power of 2 + */ +#define ISP2(x) (((x) & ((x) - 1)) == 0) + +/* + * Macros for various sorts of alignment and rounding. The "align" must + * be a power of 2. 
Often times it is a block, sector, or page. + */ + +/* + * return x rounded down to an align boundary + * eg, P2ALIGN(1200, 1024) == 1024 (1*align) + * eg, P2ALIGN(1024, 1024) == 1024 (1*align) + * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align) + * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align) + */ +#define P2ALIGN(x, align) ((x) & -(align)) + +/* + * return x % (mod) align + * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align) + * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align) + */ +#define P2PHASE(x, align) ((x) & ((align) - 1)) + +/* + * return how much space is left in this block (but if it's perfectly + * aligned, return 0). + * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x) + * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x) + */ +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) + +/* + * return x rounded up to an align boundary + * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align) + * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align) + */ +#define P2ROUNDUP(x, align) (-(-(x) & -(align))) + +/* + * return the ending address of the block that x is in + * eg, P2END(0x1234, 0x100) == 0x12ff (0x13*align - 1) + * eg, P2END(0x5600, 0x100) == 0x56ff (0x57*align - 1) + */ +#define P2END(x, align) (-(~(x) & -(align))) + +/* + * return x rounded up to the next phase (offset) within align. + * phase should be < align. + * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase) + * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase) + */ +#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align))) + +/* + * return TRUE if adding len to off would cause it to cross an align + * boundary. + * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314) + * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284) + */ +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Return TRUE if they have the same highest bit set. 
+ * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000) + * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000) + */ +#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y))) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + (-(-(type)(x) & -(type)(align))) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +/* + * Macros to atomically increment/decrement a variable. mutex and var + * must be pointers. + */ +#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex) +#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex) + +/* + * Macros to declare bitfields - the order in the parameter list is + * Low to High - that is, declare bit 0 first. We only support 8-bit bitfields + * because if a field crosses a byte boundary it's not likely to be meaningful + * without reassembly in its nonnative endianness. 
+ */ +#if defined(_BIT_FIELDS_LTOH) +#define DECL_BITFIELD2(_a, _b) \ + uint8_t _a, _b +#define DECL_BITFIELD3(_a, _b, _c) \ + uint8_t _a, _b, _c +#define DECL_BITFIELD4(_a, _b, _c, _d) \ + uint8_t _a, _b, _c, _d +#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \ + uint8_t _a, _b, _c, _d, _e +#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \ + uint8_t _a, _b, _c, _d, _e, _f +#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \ + uint8_t _a, _b, _c, _d, _e, _f, _g +#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \ + uint8_t _a, _b, _c, _d, _e, _f, _g, _h +#elif defined(_BIT_FIELDS_HTOL) +#define DECL_BITFIELD2(_a, _b) \ + uint8_t _b, _a +#define DECL_BITFIELD3(_a, _b, _c) \ + uint8_t _c, _b, _a +#define DECL_BITFIELD4(_a, _b, _c, _d) \ + uint8_t _d, _c, _b, _a +#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \ + uint8_t _e, _d, _c, _b, _a +#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \ + uint8_t _f, _e, _d, _c, _b, _a +#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \ + uint8_t _g, _f, _e, _d, _c, _b, _a +#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \ + uint8_t _h, _g, _f, _e, _d, _c, _b, _a +#else +#error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined +#endif /* _BIT_FIELDS_LTOH */ + +#if !defined(_KMEMUSER) && !defined(offsetof) + +/* avoid any possibility of clashing with version */ + +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + * High order bit is 31 (or 63 in _LP64 kernel). 
+ */ +static __inline int +highbit(ulong_t i) +{ +#if defined(HAVE_INLINE_FLSL) + return (flsl(i)); +#else + int h = 1; + + if (i == 0) + return (0); +#ifdef _LP64 + if (i & 0xffffffff00000000ul) { + h += 32; i >>= 32; + } +#endif + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +#endif +} + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + */ +static __inline int +highbit64(uint64_t i) +{ +#if defined(HAVE_INLINE_FLSLL) + return (flsll(i)); +#else + int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ULL) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSMACROS_H */ diff --git a/include/os/freebsd/spl/sys/systeminfo.h b/include/os/freebsd/spl/sys/systeminfo.h new file mode 100644 index 000000000000..4028cd7cc6fd --- /dev/null +++ b/include/os/freebsd/spl/sys/systeminfo.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSTEMINFO_H_ +#define _SYS_SYSTEMINFO_H_ + +#define HW_HOSTID_LEN 11 + +#endif /* !_SYS_SYSTEMINFO_H_ */ diff --git a/include/os/freebsd/spl/sys/systm.h b/include/os/freebsd/spl/sys/systm.h new file mode 100644 index 000000000000..53cc6f52717e --- /dev/null +++ b/include/os/freebsd/spl/sys/systm.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SYSTM_H_ +#define _OPENSOLARIS_SYS_SYSTM_H_ + +#include +#include_next + +#include + +#define PAGESIZE PAGE_SIZE +#define PAGEOFFSET (PAGESIZE - 1) +#define PAGEMASK (~PAGEOFFSET) + +#define delay(x) pause("soldelay", (x)) + +#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h new file mode 100644 index 000000000000..aaa435f2f372 --- /dev/null +++ b/include/os/freebsd/spl/sys/taskq.h @@ -0,0 +1,114 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#ifndef _SYS_TASKQ_H +#define _SYS_TASKQ_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +struct taskqueue; +struct taskq { + struct taskqueue *tq_queue; +}; + +typedef struct taskq taskq_t; +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +typedef struct taskq_ent { + struct task tqent_task; + task_func_t *tqent_func; + void *tqent_arg; + struct timeout_task tqent_timeout_task; + int tqent_type; + int tqent_gen; +} taskq_ent_t; + +struct proc; + +/* + * Public flags for taskq_create(): bit range 0-15 + */ +#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. 
+ */ +#define TQ_SLEEP 0x00 /* Can block for memory */ +#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ + +#define TASKQID_INVALID ((taskqid_t)0) + +#define taskq_init_ent(x) +extern taskq_t *system_taskq; +/* Global dynamic task queue for long delay */ +extern taskq_t *system_delay_taskq; + +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, + uint_t, clock_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + struct proc *, uint_t); +taskq_t *taskq_create_sysdc(const char *, int, int, int, + struct proc *, uint_t, uint_t); +void nulltask(void *); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_wait_outstanding(taskq_t *, taskqid_t); +extern void taskq_wait(taskq_t *); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_member(taskq_t *, kthread_t *); +extern taskq_t *taskq_of_curthread(void); +void taskq_suspend(taskq_t *); +int taskq_suspended(taskq_t *); +void taskq_resume(taskq_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_H */ diff --git a/include/os/freebsd/spl/sys/thread.h b/include/os/freebsd/spl/sys/thread.h new file mode 100644 index 000000000000..4fb1a542f55f --- /dev/null +++ b/include/os/freebsd/spl/sys/thread.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_THREAD_H_ +#define _SPL_THREAD_H_ +/* Accessors that expand to fields of curthread; parenthesized so they compose safely inside larger expressions. NOTE(review): getpid() yields td_tid, presumably a thread id rather than a process id -- confirm callers expect that. */ +#define getcomm() (curthread->td_name) +#define getpid() (curthread->td_tid) +#endif diff --git a/include/os/freebsd/spl/sys/time.h b/include/os/freebsd/spl/sys/time.h new file mode 100644 index 000000000000..fbc679aacf93 --- /dev/null +++ b/include/os/freebsd/spl/sys/time.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_TIME_H_ +#define _OPENSOLARIS_SYS_TIME_H_ +#pragma once +#include_next +#include +#ifndef _SYS_KERNEL_H_ +extern int hz; +#endif + +#define SEC 1 +#define MILLISEC 1000UL +#define MICROSEC 1000000UL +#define NANOSEC 1000000000UL +#define TIME_MAX LLONG_MAX + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + +typedef longlong_t hrtime_t; + +#if defined(__i386__) || defined(__powerpc__) +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX) +#else +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX) +#endif + +#define SEC_TO_TICK(sec) ((sec) * hz) +#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) +/* gethrtime(): reads nanouptime() and flattens the timespec to tv_sec * NANOSEC + tv_nsec. */ +static __inline hrtime_t +gethrtime(void) +{ + struct timespec ts; + hrtime_t nsec; + + nanouptime(&ts); + nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; + return (nsec); +} + +#define gethrestime_sec() (time_second) +#define gethrestime(ts) getnanotime(ts) +#define gethrtime_waitfree() gethrtime() + +extern int nsec_per_tick; /* nanoseconds per clock tick */ +/* ddi_get_lbolt64(): getsbinuptime() multiplied by hz with a total right shift of 32 bits, split into two 16-bit shifts around the multiply; presumably the uptime expressed in ticks -- confirm. */ +#define ddi_get_lbolt64() \ + (int64_t)(((getsbinuptime() >> 16) * hz) >> 16) +#define ddi_get_lbolt() (clock_t)ddi_get_lbolt64() +/* NOTE(review): every other conditional above is already closed, so the #else below pairs with the include guard's #ifndef; together with #pragma once the branch after it looks unreachable -- confirm whether a separate conditional was intended. */ +#else + +static __inline hrtime_t +gethrtime(void) +{ + struct timespec ts; + clock_gettime(CLOCK_UPTIME, &ts); + return (((u_int64_t)ts.tv_sec) * NANOSEC + ts.tv_nsec); +} +#endif /* !_OPENSOLARIS_SYS_TIME_H_ */ diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h new file mode 100644 index 000000000000..d4694bb7c09c --- /dev/null +++ b/include/os/freebsd/spl/sys/timer.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ +/* Sleep/timer compatibility macros. ddi_time_after*() are plain '>' comparisons with no wraparound handling. usleep_range() sleeps through pause_sbt() for ustosbt(wakeup) and passes ustosbt(wakeupepsilon - wakeup) as the following argument; both operands of the subtraction are parenthesized so expression arguments are subtracted as a whole. */ +#ifndef _SPL_TIMER_H_ +#define _SPL_TIMER_H_ +#define ddi_time_after(a, b) ((a) > (b)) +#define ddi_time_after64(a, b) ((a) > (b)) +#define usleep_range(wakeup, wakeupepsilon) \ + pause_sbt("usleep_range", ustosbt(wakeup), \ + ustosbt((wakeupepsilon) - (wakeup)), 0) +/* schedule(): implemented as pause() with a timeout argument of 1. */ +#define schedule() pause("schedule", 1) +#endif diff --git a/include/os/freebsd/spl/sys/trace.h b/include/os/freebsd/spl/sys/trace.h new file mode 100644 index 000000000000..d9639d27b60e --- /dev/null +++ b/include/os/freebsd/spl/sys/trace.h @@ -0,0 +1 @@ +/* keep me */ diff --git a/include/os/freebsd/spl/sys/trace_zfs.h b/include/os/freebsd/spl/sys/trace_zfs.h new file mode 100644 index 000000000000..d9639d27b60e --- /dev/null +++ b/include/os/freebsd/spl/sys/trace_zfs.h @@ -0,0 +1 @@ +/* keep me */ diff --git a/include/os/freebsd/spl/sys/types.h b/include/os/freebsd/spl/sys/types.h new file mode 100644 index 000000000000..ed5e8ef80bf9 --- /dev/null +++ b/include/os/freebsd/spl/sys/types.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_SYS_TYPES_H_ +#define _SPL_SYS_TYPES_H_ + +#pragma once +/* + * This is a bag of dirty hacks to keep things compiling. + */ + +#include + +typedef int64_t clock_t; +#define _CLOCK_T_DECLARED + +#include_next +#include +#include + +#define MAXNAMELEN 256 + +typedef struct timespec timestruc_t; +typedef struct timespec timespec_t; +typedef struct timespec inode_timespec_t; +/* BEGIN CSTYLED */ +typedef u_int uint_t; +typedef u_char uchar_t; +typedef u_short ushort_t; +typedef u_long ulong_t; +typedef u_int minor_t; +/* END CSTYLED */ +#ifndef _OFF64_T_DECLARED +#define _OFF64_T_DECLARED +typedef off_t off64_t; +#endif +typedef id_t taskid_t; +typedef id_t projid_t; +typedef id_t poolid_t; +typedef id_t zoneid_t; +typedef id_t ctid_t; +typedef mode_t o_mode_t; +typedef uint64_t pgcnt_t; + +#define B_FALSE 0 +#define B_TRUE 1 + +typedef short index_t; +typedef off_t offset_t; +#ifndef _PTRDIFF_T_DECLARED +typedef __ptrdiff_t ptrdiff_t; /* pointer difference */ +#define _PTRDIFF_T_DECLARED +#endif +typedef int64_t rlim64_t; +typedef int major_t; +/* NOTE(review): all nested conditionals above are closed, so the #else below pairs with the include guard's #ifndef; the boolean_t, u_offset_t, len_t and diskaddr_t typedefs after it are therefore skipped on first inclusion -- confirm this is intended. */ +#else +#ifdef NEED_SOLARIS_BOOLEAN +#if defined(__XOPEN_OR_POSIX) +typedef enum { _B_FALSE, _B_TRUE } boolean_t; +#else +typedef enum { B_FALSE, B_TRUE } boolean_t; +#endif /* defined(__XOPEN_OR_POSIX) */ +#endif + +typedef u_longlong_t u_offset_t; +typedef u_longlong_t len_t; + +typedef longlong_t diskaddr_t; + + +#endif /* !_SPL_SYS_TYPES_H_ */ diff
--git a/include/os/freebsd/spl/sys/types32.h b/include/os/freebsd/spl/sys/types32.h new file mode 100644 index 000000000000..907b667e5d80 --- /dev/null +++ b/include/os/freebsd/spl/sys/types32.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* _SPL_TYPES32_H */ diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h new file mode 100644 index 000000000000..fe5e24b99d10 --- /dev/null +++ b/include/os/freebsd/spl/sys/uio.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_UIO_H_ +#define _OPENSOLARIS_SYS_UIO_H_ + +#include_next +#include +#include + + + +#define uio_loffset uio_offset + +typedef struct uio uio_t; +typedef struct iovec iovec_t; +typedef enum uio_seg uio_seg_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY +} xuio_type_t; + +typedef struct xuio { + uio_t xu_uio; + + /* Extended uio fields */ + enum xuio_type xu_type; /* What kind of uio structure? */ + union { + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; +/* Accessors for the xu_zc extension fields; the argument and the whole expansion are parenthesized so any pointer expression can be passed and the result composes safely (it remains usable as an lvalue). */ +#define XUIO_XUZC_PRIV(xuio) ((xuio)->xu_ext.xu_zc.xu_zc_priv) +#define XUIO_XUZC_RW(xuio) ((xuio)->xu_ext.xu_zc.xu_zc_rw) +/* zfs_uiomove(): asserts that uio->uio_rw equals dir, then forwards to the three-argument uiomove(). NOTE(review): n is narrowed from size_t to int by the cast. */ +static __inline int +zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) +{ +/* The uiomove macro is only defined after this function, so the call below is not rewritten to zfs_uiomove(). */ + ASSERT(uio->uio_rw == dir); + return (uiomove(cp, (int)n, uio)); +} +#define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio)) + +int uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes); +void uioskip(uio_t *uiop, size_t n); + +#endif /* !_OPENSOLARIS_SYS_UIO_H_ */ diff --git a/include/os/freebsd/spl/sys/uuid.h b/include/os/freebsd/spl/sys/uuid.h new file mode 100644 index 000000000000..26d46e8d6214 --- /dev/null +++ b/include/os/freebsd/spl/sys/uuid.h @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UUID_H +#define _SYS_UUID_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The copyright in this file is taken from the original Leach + * & Salz UUID specification, from which this implementation + * is derived. + */ + +/* + * Copyright (c) 1990- 1993, 1996 Open Software Foundation, Inc. + * Copyright (c) 1989 by Hewlett-Packard Company, Palo Alto, Ca. & + * Digital Equipment Corporation, Maynard, Mass. Copyright (c) 1998 + * Microsoft. To anyone who acknowledges that this file is provided + * "AS IS" without any express or implied warranty: permission to use, + * copy, modify, and distribute this file for any purpose is hereby + * granted without fee, provided that the above copyright notices and + * this notice appears in all source code copies, and that none of the + * names of Open Software Foundation, Inc., Hewlett-Packard Company, + * or Digital Equipment Corporation be used in advertising or + * publicity pertaining to distribution of the software without + * specific, written prior permission. Neither Open Software + * Foundation, Inc., Hewlett-Packard Company, Microsoft, nor Digital + * Equipment Corporation makes any representations about the + * suitability of this software for any purpose. 
+ */ + +#include +#include + +typedef struct { + uint8_t nodeID[6]; +} uuid_node_t; + +/* + * The uuid type used throughout when referencing uuids themselves + */ +typedef struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint8_t clock_seq_hi_and_reserved; + uint8_t clock_seq_low; + uint8_t node_addr[6]; +} uuid_t; + +#define UUID_PRINTABLE_STRING_LENGTH 37 + +/* + * Convert a uuid to/from little-endian format (do/while (0) makes the macro a single statement, so it is safe in an unbraced if/else) + */ +#define UUID_LE_CONVERT(dest, src) \ +do { \ + (dest) = (src); \ + (dest).time_low = LE_32((dest).time_low); \ + (dest).time_mid = LE_16((dest).time_mid); \ + (dest).time_hi_and_version = LE_16((dest).time_hi_and_version); \ +} while (0) +/* NOTE(review): uuid_is_null() is a stub here -- it ignores its argument and always returns 0. */ +static __inline int +uuid_is_null(const caddr_t uuid) +{ + return (0); +} +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UUID_H */ diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h new file mode 100644 index 000000000000..a432f6c56739 --- /dev/null +++ b/include/os/freebsd/spl/sys/vfs.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VFS_H_ +#define _OPENSOLARIS_SYS_VFS_H_ + +#include +#include +#include + +#define rootdir rootvnode + +struct thread; +struct vnode; +typedef struct mount vfs_t; + +typedef int umode_t; + +#define vfs_flag mnt_flag +#define vfs_data mnt_data +#define vfs_count mnt_ref +#define vfs_fsid mnt_stat.f_fsid +#define vfs_bsize mnt_stat.f_bsize +#define vfs_resource mnt_stat.f_mntfromname + +#define v_flag v_vflag +#define v_vfsp v_mount + +#define VFS_RDONLY MNT_RDONLY +#define VFS_NOSETUID MNT_NOSUID +#define VFS_NOEXEC MNT_NOEXEC + +#define fs_vscan(vp, cr, async) (0) + +#define VROOT VV_ROOT + +#define XU_NGROUPS 16 + +/* + * Structure defining a mount option for a filesystem. 
+ * option names are found in mntent.h + */ +typedef struct mntopt { + char *mo_name; /* option name */ + char **mo_cancel; /* list of options cancelled by this one */ + char *mo_arg; /* argument string for this option */ + int mo_flags; /* flags for this mount option */ + void *mo_data; /* filesystem specific data */ +} mntopt_t; + +/* + * Flags that apply to mount options + */ + +#define MO_SET 0x01 /* option is set */ +#define MO_NODISPLAY 0x02 /* option not listed in mnttab */ +#define MO_HASVALUE 0x04 /* option takes a value */ +#define MO_IGNORE 0x08 /* option ignored by parser */ +#define MO_DEFAULT MO_SET /* option is on by default */ +#define MO_TAG 0x10 /* flags a tag set by user program */ +#define MO_EMPTY 0x20 /* empty space in option table */ + +#define VFS_NOFORCEOPT 0x01 /* honor MO_IGNORE (don't set option) */ +#define VFS_DISPLAY 0x02 /* Turn off MO_NODISPLAY bit for opt */ +#define VFS_NODISPLAY 0x04 /* Turn on MO_NODISPLAY bit for opt */ +#define VFS_CREATEOPT 0x08 /* Create the opt if it's not there */ + +/* + * Structure holding mount option strings for the mounted file system. 
+ */ +typedef struct mntopts { + uint_t mo_count; /* number of entries in table */ + mntopt_t *mo_list; /* list of mount options */ +} mntopts_t; + +void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused); +void vfs_clearmntopt(vfs_t *vfsp, const char *name); +int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp); +int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, + char *fspath, char *fspec, int fsflags); + +typedef uint64_t vfs_feature_t; + +#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */ +#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */ +#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */ +#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */ +#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */ +#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */ +#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */ +#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */ +#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */ +#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 + /* Supports loaning/returning cache buffers */ +/* Feature bits are not tracked by these macros: set and clear expand to empty statements and vfs_has_feature() always evaluates to 0. */ +#define vfs_set_feature(vfsp, feature) do { } while (0) +#define vfs_clear_feature(vfsp, feature) do { } while (0) +#define vfs_has_feature(vfsp, feature) (0) + +#include +#endif /* _OPENSOLARIS_SYS_VFS_H_ */ diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h new file mode 100644 index 000000000000..07ee6bc191a7 --- /dev/null +++ b/include/os/freebsd/spl/sys/vm.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VM_H_ +#define _OPENSOLARIS_SYS_VM_H_ + +#include + +extern const int zfs_vm_pagerret_bad; +extern const int zfs_vm_pagerret_error; +extern const int zfs_vm_pagerret_ok; +extern const int zfs_vm_pagerput_sync; +extern const int zfs_vm_pagerput_inval; + +void zfs_vmobject_assert_wlocked(vm_object_t object); +void zfs_vmobject_wlock(vm_object_t object); +void zfs_vmobject_wunlock(vm_object_t object); +/* zfs_map_page(): allocates an sf_buf for pp with flags 0, stores it through sfp, and returns the address reported by sf_buf_kva() cast to caddr_t. Presumably paired with zfs_unmap_page() on the same sf_buf. */ +static inline caddr_t +zfs_map_page(vm_page_t pp, struct sf_buf **sfp) +{ + *sfp = sf_buf_alloc(pp, 0); + return ((caddr_t)sf_buf_kva(*sfp)); +} +/* zfs_unmap_page(): hands sf to sf_buf_free(). */ +static inline void +zfs_unmap_page(struct sf_buf *sf) +{ + sf_buf_free(sf); +} + +#endif /* _OPENSOLARIS_SYS_VM_H_ */ diff --git a/include/os/freebsd/spl/sys/vmsystm.h b/include/os/freebsd/spl/sys/vmsystm.h new file mode 100644 index 000000000000..0db34bbe438b --- /dev/null +++ b/include/os/freebsd/spl/sys/vmsystm.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_VMSYSTM_H_ +#define _SPL_VMSYSTM_H_ + +#define xcopyout copyout + +#endif diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h new file mode 100644 index 000000000000..e330bc079f60 --- /dev/null +++ b/include/os/freebsd/spl/sys/vnode.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VNODE_H_ +#define _OPENSOLARIS_SYS_VNODE_H_ + +struct vnode; +struct vattr; +struct xucred; + +typedef struct flock flock64_t; +typedef struct vnode vnode_t; +typedef struct vattr vattr_t; +typedef enum vtype vtype_t; + +#include +#include +#include_next +#include +enum symfollow { NO_FOLLOW = NOFOLLOW }; + +#define NOCRED ((struct ucred *)0) /* no credential available */ +#define F_FREESP 11 /* Free file space */ + +#include +#include +#include_next +#include +#include +#include +#include_next +#include +#include +#include + +typedef struct vop_vector vnodeops_t; +#define VOP_FID VOP_VPTOFH +#define vop_fid vop_vptofh +#define vop_fid_args vop_vptofh_args +#define a_fid a_fhp + +#define IS_XATTRDIR(dvp) (0) + +#define v_count v_usecount + +#define rootvfs (rootvnode == NULL ? 
NULL : rootvnode->v_mount) +/* Returns non-zero when the vnode's mount has MNT_RDONLY set. */ +static __inline int +vn_is_readonly(vnode_t *vp) +{ + return (vp->v_mount->mnt_flag & MNT_RDONLY); +} +#define vn_vfswlock(vp) (0) +#define vn_vfsunlock(vp) do { } while (0) +#define vn_ismntpt(vp) \ + ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) +#define vn_mountedvfs(vp) ((vp)->v_mountedhere) +#define vn_has_cached_data(vp) \ + ((vp)->v_object != NULL && \ + (vp)->v_object->resident_page_count > 0) +#define vn_exists(vp) do { } while (0) +#define vn_invalid(vp) do { } while (0) +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) +#define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) + +#define VN_HOLD(v) vref(v) +#define VN_RELE(v) vrele(v) +#define VN_URELE(v) vput(v) + +#define vnevent_create(vp, ct) do { } while (0) +#define vnevent_link(vp, ct) do { } while (0) +#define vnevent_remove(vp, dvp, name, ct) do { } while (0) +#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest_dir(vp, ct) do { } while (0) + +#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) +#define MANDLOCK(vp, mode) (0) + +/* + * We will use va_spare in place of Solaris' va_mask. + * This field is initialized in zfs_setattr(). + */ +#define va_mask va_spare +/* TODO: va_fileid is shorter than va_nodeid !!! */ +#define va_nodeid va_fileid +/* TODO: This field needs conversion! */ +#define va_nblocks va_bytes +#define va_blksize va_blocksize +#define va_seq va_gen + +#define MAXOFFSET_T OFF_MAX +#define EXCL 0 + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FEXCL O_EXCL +#define FDSYNC FFSYNC +#define FRSYNC FFSYNC +#define FSYNC FFSYNC +#define FOFFMAX 0x00 +#define FIGNORECASE 0x00 + +/* + * Attributes of interest to the caller of setattr or getattr.
+ */ +#define AT_MODE 0x00002 +#define AT_UID 0x00004 +#define AT_GID 0x00008 +#define AT_FSID 0x00010 +#define AT_NODEID 0x00020 +#define AT_NLINK 0x00040 +#define AT_SIZE 0x00080 +#define AT_ATIME 0x00100 +#define AT_MTIME 0x00200 +#define AT_CTIME 0x00400 +#define AT_RDEV 0x00800 +#define AT_BLKSIZE 0x01000 +#define AT_NBLOCKS 0x02000 +/* 0x04000 */ /* unused */ +#define AT_SEQ 0x08000 +/* + * If AT_XVATTR is set then there are additional bits to process in + * the xvattr_t's attribute bitmap. If this is not set then the bitmap + * MUST be ignored. Note that this bit must be set/cleared explicitly. + * That is, setting AT_ALL will NOT set AT_XVATTR. + */ +#define AT_XVATTR 0x10000 + +#define AT_ALL (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\ + AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\ + AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) + +#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\ + AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV) + +#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME) + +#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|\ + AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) +/* Rebuild va_mask: one AT_* bit per vattr field that is not VNOVAL. */ +static __inline void +vattr_init_mask(vattr_t *vap) +{ + + vap->va_mask = 0; + /* Note: a set va_flags is reported as AT_XVATTR, see the last test. */ + if (vap->va_uid != (uid_t)VNOVAL) + vap->va_mask |= AT_UID; + if (vap->va_gid != (gid_t)VNOVAL) + vap->va_mask |= AT_GID; + if (vap->va_size != (u_quad_t)VNOVAL) + vap->va_mask |= AT_SIZE; + if (vap->va_atime.tv_sec != VNOVAL) + vap->va_mask |= AT_ATIME; + if (vap->va_mtime.tv_sec != VNOVAL) + vap->va_mask |= AT_MTIME; + if (vap->va_mode != (uint16_t)VNOVAL) + vap->va_mask |= AT_MODE; + if (vap->va_flags != VNOVAL) + vap->va_mask |= AT_XVATTR; +} + +#define RLIM64_INFINITY 0 +/* Rename from -> to via kern_renameat() at AT_FDCWD; seg must be UIO_SYSSPACE. */ +static __inline int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + + ASSERT(seg == UIO_SYSSPACE); + + return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg)); +} + +#include + +#endif /* _OPENSOLARIS_SYS_VNODE_H_ */ diff --git a/include/os/freebsd/spl/sys/vnode_impl.h b/include/os/freebsd/spl/sys/vnode_impl.h
new file mode 100644 index 000000000000..78de740d9918 --- /dev/null +++ b/include/os/freebsd/spl/sys/vnode_impl.h @@ -0,0 +1,271 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors.
+ */ + +#ifndef _SYS_VNODE_IMPL_H +#define _SYS_VNODE_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IS_DEVVP(vp) \ + ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) + +#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ + +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +/* + * The xvattr structure is really a variable length structure that + * is made up of: + * - The classic vattr_t (xva_vattr) + * - a 32 bit quantity (xva_mapsize) that specifies the size of the + * attribute bitmaps in 32 bit words. + * - A pointer to the returned attribute bitmap (needed because the + * previous element, the requested attribute bitmap, is variable length). + * - The requested attribute bitmap, which is an array of 32 bit words. + * Callers use the XVA_SET_REQ() macro to set the bits corresponding to + * the attributes that are being requested. + * - The returned attribute bitmap, which is an array of 32 bit words. + * File systems that support optional attributes use the XVA_SET_RTN() + * macro to set the bits corresponding to the attributes that are being + * returned. + * - The xoptattr_t structure which contains the attribute values + * + * xva_mapsize determines how many words are in the attribute bitmaps. + * Immediately following the attribute bitmaps is the xoptattr_t. + * xva_getxoptattr() is used to get the pointer to the xoptattr_t + * section. + */ + +#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ +#define XVA_MAGIC 0x78766174 /* Magic # for verification */ + +/* + * The xvattr structure is an extensible structure which permits optional + * attributes to be requested/returned. File systems may or may not support + * optional attributes. They do so at their own discretion but if they do + * support optional attributes, they must register the VFSFT_XVATTR feature + * so that the optional attributes can be set/retrieved.
+ * + * The fields of the xvattr structure are: + * + * xva_vattr - The first element of an xvattr is a legacy vattr structure + * which includes the common attributes. If AT_XVATTR is set in the va_mask + * then the entire structure is treated as an xvattr. If AT_XVATTR is not + * set, then only the xva_vattr structure can be used. + * + * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. + * + * xva_mapsize - Size of requested and returned attribute bitmaps. + * + * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the + * size of the array before it, xva_reqattrmap[], could change which means + * the location of xva_rtnattrmap[] could change. This will allow unbundled + * file systems to find the location of xva_rtnattrmap[] when the sizes change. + * + * xva_reqattrmap[] - Array of requested attributes. Attributes are + * represented by a specific bit in a specific element of the attribute + * map array. Callers set the bits corresponding to the attributes + * that the caller wants to get/set. + * + * xva_rtnattrmap[] - Array of attributes that the file system was able to + * process. Not all file systems support all optional attributes. This map + * informs the caller which attributes the underlying file system was able + * to set/get. (Same structure as the requested attributes array in terms + * of each attribute corresponding to specific bits and array elements.) + * + * xva_xoptattrs - Structure containing values of optional attributes. + * These values are only valid if the corresponding bits in xva_reqattrmap + * are set and the underlying file system supports those attributes. + */ + + + +/* + * Attribute bits used in the extensible attribute's (xva's) attribute + * bitmaps. Note that the bitmaps are made up of a variable length number + * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" + * is the element in the bitmap (starting at 1). 
This convention is for + * the convenience of the maintainer to keep track of which element each + * attribute belongs to. + * + * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS + * MUST USE THE XAT_* DEFINES. + */ +#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ +#define XAT0_CREATETIME 0x00000001 /* Create time of file */ +#define XAT0_ARCHIVE 0x00000002 /* Archive */ +#define XAT0_SYSTEM 0x00000004 /* System */ +#define XAT0_READONLY 0x00000008 /* Readonly */ +#define XAT0_HIDDEN 0x00000010 /* Hidden */ +#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ +#define XAT0_IMMUTABLE 0x00000040 /* immutable */ +#define XAT0_APPENDONLY 0x00000080 /* appendonly */ +#define XAT0_NODUMP 0x00000100 /* nodump */ +#define XAT0_OPAQUE 0x00000200 /* opaque */ +#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ +#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ +#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ +#define XAT0_REPARSE 0x00002000 /* FS reparse point */ +#define XAT0_GEN 0x00004000 /* object generation number */ +#define XAT0_OFFLINE 0x00008000 /* offline */ +#define XAT0_SPARSE 0x00010000 /* sparse */ + +/* Support for XAT_* optional attributes */ +#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ +#define XVA_SHFT 32 /* Used to shift index */ + +/* + * Used to pry out the index and attribute bits from the XAT_* attributes + * defined below. Note that we're masking things down to 32 bits then + * casting to uint32_t. + */ +#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) +#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) + +/* + * The following defines present a "flat namespace" so that consumers don't + * need to keep track of which element belongs to which bitmap entry. 
+ * + * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER + */ +#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) +#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) +#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) +#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) +#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) +#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) +#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) +#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) +#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) +#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) +#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) +#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) +#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) +#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) +#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) +#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) +#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) + +/* + * The returned attribute map array (xva_rtnattrmap[]) is located past the + * requested attribute map array (xva_reqattrmap[]). Its location changes + * when the array sizes change. We use a separate pointer in a known location + * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is + * set in xva_init() + */ +#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) + +#define MODEMASK 07777 /* mode bits plus permission bits */ +#define PERMMASK 00777 /* permission bits */ + +/* + * VOP_ACCESS flags + */ +#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ + +/* + * Flags for vnode operations. 
+ */ +enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ +enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ + +/* + * Structure used by various vnode operations to determine + * the context (pid, host, identity) of a caller. + * + * The cc_caller_id is used to identify one or more callers who invoke + * operations, possibly on behalf of others. For example, the NFS + * server could have its own cc_caller_id which can be detected by + * vnode/vfs operations or (FEM) monitors on those operations. New + * caller IDs are generated by fs_new_caller_id(). + */ +typedef struct caller_context { + pid_t cc_pid; /* Process ID of the caller */ + int cc_sysid; /* System ID, used for remote calls */ + u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */ + ulong_t cc_flags; /* NOTE(review): no flag values are defined in this header */ +} caller_context_t; + +struct taskq; + +/* + * Flags for VOP_LOOKUP + * + * Defined in file.h, but also possible, FIGNORECASE and FSEARCH + * + */ +#define LOOKUP_DIR 0x01 /* want parent dir vp */ +#define LOOKUP_XATTR 0x02 /* look up extended attr dir */ +#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ +#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ + +/* + * Flags for VOP_READDIR + */ +#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */ +#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */ + +/* + * Public vnode manipulation functions. + */ + +void vn_rele_async(struct vnode *vp, struct taskq *taskq); + +#define VN_RELE_ASYNC(vp, taskq) { \ + vn_rele_async(vp, taskq); \ +} + +/* + * Flags to VOP_SETATTR/VOP_GETATTR.
+ */ +#define ATTR_UTIME 0x01 /* non-default utime(2) request */ +#define ATTR_EXEC 0x02 /* invocation from exec(2) */ +#define ATTR_COMM 0x04 /* yield common vp attributes */ +#define ATTR_HINT 0x08 /* information returned will be `hint' */ +#define ATTR_REAL 0x10 /* yield attributes of the real vp */ +#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */ +#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VNODE_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/zmod.h b/include/os/freebsd/spl/sys/zmod.h new file mode 100644 index 000000000000..ba0267203ce3 --- /dev/null +++ b/include/os/freebsd/spl/sys/zmod.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZMOD_H +#define _ZMOD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * zmod - RFC-1950-compatible decompression routines + * + * This file provides the public interfaces to zmod, an in-kernel RFC 1950 + * decompression library.
More information about the implementation of these + * interfaces can be found in the usr/src/uts/common/zmod/ directory. + */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) + +extern int z_uncompress(void *, size_t *, const void *, size_t); +extern int z_compress(void *, size_t *, const void *, size_t); +extern int z_compress_level(void *, size_t *, const void *, size_t, int); +extern const char *z_strerror(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZMOD_H */ diff --git a/include/os/freebsd/spl/sys/zone.h b/include/os/freebsd/spl/sys/zone.h new file mode 100644 index 000000000000..71a28adaf26e --- /dev/null +++ b/include/os/freebsd/spl/sys/zone.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_ZONE_H_ +#define _OPENSOLARIS_SYS_ZONE_H_ + +/* + * Macros to help with zone visibility restrictions. + */ + +#define GLOBAL_ZONEID 0 + +/* + * Is the given process in the global zone? + */ +#define INGLOBALZONE(p) in_globalzone((p)) + + +extern boolean_t in_globalzone(struct proc *); + +/* + * Attach the given dataset to the given jail. + */ +extern int zone_dataset_attach(struct ucred *, const char *, int); + +/* + * Detach the given dataset from the given jail. + */ +extern int zone_dataset_detach(struct ucred *, const char *, int); + +/* + * Returns true if the named pool/dataset is visible in the current zone. + */ +extern int zone_dataset_visible(const char *, int *); + +/* + * Safely get the hostid of the specified zone (defaults to machine's hostid + * if the specified zone doesn't emulate a hostid). Passing NULL retrieves + * the global zone's (i.e., physical system's) hostid.
+ */ +extern uint32_t zone_get_hostid(void *); + +#endif /* !_OPENSOLARIS_SYS_ZONE_H_ */ diff --git a/include/os/freebsd/zfs/Makefile.am b/include/os/freebsd/zfs/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/include/os/freebsd/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/freebsd/zfs/sys/Makefile.am b/include/os/freebsd/zfs/sys/Makefile.am new file mode 100644 index 000000000000..1dec5ef5ff59 --- /dev/null +++ b/include/os/freebsd/zfs/sys/Makefile.am @@ -0,0 +1,14 @@ +KERNEL_H = \ + $(top_srcdir)/include/os/freebsd/zfs/sys/freebsd_crypto.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/sha2.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/vdev_os.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_context_os.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_ctldir.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_dir.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_vfsops.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_vnops.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zfs_znode_impl.h \ + $(top_srcdir)/include/os/freebsd/zfs/sys/zpl.h + +EXTRA_DIST = $(KERNEL_H) diff --git a/include/os/freebsd/zfs/sys/freebsd_crypto.h b/include/os/freebsd/zfs/sys/freebsd_crypto.h new file mode 100644 index 000000000000..08e058d6affa --- /dev/null +++ b/include/os/freebsd/zfs/sys/freebsd_crypto.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Sean Eric Fagan + * Portions Copyright (c) 2005-2011 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Portions of this file were taken from GELI's implementation of hmac. + * + * $FreeBSD$ + */ + +#ifndef _ZFS_FREEBSD_CRYPTO_H +#define _ZFS_FREEBSD_CRYPTO_H + +#include +#include +#include +#include +#include + +#define SUN_CKM_AES_CCM "CKM_AES_CCM" +#define SUN_CKM_AES_GCM "CKM_AES_GCM" +#define SUN_CKM_SHA512_HMAC "CKM_SHA512_HMAC" + +#define CRYPTO_KEY_RAW 1 + +#define CRYPTO_BITS2BYTES(n) ((n) == 0 ? 0 : (((n) - 1) >> 3) + 1) +#define CRYPTO_BYTES2BITS(n) ((n) << 3) + +struct zio_crypt_info; + +typedef struct freebsd_crypt_session { + struct mtx fs_lock; + crypto_session_t fs_sid; + boolean_t fs_done; +} freebsd_crypt_session_t; + +/* + * Unused types to minimize code differences. + */ +typedef void *crypto_mechanism_t; +typedef void *crypto_ctx_template_t; +/* + * Unlike the ICP crypto_key type, this only + * supports (the equivalent of + * CRYPTO_KEY_RAW). 
+ */ +typedef struct crypto_key { + int ck_format; /* Unused, but minimizes code diff */ + void *ck_data; + size_t ck_length; +} crypto_key_t; + +typedef struct hmac_ctx { + SHA512_CTX innerctx; + SHA512_CTX outerctx; +} *crypto_context_t; + +/* + * The only algorithm ZFS uses for hashing is SHA512_HMAC. + */ +void crypto_mac(const crypto_key_t *key, const void *in_data, + size_t in_data_size, void *out_data, size_t out_data_size); +void crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *key); +void crypto_mac_update(struct hmac_ctx *ctx, const void *data, + size_t data_size); +void crypto_mac_final(struct hmac_ctx *ctx, void *out_data, + size_t out_data_size); + +int freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *, crypto_key_t *); +void freebsd_crypt_freesession(freebsd_crypt_session_t *sessp); + +int freebsd_crypt_uio(boolean_t, freebsd_crypt_session_t *, + struct zio_crypt_info *, uio_t *, crypto_key_t *, uint8_t *, + size_t, size_t); + +#endif /* _ZFS_FREEBSD_CRYPTO_H */ diff --git a/include/os/freebsd/zfs/sys/sha2.h b/include/os/freebsd/zfs/sys/sha2.h new file mode 100644 index 000000000000..9d848e1fc2d1 --- /dev/null +++ b/include/os/freebsd/zfs/sys/sha2.h @@ -0,0 +1,200 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. */ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#include /* for uint_* */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ +#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ + +#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ +#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ +#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ + +/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ +#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ +#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ + +#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ +#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* + * SHA2 context. + * The contents of this structure are a private interface between the + * Init/Update/Final calls of the functions defined below. + * Callers must never attempt to read or write any of the fields + * in this structure directly. 
+ */ + +#include +#include +#include +#include +typedef struct { + uint32_t algotype; /* Algorithm Type */ + union { + SHA256_CTX SHA256_ctx; + SHA384_CTX SHA384_ctx; + SHA512_CTX SHA512_ctx; + }; +} SHA2_CTX; + +extern void SHA256Init(SHA256_CTX *); + +extern void SHA256Update(SHA256_CTX *, const void *, size_t); + +extern void SHA256Final(void *, SHA256_CTX *); + +extern void SHA384Init(SHA384_CTX *); + +extern void SHA384Update(SHA384_CTX *, const void *, size_t); + +extern void SHA384Final(void *, SHA384_CTX *); + +extern void SHA512Init(SHA512_CTX *); + +extern void SHA512Update(SHA512_CTX *, const void *, size_t); + +extern void SHA512Final(void *, SHA512_CTX *); + +/* Initialise c for algorithm mech and record it; panics if unsupported. */ +static inline void +SHA2Init(uint64_t mech, SHA2_CTX *c) +{ + switch (mech) { + case SHA256: + SHA256_Init(&c->SHA256_ctx); + break; + case SHA384: + SHA384_Init(&c->SHA384_ctx); + break; + case SHA512: + SHA512_Init(&c->SHA512_ctx); + break; + case SHA512_256: + SHA512_256_Init(&c->SHA512_ctx); + break; + default: + panic("unknown mechanism %ju", (uintmax_t)mech); + } + c->algotype = (uint32_t)mech; +} +/* Hash s bytes from p into c using the algorithm recorded by SHA2Init. */ +static inline void +SHA2Update(SHA2_CTX *c, const void *p, size_t s) +{ + switch (c->algotype) { + case SHA256: + SHA256_Update(&c->SHA256_ctx, p, s); + break; + case SHA384: + SHA384_Update(&c->SHA384_ctx, p, s); + break; + case SHA512: + SHA512_Update(&c->SHA512_ctx, p, s); + break; + case SHA512_256: + SHA512_256_Update(&c->SHA512_ctx, p, s); + break; + default: + panic("unknown mechanism %u", c->algotype); + } +} +/* Finalise c and write the digest for its recorded algorithm to p. */ +static inline void +SHA2Final(void *p, SHA2_CTX *c) +{ + switch (c->algotype) { + case SHA256: + SHA256_Final(p, &c->SHA256_ctx); + break; + case SHA384: + SHA384_Final(p, &c->SHA384_ctx); + break; + case SHA512: + SHA512_Final(p, &c->SHA512_ctx); + break; + case SHA512_256: + SHA512_256_Final(p, &c->SHA512_ctx); + break; + default: + panic("unknown mechanism %u", c->algotype); + } +} + +#ifdef _SHA2_IMPL +/* + * The following types/functions are all private to the implementation + * of
the SHA2 functions and must not be used by consumers of the interface + */ + +/* + * List of supported mechanisms in this module. + * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or adding mechanisms should be done + * carefully. + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/os/freebsd/zfs/sys/vdev_os.h b/include/os/freebsd/zfs/sys/vdev_os.h new file mode 100644 index 000000000000..e2780fdbb671 --- /dev/null +++ b/include/os/freebsd/zfs/sys/vdev_os.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_VDEV_OS_H +#define _SYS_VDEV_OS_H + +extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size); +#endif diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h new file mode 100644 index 000000000000..529ba7204548 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef ZFS_CONTEXT_OS_H_ +#define ZFS_CONTEXT_OS_H_ + +#include +#include +#include +#include_next +#include +#include +#include +#include + +#define cv_wait_io(cv, mp) cv_wait(cv, mp) +#define cv_wait_io_sig(cv, mp) cv_wait_sig(cv, mp) + +#define cond_resched() kern_yield(PRI_USER) + +#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ + (taskq_create(a, b, maxclsyspri, d, e, f)) + +#define tsd_create(keyp, destructor) do { \ + *(keyp) = osd_thread_register((destructor)); \ + KASSERT(*(keyp) > 0, ("cannot register OSD")); \ +} while (0) + +#define tsd_destroy(keyp) osd_thread_deregister(*(keyp)) +#define tsd_get(key) osd_thread_get(curthread, (key)) +#define tsd_set(key, value) osd_thread_set(curthread, (key), (value)) +#define fm_panic panic + +#define cond_resched() kern_yield(PRI_USER) +extern int zfs_debug_level; +extern struct mtx zfs_debug_mtx; +#define ZFS_LOG(lvl, ...) 
do { \ + if (((lvl) & 0xff) <= zfs_debug_level) { \ + mtx_lock(&zfs_debug_mtx); \ + printf("%s:%u[%d]: ", \ + __func__, __LINE__, (lvl)); \ + printf(__VA_ARGS__); \ + printf("\n"); \ + if ((lvl) & 0x100) \ + kdb_backtrace(); \ + mtx_unlock(&zfs_debug_mtx); \ + } \ +} while (0) +/* Multiply before dividing: MILLISEC / hz truncates to 0 if hz > MILLISEC. */ +#define MSEC_TO_TICK(msec) (((long long)(msec) * hz) / MILLISEC) +extern int hz; +extern int tick; +typedef int fstrans_cookie_t; +#define spl_fstrans_mark() (0) +#define spl_fstrans_unmark(x) ((x) = 0) +#define signal_pending(x) SIGPENDING(x) +#define current curthread +#define thread_join(x) +#define cv_wait_io(cv, mp) cv_wait(cv, mp) +typedef struct opensolaris_utsname utsname_t; +extern utsname_t *utsname(void); +extern int spa_import_rootpool(const char *name); +#else +#if BYTE_ORDER != BIG_ENDIAN +#undef _BIG_ENDIAN +#endif +#endif diff --git a/include/os/freebsd/zfs/sys/zfs_ctldir.h b/include/os/freebsd/zfs/sys/zfs_ctldir.h new file mode 100644 index 000000000000..28a026603f07 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_ctldir.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +int zfsctl_root(zfsvfs_t *, int, vnode_t **); +void zfsctl_init(void); +void zfsctl_fini(void); +boolean_t zfsctl_is_node(vnode_t *); +int zfsctl_snapshot_unmount(char *snapname, int flags); +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_dir.h b/include/os/freebsd/zfs/sys/zfs_dir.h new file mode 100644 index 000000000000..f6f8ab5c4e69 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_dir.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ + +extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); +extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, + boolean_t *); +#if 0 +extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); +#else +extern int zfs_dirlook(znode_t *, const char *name, znode_t **); +#endif +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, zfs_acl_ids_t *); +extern void zfs_rmnode(znode_t *); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, 
znode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h b/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h new file mode 100644 index 000000000000..a8db8bee5209 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h @@ -0,0 +1,677 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_ZFS_IOCTL_COMPAT_H +#define _SYS_ZFS_IOCTL_COMPAT_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Backwards ioctl compatibility + */ + +/* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 +#define ZFS_IOCVER_DEADMAN 1 +#define ZFS_IOCVER_LZC 2 +#define ZFS_IOCVER_ZCMD 3 +#define ZFS_IOCVER_EDBP 4 +#define ZFS_IOCVER_RESUME 5 +#define ZFS_IOCVER_INLANES 6 +#define ZFS_IOCVER_PAD 7 +#define ZFS_IOCVER_FREEBSD ZFS_IOCVER_PAD +#define ZFS_IOCVER_ZOF 15 + +/* compatibility conversion flag */ +#define ZFS_CMD_COMPAT_NONE 0 +#define ZFS_CMD_COMPAT_V15 1 +#define ZFS_CMD_COMPAT_V28 2 +#define ZFS_CMD_COMPAT_DEADMAN 3 +#define ZFS_CMD_COMPAT_LZC 4 +#define ZFS_CMD_COMPAT_ZCMD 5 +#define ZFS_CMD_COMPAT_EDBP 6 +#define ZFS_CMD_COMPAT_RESUME 7 +#define ZFS_CMD_COMPAT_INLANES 8 + +#define ZFS_IOC_COMPAT_PASS 254 +#define ZFS_IOC_COMPAT_FAIL 255 + +#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) + +typedef struct zfs_iocparm { + uint32_t zfs_ioctl_version; + uint64_t zfs_cmd; + uint64_t zfs_cmd_size; +} zfs_iocparm_t; + +typedef struct zinject_record_v15 { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; +} zinject_record_v15_t; + +typedef struct zfs_cmd_v15 { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + 
uint64_t zc_history_offset; + uint64_t zc_obj; + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_v15_t zc_inject_record; +} zfs_cmd_v15_t; + +typedef struct zinject_record_v28 { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; +} zinject_record_v28_t; + +typedef struct zfs_cmd_v28 { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_v28_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_v28_t; + +typedef struct zinject_record_deadman { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t 
zi_duration; + uint64_t zi_timer; + uint32_t zi_cmd; + uint32_t zi_pad; +} zinject_record_deadman_t; + +typedef struct zfs_cmd_deadman { + char zc_name[MAXPATHLEN]; + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history; /* really (char *) */ + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + /* zc_inject_record doesn't change in libzfs_core */ + zinject_record_deadman_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_deadman_t; + +typedef struct zfs_cmd_zcmd { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_deadman_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_zcmd_t; + +typedef struct zfs_cmd_edbp { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_deadman_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_edbp_t; + +typedef struct zfs_cmd_resume { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_deadman_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + boolean_t zc_resumable; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_resume_t; + +typedef struct zfs_cmd_inlanes { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + boolean_t zc_resumable; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_inlanes_t; + +#ifdef _KERNEL +/* + * Note: this struct must have the same layout in 32-bit and 64-bit, so + * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit + * kernel. Therefore, we add padding to it so that no "hidden" padding + * is automatically added on 64-bit (but not on 32-bit). + */ +typedef struct zfs_cmd_legacy { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + + dmu_objset_stats_t zc_objset_stats; + uint64_t zc_freebsd_drr_pad; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad3[3]; + boolean_t zc_resumable; + uint32_t zc_pad4; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_legacy_t; + +unsigned static long zfs_ioctl_bsd12_to_zof[] = { + ZFS_IOC_POOL_CREATE, /* 0x00 */ + ZFS_IOC_POOL_DESTROY, /* 0x01 */ + ZFS_IOC_POOL_IMPORT, /* 0x02 */ + ZFS_IOC_POOL_EXPORT, /* 0x03 */ + ZFS_IOC_POOL_CONFIGS, /* 0x04 */ + ZFS_IOC_POOL_STATS, /* 0x05 */ + ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ + ZFS_IOC_POOL_SCAN, /* 0x07 */ + ZFS_IOC_POOL_FREEZE, /* 0x08 */ + ZFS_IOC_POOL_UPGRADE, /* 0x09 */ + ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ + ZFS_IOC_VDEV_ADD, /* 0x0b */ + ZFS_IOC_VDEV_REMOVE, /* 0x0c */ + ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ + ZFS_IOC_VDEV_ATTACH, /* 0x0e */ + ZFS_IOC_VDEV_DETACH, /* 0x0f */ + ZFS_IOC_VDEV_SETPATH, /* 0x10 */ + ZFS_IOC_VDEV_SETFRU, /* 0x11 */ + ZFS_IOC_OBJSET_STATS, /* 0x12 */ + ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ + ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ + ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ + ZFS_IOC_SET_PROP, /* 0x16 */ + ZFS_IOC_CREATE, /* 0x17 */ + ZFS_IOC_DESTROY, /* 0x18 */ + ZFS_IOC_ROLLBACK, /* 0x19 */ + ZFS_IOC_RENAME, /* 0x1a */ + ZFS_IOC_RECV, /* 0x1b */ + ZFS_IOC_SEND, /* 0x1c */ + ZFS_IOC_INJECT_FAULT, /* 0x1d */ + ZFS_IOC_CLEAR_FAULT, 
/* 0x1e */ + ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ + ZFS_IOC_ERROR_LOG, /* 0x20 */ + ZFS_IOC_CLEAR, /* 0x21 */ + ZFS_IOC_PROMOTE, /* 0x22 */ + /* start of mismatch */ + ZFS_IOC_DESTROY_SNAPS, /* 0x23:0x3b */ + ZFS_IOC_SNAPSHOT, /* 0x24:0x23 */ + ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x25:0x24 */ + ZFS_IOC_OBJ_TO_PATH, /* 0x26:0x25 */ + ZFS_IOC_POOL_SET_PROPS, /* 0x27:0x26 */ + ZFS_IOC_POOL_GET_PROPS, /* 0x28:0x27 */ + ZFS_IOC_SET_FSACL, /* 0x29:0x28 */ + ZFS_IOC_GET_FSACL, /* 0x2a:0x29 */ + ZFS_IOC_SHARE, /* 0x2b:0x2a */ + ZFS_IOC_INHERIT_PROP, /* 0x2c:0x2b */ + ZFS_IOC_SMB_ACL, /* 0x2d:0x2c */ + ZFS_IOC_USERSPACE_ONE, /* 0x2e:0x2d */ + ZFS_IOC_USERSPACE_MANY, /* 0x2f:0x2e */ + ZFS_IOC_USERSPACE_UPGRADE, /* 0x30:0x2f */ + ZFS_IOC_HOLD, /* 0x31:0x30 */ + ZFS_IOC_RELEASE, /* 0x32:0x31 */ + ZFS_IOC_GET_HOLDS, /* 0x33:0x32 */ + ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x34:0x33 */ + ZFS_IOC_VDEV_SPLIT, /* 0x35:0x34 */ + ZFS_IOC_NEXT_OBJ, /* 0x36:0x35 */ + ZFS_IOC_DIFF, /* 0x37:0x36 */ + ZFS_IOC_TMP_SNAPSHOT, /* 0x38:0x37 */ + ZFS_IOC_OBJ_TO_STATS, /* 0x39:0x38 */ + ZFS_IOC_JAIL, /* 0x3a:0xc2 */ + ZFS_IOC_UNJAIL, /* 0x3b:0xc3 */ + ZFS_IOC_POOL_REGUID, /* 0x3c:0x3c */ + ZFS_IOC_SPACE_WRITTEN, /* 0x3d:0x39 */ + ZFS_IOC_SPACE_SNAPS, /* 0x3e:0x3a */ + ZFS_IOC_SEND_PROGRESS, /* 0x3f:0x3e */ + ZFS_IOC_POOL_REOPEN, /* 0x40:0x3d */ + ZFS_IOC_LOG_HISTORY, /* 0x41:0x3f */ + ZFS_IOC_SEND_NEW, /* 0x42:0x40 */ + ZFS_IOC_SEND_SPACE, /* 0x43:0x41 */ + ZFS_IOC_CLONE, /* 0x44:0x42 */ + ZFS_IOC_BOOKMARK, /* 0x45:0x43 */ + ZFS_IOC_GET_BOOKMARKS, /* 0x46:0x44 */ + ZFS_IOC_DESTROY_BOOKMARKS, /* 0x47:0x45 */ + ZFS_IOC_NEXTBOOT, /* 0x48:0xc1 */ + ZFS_IOC_CHANNEL_PROGRAM, /* 0x49:0x48 */ + ZFS_IOC_REMAP, /* 0x4a:0x4c */ + ZFS_IOC_POOL_CHECKPOINT, /* 0x4b:0x4d */ + ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x4c:0x4e */ + ZFS_IOC_POOL_INITIALIZE, /* 0x4d:0x4f */ +}; + +unsigned static long zfs_ioctl_v15_to_v28[] = { + 0, /* 0 ZFS_IOC_POOL_CREATE */ + 1, /* 1 ZFS_IOC_POOL_DESTROY */ + 2, /* 2 ZFS_IOC_POOL_IMPORT */ + 3, /* 
3 ZFS_IOC_POOL_EXPORT */ + 4, /* 4 ZFS_IOC_POOL_CONFIGS */ + 5, /* 5 ZFS_IOC_POOL_STATS */ + 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ + 7, /* 7 ZFS_IOC_POOL_SCRUB */ + 8, /* 8 ZFS_IOC_POOL_FREEZE */ + 9, /* 9 ZFS_IOC_POOL_UPGRADE */ + 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ + 11, /* 11 ZFS_IOC_VDEV_ADD */ + 12, /* 12 ZFS_IOC_VDEV_REMOVE */ + 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ + 14, /* 14 ZFS_IOC_VDEV_ATTACH */ + 15, /* 15 ZFS_IOC_VDEV_DETACH */ + 16, /* 16 ZFS_IOC_VDEV_SETPATH */ + 18, /* 17 ZFS_IOC_OBJSET_STATS */ + 19, /* 18 ZFS_IOC_OBJSET_ZPLPROPS */ + 20, /* 19 ZFS_IOC_DATASET_LIST_NEXT */ + 21, /* 20 ZFS_IOC_SNAPSHOT_LIST_NEXT */ + 22, /* 21 ZFS_IOC_SET_PROP */ + ZFS_IOC_COMPAT_PASS, /* 22 ZFS_IOC_CREATE_MINOR */ + ZFS_IOC_COMPAT_PASS, /* 23 ZFS_IOC_REMOVE_MINOR */ + 23, /* 24 ZFS_IOC_CREATE */ + 24, /* 25 ZFS_IOC_DESTROY */ + 25, /* 26 ZFS_IOC_ROLLBACK */ + 26, /* 27 ZFS_IOC_RENAME */ + 27, /* 28 ZFS_IOC_RECV */ + 28, /* 29 ZFS_IOC_SEND */ + 29, /* 30 ZFS_IOC_INJECT_FAULT */ + 30, /* 31 ZFS_IOC_CLEAR_FAULT */ + 31, /* 32 ZFS_IOC_INJECT_LIST_NEXT */ + 32, /* 33 ZFS_IOC_ERROR_LOG */ + 33, /* 34 ZFS_IOC_CLEAR */ + 34, /* 35 ZFS_IOC_PROMOTE */ + 35, /* 36 ZFS_IOC_DESTROY_SNAPS */ + 36, /* 37 ZFS_IOC_SNAPSHOT */ + 37, /* 38 ZFS_IOC_DSOBJ_TO_DSNAME */ + 38, /* 39 ZFS_IOC_OBJ_TO_PATH */ + 39, /* 40 ZFS_IOC_POOL_SET_PROPS */ + 40, /* 41 ZFS_IOC_POOL_GET_PROPS */ + 41, /* 42 ZFS_IOC_SET_FSACL */ + 42, /* 43 ZFS_IOC_GET_FSACL */ + ZFS_IOC_COMPAT_PASS, /* 44 ZFS_IOC_ISCSI_PERM_CHECK */ + 43, /* 45 ZFS_IOC_SHARE */ + 44, /* 46 ZFS_IOC_INHERIT_PROP */ + 58, /* 47 ZFS_IOC_JAIL */ + 59, /* 48 ZFS_IOC_UNJAIL */ + 45, /* 49 ZFS_IOC_SMB_ACL */ + 46, /* 50 ZFS_IOC_USERSPACE_ONE */ + 47, /* 51 ZFS_IOC_USERSPACE_MANY */ + 48, /* 52 ZFS_IOC_USERSPACE_UPGRADE */ + 17, /* 53 ZFS_IOC_SETFRU */ +}; + +#else /* !_KERNEL */ +unsigned static long zfs_ioctl_v28_to_v15[] = { + 0, /* 0 ZFS_IOC_POOL_CREATE */ + 1, /* 1 ZFS_IOC_POOL_DESTROY */ + 2, /* 2 ZFS_IOC_POOL_IMPORT */ + 3, /* 3 
ZFS_IOC_POOL_EXPORT */ + 4, /* 4 ZFS_IOC_POOL_CONFIGS */ + 5, /* 5 ZFS_IOC_POOL_STATS */ + 6, /* 6 ZFS_IOC_POOL_TRYIMPORT */ + 7, /* 7 ZFS_IOC_POOL_SCAN */ + 8, /* 8 ZFS_IOC_POOL_FREEZE */ + 9, /* 9 ZFS_IOC_POOL_UPGRADE */ + 10, /* 10 ZFS_IOC_POOL_GET_HISTORY */ + 11, /* 11 ZFS_IOC_VDEV_ADD */ + 12, /* 12 ZFS_IOC_VDEV_REMOVE */ + 13, /* 13 ZFS_IOC_VDEV_SET_STATE */ + 14, /* 14 ZFS_IOC_VDEV_ATTACH */ + 15, /* 15 ZFS_IOC_VDEV_DETACH */ + 16, /* 16 ZFS_IOC_VDEV_SETPATH */ + 53, /* 17 ZFS_IOC_VDEV_SETFRU */ + 17, /* 18 ZFS_IOC_OBJSET_STATS */ + 18, /* 19 ZFS_IOC_OBJSET_ZPLPROPS */ + 19, /* 20 ZFS_IOC_DATASET_LIST_NEXT */ + 20, /* 21 ZFS_IOC_SNAPSHOT_LIST_NEXT */ + 21, /* 22 ZFS_IOC_SET_PROP */ + 24, /* 23 ZFS_IOC_CREATE */ + 25, /* 24 ZFS_IOC_DESTROY */ + 26, /* 25 ZFS_IOC_ROLLBACK */ + 27, /* 26 ZFS_IOC_RENAME */ + 28, /* 27 ZFS_IOC_RECV */ + 29, /* 28 ZFS_IOC_SEND */ + 30, /* 29 ZFS_IOC_INJECT_FAULT */ + 31, /* 30 ZFS_IOC_CLEAR_FAULT */ + 32, /* 31 ZFS_IOC_INJECT_LIST_NEXT */ + 33, /* 32 ZFS_IOC_ERROR_LOG */ + 34, /* 33 ZFS_IOC_CLEAR */ + 35, /* 34 ZFS_IOC_PROMOTE */ + 36, /* 35 ZFS_IOC_DESTROY_SNAPS */ + 37, /* 36 ZFS_IOC_SNAPSHOT */ + 38, /* 37 ZFS_IOC_DSOBJ_TO_DSNAME */ + 39, /* 38 ZFS_IOC_OBJ_TO_PATH */ + 40, /* 39 ZFS_IOC_POOL_SET_PROPS */ + 41, /* 40 ZFS_IOC_POOL_GET_PROPS */ + 42, /* 41 ZFS_IOC_SET_FSACL */ + 43, /* 42 ZFS_IOC_GET_FSACL */ + 45, /* 43 ZFS_IOC_SHARE */ + 46, /* 44 ZFS_IOC_INHERIT_PROP */ + 49, /* 45 ZFS_IOC_SMB_ACL */ + 50, /* 46 ZFS_IOC_USERSPACE_ONE */ + 51, /* 47 ZFS_IOC_USERSPACE_MANY */ + 52, /* 48 ZFS_IOC_USERSPACE_UPGRADE */ + ZFS_IOC_COMPAT_FAIL, /* 49 ZFS_IOC_HOLD */ + ZFS_IOC_COMPAT_FAIL, /* 50 ZFS_IOC_RELEASE */ + ZFS_IOC_COMPAT_FAIL, /* 51 ZFS_IOC_GET_HOLDS */ + ZFS_IOC_COMPAT_FAIL, /* 52 ZFS_IOC_OBJSET_RECVD_PROPS */ + ZFS_IOC_COMPAT_FAIL, /* 53 ZFS_IOC_VDEV_SPLIT */ + ZFS_IOC_COMPAT_FAIL, /* 54 ZFS_IOC_NEXT_OBJ */ + ZFS_IOC_COMPAT_FAIL, /* 55 ZFS_IOC_DIFF */ + ZFS_IOC_COMPAT_FAIL, /* 56 ZFS_IOC_TMP_SNAPSHOT */ + 
ZFS_IOC_COMPAT_FAIL, /* 57 ZFS_IOC_OBJ_TO_STATS */ + 47, /* 58 ZFS_IOC_JAIL */ + 48, /* 59 ZFS_IOC_UNJAIL */ +}; +#endif /* ! _KERNEL */ + +#ifdef _KERNEL +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +#endif /* _KERNEL */ +void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); +void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops.h b/include/os/freebsd/zfs/sys/zfs_vfsops.h new file mode 100644 index 000000000000..d17b80330295 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_vfsops.h @@ -0,0 +1,172 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; +struct znode; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_flags; /* super_block flags */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrmlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + uint64_t z_nr_znodes; /* number of znodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_xattr_sa; /* allow 
xattrs to be stored as SA */ + boolean_t z_use_namecache; /* make use of FreeBSD name cache */ + uint8_t z_xattr; /* xattr type in use */ + uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; + uint64_t z_userobjquota_obj; + uint64_t z_groupobjquota_obj; + uint64_t z_projectquota_obj; + uint64_t z_projectobjquota_obj; + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ +#define ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ + struct task z_unlinked_drain_task; +}; + +#define ZSB_XATTR 0x0001 /* Enable user xattrs */ +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. + * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. 
+ * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes[**] currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + * + * [*] 20 bytes on FreeBSD to fit into the size of struct fid. + * [**] 2 bytes on FreeBSD for the above reason. + */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; +extern int zfs_super_owner; + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); +extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, + uint64_t *val, char *setpoint); +extern int zfs_busy(void); +extern void zfsvfs_update_fromname(const char *oldname, const char *newname); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_vnops.h b/include/os/freebsd/zfs/sys/zfs_vnops.h new file mode 100644 index 000000000000..6237372b905f --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_vnops.h @@ -0,0 
+1,56 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SYS_ZFS_VNOPS_H_ +#define _SYS_ZFS_VNOPS_H_ +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); +int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size); +extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, + char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, + const char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_link(znode_t *tdzp, znode_t *sp, + char *name, cred_t *cr, int flags); +extern int zfs_space(znode_t *zp, int cmd, struct flock *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); + +#endif diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h new file mode 100644 index 000000000000..c0430467572c --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,182 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _FREEBSD_ZFS_SYS_ZNODE_IMPL_H +#define _FREEBSD_ZFS_SYS_ZNODE_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#define ZNODE_OS_FIELDS \ + struct zfsvfs *z_zfsvfs; \ + vnode_t *z_vnode; \ + uint64_t z_uid; \ + uint64_t z_gid; \ + uint64_t z_gen; \ + uint64_t z_atime[2]; \ + uint64_t z_links; + +#define ZFS_LINK_MAX UINT64_MAX + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern minor_t zfsdev_minor_alloc(void); + +/* + * Range locking rules + * -------------------- + * 1. 
When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + */ + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define ZTOI(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((struct znode *)(VP)->v_data) +#define ITOZ(VP) ((struct znode *)(VP)->v_data) +#define zhold(zp) vhold(ZTOV((zp))) +#define zrele(zp) vrele(ZTOV((zp))) + +#define ZTOZSB(zp) ((zp)->z_zfsvfs) +#define ITOZSB(vp) (VTOZ(vp)->z_zfsvfs) +#define ZTOTYPE(zp) (ZTOV(zp)->v_type) +#define ZTOGID(zp) ((zp)->z_gid) +#define ZTOUID(zp) ((zp)->z_uid) +#define ZTONLNK(zp) ((zp)->z_links) +#define Z_ISBLK(type) ((type) == VBLK) +#define Z_ISCHR(type) ((type) == VCHR) +#define Z_ISLNK(type) ((type) == VLNK) + + +/* Called on entry to each ZFS vnode and vfs operation */ +#define ZFS_ENTER(zfsvfs) \ + { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } + +/* Must be called before exiting the vop */ +#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +/* Verifies the znode is valid */ +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_sa_hdl == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ + +/* + * Macros for dealing with dmu_buf_hold + */ +#define 
ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) + +/* Encode ZFS stored time values from a struct timespec */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} + +/* Decode ZFS stored time values to a struct timespec */ +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE); + +extern void zfs_tstamp_update_setup_ext(struct znode *, + uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx); +extern void zfs_znode_free(struct znode *); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, + char *buf); + +#ifdef __cplusplus +} +#endif + +#endif /* _FREEBSD_ZFS_SYS_ZNODE_IMPL_H */ diff --git a/include/os/freebsd/zfs/sys/zpl.h b/include/os/freebsd/zfs/sys/zpl.h new file mode 100644 index 000000000000..fb2b4e02d441 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zpl.h @@ -0,0 +1 @@ +/* Don't remove */ diff --git a/lib/Makefile.am b/lib/Makefile.am index 8dff773df40b..4f59aa359c4b 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -1,6 +1,13 @@ # NB: GNU Automake Manual, Chapter 8.3.5: Libtool Convenience Libraries # These six libraries are intermediary build components. 
-SUBDIRS = libavl libefi libicp libshare libspl libtpool libzutil libunicode +SUBDIRS = libavl libicp libshare libspl libtpool + +if BUILD_LINUX +SUBDIRS += libefi +endif + +# libzutil depends on libefi if present +SUBDIRS += libzutil libunicode # These four libraries, which are installed as the final build product, # incorporate the six convenience libraries given above. diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am index 6626b6d05483..984ca520c0a6 100644 --- a/lib/libnvpair/Makefile.am +++ b/lib/libnvpair/Makefile.am @@ -24,7 +24,14 @@ nodist_libnvpair_la_SOURCES = \ $(USER_C) \ $(KERNEL_C) +if BUILD_FREEBSD +libnvpair_la_LIBADD = $(LIBTIRPC_LIBS) -L/usr/local/lib -lintl +libnvpair_la_LDFLAGS = -version-info 3:0:0 +else libnvpair_la_LIBADD = $(LIBTIRPC_LIBS) libnvpair_la_LDFLAGS = -version-info 1:1:0 +endif + + EXTRA_DIST = $(USER_C) diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am index 3101b5fc50df..77d8aba0ed00 100644 --- a/lib/libspl/Makefile.am +++ b/lib/libspl/Makefile.am @@ -42,6 +42,15 @@ USER_C += \ os/linux/getmntany.c endif +if BUILD_FREEBSD +USER_C += \ + os/freebsd/getexecname.c \ + os/freebsd/gethostid.c \ + os/freebsd/getmntany.c \ + os/freebsd/mnttab.c + +endif + USER_ASM = atomic.S nodist_libspl_la_SOURCES = \ diff --git a/lib/libspl/include/os/Makefile.am b/lib/libspl/include/os/Makefile.am index 09c0beec4757..7b362e02ad59 100644 --- a/lib/libspl/include/os/Makefile.am +++ b/lib/libspl/include/os/Makefile.am @@ -1,3 +1,7 @@ +if BUILD_FREEBSD +SUBDIRS = freebsd +endif + if BUILD_LINUX SUBDIRS = linux endif diff --git a/lib/libspl/include/os/freebsd/Makefile.am b/lib/libspl/include/os/freebsd/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/lib/libspl/include/os/freebsd/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/lib/libspl/include/os/freebsd/sys/Makefile.am b/lib/libspl/include/os/freebsd/sys/Makefile.am new file mode 100644 index 000000000000..896c9387129a --- 
/dev/null +++ b/lib/libspl/include/os/freebsd/sys/Makefile.am @@ -0,0 +1,12 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/byteorder.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/file.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/mnttab.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/mount.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/param.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/stat.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/sysmacros.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/uio.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/vfs.h \ + $(top_srcdir)/lib/libspl/include/os/freebsd/sys/zfs_context_os.h diff --git a/lib/libspl/include/os/freebsd/sys/byteorder.h b/lib/libspl/include/os/freebsd/sys/byteorder.h new file mode 100644 index 000000000000..74649cc4e0a1 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/byteorder.h @@ -0,0 +1,311 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +/* + * XXX FIXME + * on FreeBSD _BIG_ENDIAN is defined on all architectures so we have + * to exclude _MACHINE_ENDIAN_H_ and define the bulk of it here + */ + +#include +#include + +/* + * Define the order of 32-bit words in 64-bit words. + */ +#define _QUAD_HIGHWORD 1 +#define _QUAD_LOWWORD 0 + +/* + * Definitions for byte order, according to byte significance from low + * address to high. + */ +#undef _LITTLE_ENDIAN +/* LSB first: i386, vax */ +#define _LITTLE_ENDIAN 1234 +/* LSB first in word, MSW first in long */ +#define _PDP_ENDIAN 3412 + +#define _BYTE_ORDER _LITTLE_ENDIAN + +/* + * Deprecated variants that don't have enough underscores to be useful in more + * strict namespaces. + */ +#if __BSD_VISIBLE +#define LITTLE_ENDIAN _LITTLE_ENDIAN +#define PDP_ENDIAN _PDP_ENDIAN +#define BYTE_ORDER _BYTE_ORDER +#endif + +#define __bswap16_gen(x) (__uint16_t)((x) << 8 | (x) >> 8) +#define __bswap32_gen(x) \ + (((__uint32_t)__bswap16((x) & 0xffff) << 16) | __bswap16((x) >> 16)) +#define __bswap64_gen(x) \ + (((__uint64_t)__bswap32((x) & 0xffffffff) << 32) | __bswap32((x) >> 32)) + +#ifdef __GNUCLIKE_BUILTIN_CONSTANT_P +#define __bswap16(x) \ + ((__uint16_t)(__builtin_constant_p(x) ? \ + __bswap16_gen((__uint16_t)(x)) : __bswap16_var(x))) +#define __bswap32(x) \ + (__builtin_constant_p(x) ? \ + __bswap32_gen((__uint32_t)(x)) : __bswap32_var(x)) +#define __bswap64(x) \ + (__builtin_constant_p(x) ? 
\ + __bswap64_gen((__uint64_t)(x)) : __bswap64_var(x)) +#else +/* XXX these are broken for use in static initializers. */ +#define __bswap16(x) __bswap16_var(x) +#define __bswap32(x) __bswap32_var(x) +#define __bswap64(x) __bswap64_var(x) +#endif + +/* These are defined as functions to avoid multiple evaluation of x. */ + +static __inline __uint16_t +__bswap16_var(__uint16_t _x) +{ + + return (__bswap16_gen(_x)); +} + +static __inline __uint32_t +__bswap32_var(__uint32_t _x) +{ + +#ifdef __GNUCLIKE_ASM + __asm("bswap %0" : "+r" (_x)); + return (_x); +#else + return (__bswap32_gen(_x)); +#endif +} +#define __htonl(x) __bswap32(x) +#define __htons(x) __bswap16(x) +#define __ntohl(x) __bswap32(x) +#define __ntohs(x) __bswap16(x) + +#include +#include + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ + +#if defined(_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) +/* big-endian */ +#if defined(_BIG_ENDIAN) && (defined(__amd64__) || defined(__amd64)) +#error "incompatible ENDIAN / ARCH combination" +#endif +#define ntohl(x) (x) +#define ntohs(x) (x) +#define htonl(x) (x) +#define htons(x) (x) + +#elif !defined(ntohl) /* little-endian */ + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef uint16_t in_port_t; +#endif + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef uint32_t in_addr_t; +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) +extern uint32_t htonl(uint32_t); +extern uint16_t htons(uint16_t); +extern uint32_t ntohl(uint32_t); +extern uint16_t ntohs(uint16_t); +#else +extern in_addr_t htonl(in_addr_t); +extern in_port_t htons(in_port_t); +extern in_addr_t ntohl(in_addr_t); +extern in_port_t ntohs(in_port_t); +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */ +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + 
+/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#ifdef _BIG_ENDIAN +static __inline__ uint64_t +htonll(uint64_t n) +{ + return (n); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return (n); +} +#else +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#endif + +/* + * Macros to read unaligned values from a specific byte order to + * native byte order + */ + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#define BE_IN64(xa) \ + (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) + +#define LE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define LE_IN16(xa) \ + (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) + +#define LE_IN32(xa) \ + (((uint32_t)LE_IN16((uint8_t *)(xa) + 
2) << 16) | LE_IN16(xa)) + +#define LE_IN64(xa) \ + (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) + +/* + * Macros to write unaligned values from native byte order to a specific byte + * order. + */ + +#define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define BE_OUT16(xa, yv) \ + BE_OUT8((uint8_t *)(xa) + 1, yv); \ + BE_OUT8((uint8_t *)(xa), (yv) >> 8); + +#define BE_OUT32(xa, yv) \ + BE_OUT16((uint8_t *)(xa) + 2, yv); \ + BE_OUT16((uint8_t *)(xa), (yv) >> 16); + +#define BE_OUT64(xa, yv) \ + BE_OUT32((uint8_t *)(xa) + 4, yv); \ + BE_OUT32((uint8_t *)(xa), (yv) >> 32); + +#define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define LE_OUT16(xa, yv) \ + LE_OUT8((uint8_t *)(xa), yv); \ + LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); + +#define LE_OUT32(xa, yv) \ + LE_OUT16((uint8_t *)(xa), yv); \ + LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); + +#define LE_OUT64(xa, yv) \ + LE_OUT32((uint8_t *)(xa), yv); \ + LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/os/freebsd/sys/file.h b/lib/libspl/include/os/freebsd/sys/file.h new file mode 100644 index 000000000000..27fd2888f326 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/file.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_FILE_H +#define _LIBSPL_SYS_FILE_H + +#include_next + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FSYNC O_SYNC +#define FDSYNC O_DSYNC +#define FEXCL O_EXCL + +#define FNODSYNC 0x10000 /* fsync pseudo flag */ +#define FNOFOLLOW 0x20000 /* don't follow symlinks */ +#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/mnttab.h b/lib/libspl/include/os/freebsd/sys/mnttab.h new file mode 100644 index 000000000000..eb6ea2433a6f --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/mnttab.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ +/* Copyright 2006 Ricardo Correia */ + +#ifndef _SYS_MNTTAB_H +#define _SYS_MNTTAB_H + +#include +#include + +#ifdef MNTTAB +#undef MNTTAB +#endif /* MNTTAB */ + +#include +#include +#define MNTTAB _PATH_DEVZERO +#define MS_NOMNTTAB 0x0 +#define MS_RDONLY 0x1 +#define umount2(p, f) unmount(p, f) +#define MNT_LINE_MAX 4096 + +#define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ +#define MNT_TOOMANY 2 /* too many fields in line */ +#define MNT_TOOFEW 3 /* too few fields in line */ + +struct mnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; +}; + +/* + * NOTE: fields in extmnttab should match struct mnttab till new fields + * are encountered, this allows hasmntopt to work properly when its arg is + * a pointer to an extmnttab struct cast to a mnttab struct pointer. + */ + +struct extmnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; + uint_t mnt_major; + uint_t mnt_minor; +}; + +struct stat64; +struct statfs; + +extern int getmntany(FILE *fp, struct mnttab *mp, struct mnttab *mpref); +extern int _sol_getmntent(FILE *fp, struct mnttab *mp); +extern int getextmntent(const char *path, struct extmnttab *entry, + struct stat64 *statbuf); +extern void statfs2mnttab(struct statfs *sfs, struct mnttab *mp); +char *hasmntopt(struct mnttab *mnt, char *opt); +int getmntent(FILE *fp, struct mnttab *mp); + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/mount.h b/lib/libspl/include/os/freebsd/sys/mount.h new file mode 100644 index 000000000000..b4023910005b --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/mount.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _LIBSPL_SYS_MOUNT_H +#define _LIBSPL_SYS_MOUNT_H + +#undef _SYS_MOUNT_H_ +#include_next + +#include +#include +#include + +/* + * Some old glibc headers don't define BLKGETSIZE64 + * and we don't want to require the kernel headers + */ +#if !defined(BLKGETSIZE64) +#define BLKGETSIZE64 _IOR(0x12, 114, size_t) +#endif + +/* + * Some old glibc headers don't correctly define MS_DIRSYNC and + * instead use the enum name S_WRITE. When using these older + * headers define MS_DIRSYNC to be S_WRITE. + */ +#if !defined(MS_DIRSYNC) +#define MS_DIRSYNC S_WRITE +#endif + +/* + * Some old glibc headers don't correctly define MS_POSIXACL and + * instead leave it undefined. When using these older headers define + * MS_POSIXACL to the reserved value of (1<<16). + */ +#if !defined(MS_POSIXACL) +#define MS_POSIXACL (1<<16) +#endif + +#define MS_NOSUID MNT_NOSUID +#define MS_NOEXEC MNT_NOEXEC +#define MS_NODEV 0 +#define S_WRITE 0 +#define MS_BIND 0 +#define MS_REMOUNT 0 +#define MS_SYNCHRONOUS MNT_SYNCHRONOUS + +#define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) +#define MS_OWNER (MS_NOSUID|MS_NODEV) +#define MS_GROUP (MS_NOSUID|MS_NODEV) +#define MS_COMMENT 0 + +/* + * Older glibc headers did not define all the available + * umount2(2) flags. 
Both MNT_FORCE and MNT_DETACH are supported in the + * kernel back to 2.4.11 so we define them correctly if they are missing. + */ +#ifdef MNT_FORCE +#define MS_FORCE MNT_FORCE +#else +#define MS_FORCE 0x00000001 +#endif /* MNT_FORCE */ + +#ifdef MNT_DETACH +#define MS_DETACH MNT_DETACH +#else +#define MS_DETACH 0x00000002 +#endif /* MNT_DETACH */ + +/* + * Overlay mount is default in Linux, but for solaris/zfs + * compatibility, MS_OVERLAY is defined to explicitly have the user + * provide a flag (-O) to mount over a non empty directory. + */ +#define MS_OVERLAY 0x00000004 + +/* + * MS_CRYPT indicates that encryption keys should be loaded if they are not + * already available. This is not defined in glibc, but it is never seen by + * the kernel so it will not cause any problems. + */ +#define MS_CRYPT 0x00000008 + +#endif /* _LIBSPL_SYS_MOUNT_H */ diff --git a/lib/libspl/include/os/freebsd/sys/param.h b/lib/libspl/include/os/freebsd/sys/param.h new file mode 100644 index 000000000000..c0ad8d6798c6 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/param.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_PARAM_H +#define _LIBSPL_SYS_PARAM_H + +#include_next +#include + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, + * with smaller units (fragments) only in the last direct block. + * MAXBSIZE primarily determines the size of buffers in the buffer + * pool. It may be made larger without any effect on existing + * file systems; however making it smaller may make some file + * systems unmountable. + * + * Note that the blocked devices are assumed to have DEV_BSIZE + * "sectors" and that fragments must be some multiple of this size. + */ +#define MAXNAMELEN 256 + +#define UID_NOBODY 60001 /* user ID no body */ +#define GID_NOBODY UID_NOBODY +#define UID_NOACCESS 60002 /* user ID no access */ + +#define MAXUID UINT32_MAX /* max user id */ +#define MAXPROJID MAXUID /* max project id */ + +#ifdef PAGESIZE +#undef PAGESIZE +#endif /* PAGESIZE */ + +extern size_t spl_pagesize(void); +#define PAGESIZE (spl_pagesize()) + +extern int execvpe(const char *name, char * const argv[], char * const envp[]); + +struct zfs_handle; +/* + * Attach/detach the given filesystem to/from the given jail. 
+ */ +extern int zfs_jail(struct zfs_handle *zhp, int jailid, int attach); + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/stat.h b/lib/libspl/include/os/freebsd/sys/stat.h new file mode 100644 index 000000000000..82c86262fff3 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/stat.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBSPL_SYS_STAT_H +#define _LIBSPL_SYS_STAT_H + +#include_next + +#include /* for BLKGETSIZE64 */ + +#define stat64 stat + +#define MAXOFFSET_T OFF_MAX + +#ifndef _KERNEL +#include + +static __inline int +fstat64(int fd, struct stat *sb) +{ + int ret; + + ret = fstat(fd, sb); + if (ret == 0) { + if (S_ISCHR(sb->st_mode)) + (void) ioctl(fd, DIOCGMEDIASIZE, &sb->st_size); + } + return (ret); +} +#endif + +/* + * Emulate Solaris' behavior of returning the block device size in fstat64(). 
+ */ +static inline int +fstat64_blk(int fd, struct stat64 *st) +{ + if (fstat64(fd, st) == -1) + return (-1); + + /* In Linux we need to use an ioctl to get the size of a block device */ + if (S_ISBLK(st->st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0) + return (-1); + } + + return (0); +} +#endif /* _LIBSPL_SYS_STAT_H */ diff --git a/lib/libspl/include/os/freebsd/sys/sysmacros.h b/lib/libspl/include/os/freebsd/sys/sysmacros.h new file mode 100644 index 000000000000..d9639d27b60e --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/sysmacros.h @@ -0,0 +1 @@ +/* keep me */ diff --git a/lib/libspl/include/os/freebsd/sys/uio.h b/lib/libspl/include/os/freebsd/sys/uio.h new file mode 100644 index 000000000000..d978b6ad077c --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/uio.h @@ -0,0 +1,98 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _LIBSPL_SYS_UIO_H +#define _LIBSPL_SYS_UIO_H + +#include_next + +typedef struct iovec iovec_t; +typedef enum uio_seg uio_seg_t; + +typedef struct uio { + struct iovec *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + offset_t uio_loffset; /* file offset */ + uio_seg_t uio_segflg; /* address space (kernel or user) */ + uint16_t uio_fmode; /* file mode flags */ + uint16_t uio_extflg; /* extended flags */ + offset_t uio_limit; /* u-limit (maximum byte offset) */ + ssize_t uio_resid; /* residual count */ +} uio_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { /* locked uio_iov state */ + int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */ + void **uioa_ppp; /* page_t or pfn_t array */ + caddr_t uioa_base; /* address base */ + size_t uioa_len; /* span length */ +} uioa_page_t; + +typedef struct xuio { + uio_t xu_uio; /* embedded UIO structure */ + + /* Extended uio fields */ + enum xuio_type xu_type; /* uio type */ + union { + struct { + uint32_t xu_a_state; /* state of async i/o */ + ssize_t xu_a_mbytes; /* bytes moved */ + uioa_page_t *xu_a_lcur; /* uioa_locked[] pointer */ + void **xu_a_lppp; /* lcur->uioa_pppp[] pointer */ + void *xu_a_hwst[4]; /* opaque hardware state */ + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + struct { + int xu_zc_rw; /* read or write buffer */ + void *xu_zc_priv; /* fs specific */ + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define 
XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +#endif /* _SYS_UIO_H */ diff --git a/lib/libspl/include/os/freebsd/sys/vfs.h b/lib/libspl/include/os/freebsd/sys/vfs.h new file mode 100644 index 000000000000..55eb3c23b22e --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/vfs.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef ZFS_SYS_VFS_H_ +#define ZFS_SYS_VFS_H_ + +#include_next + +int fsshare(const char *, const char *, const char *); +int fsunshare(const char *, const char *); + +#endif /* !ZFS_SYS_VFS_H_ */ diff --git a/lib/libspl/include/os/freebsd/sys/zfs_context_os.h b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h new file mode 100644 index 000000000000..25b5a47df92a --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef ZFS_CONTEXT_OS_H_ +#define ZFS_CONTEXT_OS_H_ + +#if BYTE_ORDER != BIG_ENDIAN +#undef _BIG_ENDIAN +#endif + +#define ZFS_EXPORTS_PATH "/etc/zfs/exports" + +#endif diff --git a/lib/libspl/os/freebsd/getexecname.c b/lib/libspl/os/freebsd/getexecname.c new file mode 100644 index 000000000000..13e50d32404d --- /dev/null +++ b/lib/libspl/os/freebsd/getexecname.c @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include + +const char * +getexecname(void) +{ + static char execname[PATH_MAX + 1] = ""; + static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; + char *ptr = NULL; + ssize_t rc; + + (void) pthread_mutex_lock(&mtx); + + if (strlen(execname) == 0) { + int error, name[4]; + size_t len; + + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = KERN_PROC_PATHNAME; + name[3] = -1; + len = PATH_MAX; + error = sysctl(name, nitems(name), execname, &len, NULL, 0); + if (error != 0) { + rc = -1; + } else { + rc = len; + } + if (rc == -1) { + execname[0] = '\0'; + } else { + execname[rc] = '\0'; + ptr = execname; + } + } else { + ptr = execname; + } + + (void) pthread_mutex_unlock(&mtx); + return (ptr); +} diff --git a/lib/libspl/os/freebsd/gethostid.c b/lib/libspl/os/freebsd/gethostid.c new file mode 100644 index 000000000000..7bd567fe61b5 --- /dev/null +++ b/lib/libspl/os/freebsd/gethostid.c @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include + +unsigned long +get_system_hostid(void) +{ + return (gethostid()); +} diff --git a/lib/libspl/os/freebsd/getmntany.c b/lib/libspl/os/freebsd/getmntany.c new file mode 100644 index 000000000000..b41e763cee43 --- /dev/null +++ b/lib/libspl/os/freebsd/getmntany.c @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Ricardo Correia. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUFSIZE (MNT_LINE_MAX + 2) + +__thread char buf[BUFSIZE]; + +int +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) +{ + struct statfs sfs; + + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); + return (-1); + } + + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } + + if (statfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + return (-1); + } + statfs2mnttab(&sfs, (struct mnttab *)entry); + return (0); +} diff --git a/lib/libspl/os/freebsd/mnttab.c b/lib/libspl/os/freebsd/mnttab.c new file mode 100644 index 000000000000..5b9e6429d9e3 --- /dev/null +++ b/lib/libspl/os/freebsd/mnttab.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file implements Solaris compatible getmntany() and hasmntopt() + * functions. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static char * +mntopt(char **p) +{ + char *cp = *p; + char *retstr; + + while (*cp && isspace(*cp)) + cp++; + + retstr = cp; + while (*cp && *cp != ',') + cp++; + + if (*cp) { + *cp = '\0'; + cp++; + } + + *p = cp; + return (retstr); +} + +char * +hasmntopt(struct mnttab *mnt, char *opt) +{ + char tmpopts[MNT_LINE_MAX]; + char *f, *opts = tmpopts; + + if (mnt->mnt_mntopts == NULL) + return (NULL); + (void) strcpy(opts, mnt->mnt_mntopts); + f = mntopt(&opts); + for (; *f; f = mntopt(&opts)) { + if (strncmp(opt, f, strlen(opt)) == 0) + return (f - tmpopts + mnt->mnt_mntopts); + } + return (NULL); +} + +static void +optadd(char *mntopts, size_t size, const char *opt) +{ + + if (mntopts[0] != '\0') + strlcat(mntopts, ",", size); + strlcat(mntopts, opt, size); +} + +void +statfs2mnttab(struct statfs *sfs, struct mnttab *mp) +{ + static char mntopts[MNTMAXSTR]; + long flags; + + mntopts[0] = '\0'; + + flags = sfs->f_flags; +#define OPTADD(opt) optadd(mntopts, sizeof (mntopts), (opt)) + if (flags & MNT_RDONLY) + OPTADD(MNTOPT_RO); + else + OPTADD(MNTOPT_RW); + if (flags & MNT_NOSUID) + OPTADD(MNTOPT_NOSETUID); + else + OPTADD(MNTOPT_SETUID); + if (flags & MNT_UPDATE) + OPTADD(MNTOPT_REMOUNT); + if (flags & 
MNT_NOATIME) + OPTADD(MNTOPT_NOATIME); + else + OPTADD(MNTOPT_ATIME); + OPTADD(MNTOPT_NOXATTR); + if (flags & MNT_NOEXEC) + OPTADD(MNTOPT_NOEXEC); + else + OPTADD(MNTOPT_EXEC); +#undef OPTADD + mp->mnt_special = strdup(sfs->f_mntfromname); + mp->mnt_mountp = strdup(sfs->f_mntonname); + mp->mnt_fstype = strdup(sfs->f_fstypename); + mp->mnt_mntopts = strdup(mntopts); +} + +static struct statfs *gsfs = NULL; +static int allfs = 0; + +static int +statfs_init(void) +{ + struct statfs *sfs; + int error; + + if (gsfs != NULL) { + free(gsfs); + gsfs = NULL; + } + allfs = getfsstat(NULL, 0, MNT_WAIT); + if (allfs == -1) + goto fail; + gsfs = malloc(sizeof (gsfs[0]) * allfs * 2); + if (gsfs == NULL) + goto fail; + allfs = getfsstat(gsfs, (long)(sizeof (gsfs[0]) * allfs * 2), + MNT_WAIT); + if (allfs == -1) + goto fail; + sfs = realloc(gsfs, allfs * sizeof (gsfs[0])); + if (sfs != NULL) + gsfs = sfs; + return (0); +fail: + error = errno; + if (gsfs != NULL) + free(gsfs); + gsfs = NULL; + allfs = 0; + return (error); +} + +int +getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) +{ + // struct statfs *sfs; + int i, error; + + error = statfs_init(); + if (error != 0) + return (error); + + for (i = 0; i < allfs; i++) { + if (mrefp->mnt_special != NULL && + strcmp(mrefp->mnt_special, gsfs[i].f_mntfromname) != 0) { + continue; + } + if (mrefp->mnt_mountp != NULL && + strcmp(mrefp->mnt_mountp, gsfs[i].f_mntonname) != 0) { + continue; + } + if (mrefp->mnt_fstype != NULL && + strcmp(mrefp->mnt_fstype, gsfs[i].f_fstypename) != 0) { + continue; + } + statfs2mnttab(&gsfs[i], mgetp); + return (0); + } + return (-1); +} + +int +getmntent(FILE *fp, struct mnttab *mp) +{ + // struct statfs *sfs; + int error, nfs; + + nfs = (int)lseek(fileno(fp), 0, SEEK_CUR); + if (nfs == -1) + return (errno); + /* If nfs is 0, we want to refresh our cache. 
*/ + if (nfs == 0 || gsfs == NULL) { + error = statfs_init(); + if (error != 0) + return (error); + } + if (nfs >= allfs) + return (-1); + statfs2mnttab(&gsfs[nfs], mp); + if (lseek(fileno(fp), 1, SEEK_CUR) == -1) + return (errno); + return (0); +} diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am index c61b66fce32f..9eb81f090fca 100644 --- a/lib/libuutil/Makefile.am +++ b/lib/libuutil/Makefile.am @@ -19,6 +19,10 @@ libuutil_la_LIBADD = \ $(top_builddir)/lib/libavl/libavl.la \ $(top_builddir)/lib/libspl/libspl.la +if BUILD_FREEBSD +libuutil_la_LDFLAGS = -pthread -version-info 3:0:0 +else libuutil_la_LDFLAGS = -pthread -version-info 1:1:0 +endif EXTRA_DIST = $(USER_C) diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am index 2747a7e9b3e3..0e1e0a53ea7e 100644 --- a/lib/libzfs/Makefile.am +++ b/lib/libzfs/Makefile.am @@ -27,6 +27,15 @@ USER_C = \ libzfs_status.c \ libzfs_util.c + +if BUILD_FREEBSD +USER_C += \ + os/freebsd/libzfs_fsshare.c \ + os/freebsd/libzfs_compat.c \ + os/freebsd/libzfs_ioctl_compat.c \ + os/freebsd/libzfs_zmount.c +endif + if BUILD_LINUX USER_C += \ os/linux/libzfs_mount_os.c \ @@ -35,7 +44,6 @@ USER_C += \ os/linux/libzfs_util_os.c endif - KERNEL_C = \ algs/sha2/sha2.c \ cityhash.c \ @@ -70,7 +78,12 @@ libzfs_la_LIBADD += \ $(top_builddir)/lib/libshare/libshare.la endif +if BUILD_FREEBSD +libzfs_la_LIBADD += -lutil -lgeom +libzfs_la_LDFLAGS = -version-info 4:0:0 +else libzfs_la_LDFLAGS = -version-info 2:0:0 +endif libzfs_la_LIBADD += -lm $(LIBSSL) diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c index c6debd576d5f..71ac72ee1ae3 100644 --- a/lib/libzfs/libzfs_util.c +++ b/lib/libzfs/libzfs_util.c @@ -881,7 +881,7 @@ libzfs_init(void) return (NULL); } - if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { + if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL)) < 0) { free(hdl); return (NULL); } diff --git a/lib/libzfs/os/freebsd/libzfs_compat.c b/lib/libzfs/os/freebsd/libzfs_compat.c new file mode 100644 index 
000000000000..e1c6ef93d5b5 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_compat.c @@ -0,0 +1,323 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int zfs_ioctl_version = ZFS_IOCVER_UNDEF; +// static int zfs_spa_version = -1; + +void +libzfs_set_pipe_max(int infd) +{ + /* FreeBSD automatically resizes */ +} + +static int +execvPe(const char *name, const char *path, char * const *argv, + char * const *envp) +{ + const char **memp; + size_t cnt, lp, ln; + int eacces, save_errno; + char *cur, buf[MAXPATHLEN]; + const char *p, *bp; + struct stat sb; + + eacces = 0; + + /* If it's an absolute or relative path name, it's easy. */ + if (strchr(name, '/')) { + bp = name; + cur = NULL; + goto retry; + } + bp = buf; + + /* If it's an empty path name, fail in the usual POSIX way. 
*/ + if (*name == '\0') { + errno = ENOENT; + return (-1); + } + + cur = alloca(strlen(path) + 1); + if (cur == NULL) { + errno = ENOMEM; + return (-1); + } + strcpy(cur, path); + while ((p = strsep(&cur, ":")) != NULL) { + /* + * It's a SHELL path -- double, leading and trailing colons + * mean the current directory. + */ + if (*p == '\0') { + p = "."; + lp = 1; + } else + lp = strlen(p); + ln = strlen(name); + + /* + * If the path is too long complain. This is a possible + * security issue; given a way to make the path too long + * the user may execute the wrong program. + */ + if (lp + ln + 2 > sizeof (buf)) { + (void) write(STDERR_FILENO, "execvP: ", 8); + (void) write(STDERR_FILENO, p, lp); + (void) write(STDERR_FILENO, ": path too long\n", + 16); + continue; + } + bcopy(p, buf, lp); + buf[lp] = '/'; + bcopy(name, buf + lp + 1, ln); + buf[lp + ln + 1] = '\0'; + +retry: (void) execve(bp, argv, envp); + switch (errno) { + case E2BIG: + goto done; + case ELOOP: + case ENAMETOOLONG: + case ENOENT: + break; + case ENOEXEC: + for (cnt = 0; argv[cnt]; ++cnt) + ; + memp = alloca((cnt + 2) * sizeof (char *)); + if (memp == NULL) { + /* errno = ENOMEM; XXX override ENOEXEC? */ + goto done; + } + memp[0] = "sh"; + memp[1] = bp; + bcopy(argv + 1, memp + 2, cnt * sizeof (char *)); + execve(_PATH_BSHELL, __DECONST(char **, memp), envp); + goto done; + case ENOMEM: + goto done; + case ENOTDIR: + break; + case ETXTBSY: + /* + * We used to retry here, but sh(1) doesn't. + */ + goto done; + default: + /* + * EACCES may be for an inaccessible directory or + * a non-executable file. Call stat() to decide + * which. This also handles ambiguities for EFAULT + * and EIO, and undocumented errors like ESTALE. + * We hope that the race for a stat() is unimportant. 
+ */ + save_errno = errno; + if (stat(bp, &sb) != 0) + break; + if (save_errno == EACCES) { + eacces = 1; + continue; + } + errno = save_errno; + goto done; + } + } + if (eacces) + errno = EACCES; + else + errno = ENOENT; +done: + return (-1); +} + +int +execvpe(const char *name, char * const argv[], char * const envp[]) +{ + const char *path; + + /* Get the path we're searching. */ + if ((path = getenv("PATH")) == NULL) + path = _PATH_DEFPATH; + + return (execvPe(name, path, argv, envp)); +} + +#if 0 +/* + * Get the SPA version + */ +static int +get_zfs_spa_version(void) +{ + size_t ver_size; + int ver = 0; + + ver_size = sizeof (ver); + sysctlbyname("vfs.zfs.version.spa", &ver, &ver_size, NULL, 0); + + return (ver); +} +#endif + +/* + * Get zfs_ioctl_version + */ +int +get_zfs_ioctl_version(void) +{ + size_t ver_size; + int ver = ZFS_IOCVER_NONE; + + ver_size = sizeof (ver); + sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); + + return (ver); +} + +const char * +libzfs_error_init(int error) +{ + + return (strerror(error)); +} + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + return (zfs_ioctl_fd(hdl->libzfs_fd, request, zc)); +} + +/* + * Verify the required ZFS_DEV device is available and optionally attempt + * to load the ZFS modules. Under normal circumstances the modules + * should already have been loaded by some external mechanism. + * + * Environment variables: + * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. + * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV + */ +int +libzfs_load_module(void) +{ + /* XXX: modname is "zfs" but file is named "openzfs". */ + if (modfind("zfs") < 0) { + /* Not present in kernel, try loading it. 
*/ + if (kldload("openzfs") < 0 && errno != EEXIST) { + return (errno); + } + } + return (0); +} + +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + return (0); +} + +int +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, char *name) +{ + return (0); +} + +int +find_shares_object(differ_info_t *di) +{ + return (0); +} + +/* + * Attach/detach the given filesystem to/from the given jail. + */ +int +zfs_jail(zfs_handle_t *zhp, int jailid, int attach) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_cmd_t zc = { { 0 } }; + char errbuf[1024]; + unsigned long cmd; + int ret; + + if (attach) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name); + } + + switch (zhp->zfs_type) { + case ZFS_TYPE_VOLUME: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volumes can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_SNAPSHOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_BOOKMARK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bookmarks can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_POOL: + case ZFS_TYPE_FILESYSTEM: + /* OK */ + ; + } + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_zoneid = jailid; + + cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL; + if ((ret = ioctl(hdl->libzfs_fd, cmd, &zc)) != 0) + zfs_standard_error(hdl, errno, errbuf); + + return (ret); +} + +/* + * Fill given version buffer with zfs kernel version. 
+ * Returns 0 on success, and -1 on error (with errno set) + */ +int +zfs_version_kernel(char *version, int len) +{ + size_t l = len; + + return (sysctlbyname("vfs.zfs.version.module", + version, &l, NULL, 0)); +} diff --git a/lib/libzfs/os/freebsd/libzfs_fsshare.c b/lib/libzfs/os/freebsd/libzfs_fsshare.c new file mode 100644 index 000000000000..0fd75bf2c546 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_fsshare.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" + +#define _PATH_MOUNTDPID "/var/run/mountd.pid" +#define FILE_HEADER "# !!! DO NOT EDIT THIS FILE MANUALLY !!!\n\n" +#define OPTSSIZE 1024 +#define MAXLINESIZE (PATH_MAX + OPTSSIZE) + + +void +sa_fini(sa_handle_t handle) +{ +} + +int +sa_parse_legacy_options(sa_group_t group, char *options, char *proto) +{ + return (SA_OK); +} + + +int +zfs_init_libshare(libzfs_handle_t *zhandle, int service) +{ + return (SA_OK); +} + +/* + * Share the given filesystem according to the options in the specified + * protocol specific properties (sharenfs, sharesmb). We rely + * on "libshare" to do the dirty work for us. + */ +int +zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char sourcestr[ZFS_MAXPROPLEN]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_share_proto_t *curr_proto; + zprop_source_t sourcetype; + int err, ret; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, 0)) + return (0); + + for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { + /* + * Return success if there are no share options. + */ + if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop, + shareopts, sizeof (shareopts), &sourcetype, sourcestr, + ZFS_MAXPROPLEN, B_FALSE) != 0 || + strcmp(shareopts, "off") == 0) + continue; + + ret = zfs_init_libshare(hdl, SA_INIT_SHARE_API); + if (ret != SA_OK) { + (void) zfs_error_fmt(hdl, EZFS_SHARENFSFAILED, + dgettext(TEXT_DOMAIN, "cannot share '%s': %s"), + zfs_get_name(zhp), sa_errorstr(ret)); + return (-1); + } + + /* + * If the 'zoned' property is set, then zfs_is_mountable() + * will have already bailed out if we are in the global zone. + * But local zones cannot be NFS servers, so we ignore it for + * local zones as well. 
+ */ + if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) + continue; + + if (*curr_proto != PROTO_NFS) { + fprintf(stderr, "Unsupported share protocol: %d.\n", + *curr_proto); + continue; + } + + if (strcmp(shareopts, "on") == 0) + err = fsshare(ZFS_EXPORTS_PATH, mountpoint, ""); + else + err = fsshare(ZFS_EXPORTS_PATH, mountpoint, shareopts); + if (err != 0) { + (void) zfs_error_fmt(hdl, + proto_table[*curr_proto].p_share_err, + dgettext(TEXT_DOMAIN, "cannot share '%s'"), + zfs_get_name(zhp)); + return (-1); + } + + } + return (0); +} + +/* + * Unshare a filesystem by mountpoint. + */ +int +unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, + zfs_share_proto_t proto) +{ + int err; + + if (proto != PROTO_NFS) { + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (EOPNOTSUPP); + } + + err = fsunshare(ZFS_EXPORTS_PATH, mountpoint); + if (err != 0) { + zfs_error_aux(hdl, "%s", strerror(err)); + return (zfs_error_fmt(hdl, EZFS_UNSHARENFSFAILED, + dgettext(TEXT_DOMAIN, + "cannot unshare '%s'"), name)); + } + return (0); +} + +zfs_share_type_t +is_shared_impl(libzfs_handle_t *hdl, const char *mountpoint, + zfs_share_proto_t proto) +{ + char buf[MAXPATHLEN], *tab; + + if (hdl->libzfs_sharetab == NULL) + return (SHARED_NOT_SHARED); + + (void) fseek(hdl->libzfs_sharetab, 0, SEEK_SET); + + while (fgets(buf, sizeof (buf), hdl->libzfs_sharetab) != NULL) { + + /* the mountpoint is the first entry on each line */ + if ((tab = strchr(buf, '\t')) == NULL) + continue; + + *tab = '\0'; + if (strcmp(buf, mountpoint) == 0) { + if (proto == PROTO_NFS) + return (SHARED_NFS); + } + } + + return (SHARED_NOT_SHARED); +} + +static void +restart_mountd(void) +{ + struct pidfh *pfh; + pid_t mountdpid; + + pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &mountdpid); + if (pfh != NULL) { + /* Mountd is not running. */ + pidfile_remove(pfh); + return; + } + if (errno != EEXIST) { + /* Cannot open pidfile for some reason. 
*/ + return; + } + /* We have mountd(8) PID in mountdpid variable. */ + kill(mountdpid, SIGHUP); +} + +/* + * Read one line from a file. Skip comments, empty lines and a line with a + * mountpoint specified in the 'skip' argument. + */ +static char * +zgetline(FILE *fd, const char *skip) +{ + static char line[MAXLINESIZE]; + size_t len, skiplen = 0; + char *s, last; + + if (skip != NULL) + skiplen = strlen(skip); + for (;;) { + s = fgets(line, sizeof (line), fd); + if (s == NULL) + return (NULL); + /* Skip empty lines and comments. */ + if (line[0] == '\n' || line[0] == '#') + continue; + len = strlen(line); + if (line[len - 1] == '\n') + line[len - 1] = '\0'; + last = line[skiplen]; + /* Skip the given mountpoint. */ + if (skip != NULL && strncmp(skip, line, skiplen) == 0 && + (last == '\t' || last == ' ' || last == '\0')) { + continue; + } + break; + } + return (line); +} +/* BEGIN CSTYLED */ +/* + * Translate options to a format acceptable by exports(5), e.g. + * + * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 freefall.freebsd.org 69.147.83.54 + * + * Accepted input formats: + * + * ro,network=192.168.0.0,mask=255.255.255.0,maproot=0,freefall.freebsd.org + * ro network=192.168.0.0 mask=255.255.255.0 maproot=0 freefall.freebsd.org + * -ro,-network=192.168.0.0,-mask=255.255.255.0,-maproot=0,freefall.freebsd.org + * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 freefall.freebsd.org + * + * Recognized keywords: + * + * ro, maproot, mapall, mask, network, sec, alldirs, public, webnfs, index, quiet + * + */ +/* END CSTYLED */ + +static const char *known_opts[] = { "ro", "maproot", "mapall", "mask", + "network", "sec", "alldirs", "public", "webnfs", "index", "quiet", + NULL }; +static char * +translate_opts(const char *shareopts) +{ + static char newopts[OPTSSIZE]; + char oldopts[OPTSSIZE]; + char *o, *s = NULL; + unsigned int i; + size_t len; + + strlcpy(oldopts, shareopts, sizeof (oldopts)); + newopts[0] = '\0'; + s = oldopts; + while ((o
= strsep(&s, "-, ")) != NULL) { + if (o[0] == '\0') + continue; + for (i = 0; known_opts[i] != NULL; i++) { + len = strlen(known_opts[i]); + if (strncmp(known_opts[i], o, len) == 0 && + (o[len] == '\0' || o[len] == '=')) { + strlcat(newopts, "-", sizeof (newopts)); + break; + } + } + strlcat(newopts, o, sizeof (newopts)); + strlcat(newopts, " ", sizeof (newopts)); + } + return (newopts); +} + +static int +fsshare_main(const char *file, const char *mountpoint, const char *shareopts, + int share) +{ + char tmpfile[PATH_MAX]; + char *line; + FILE *newfd, *oldfd; + int fd, error; + + newfd = oldfd = NULL; + error = 0; + + /* + * Create temporary file in the same directory, so we can atomically + * rename it. + */ + if (strlcpy(tmpfile, file, sizeof (tmpfile)) >= sizeof (tmpfile)) + return (ENAMETOOLONG); + if (strlcat(tmpfile, ".XXXXXXXX", sizeof (tmpfile)) >= sizeof (tmpfile)) + return (ENAMETOOLONG); + fd = mkstemp(tmpfile); + if (fd == -1) + return (errno); + /* + * File name is random, so we don't really need file lock now, but it + * will be needed after rename(2). + */ + error = flock(fd, LOCK_EX); + assert(error == 0 || (error == -1 && errno == EOPNOTSUPP)); + newfd = fdopen(fd, "r+"); + assert(newfd != NULL); + /* Open old exports file. */ + oldfd = fopen(file, "r"); + if (oldfd == NULL) { + if (share) { + if (errno != ENOENT) { + error = errno; + goto out; + } + } else { + /* If there is no exports file, ignore the error. */ + if (errno == ENOENT) + errno = 0; + error = errno; + goto out; + } + } else { + error = flock(fileno(oldfd), LOCK_EX); + assert(error == 0 || (error == -1 && errno == EOPNOTSUPP)); + error = 0; + } + + /* Place big, fat warning at the beginning of the file.
*/ + fprintf(newfd, "%s", FILE_HEADER); + while (oldfd != NULL && (line = zgetline(oldfd, mountpoint)) != NULL) + fprintf(newfd, "%s\n", line); + if (oldfd != NULL && ferror(oldfd) != 0) { + error = ferror(oldfd); + goto out; + } + if (ferror(newfd) != 0) { + error = ferror(newfd); + goto out; + } + if (share) { + fprintf(newfd, "%s\t%s\n", mountpoint, + translate_opts(shareopts)); + } + +out: + if (error != 0) + unlink(tmpfile); + else { + if (rename(tmpfile, file) == -1) { + error = errno; + unlink(tmpfile); + } else { + fflush(newfd); + /* + * Send SIGHUP to mountd, but unlock exports file later. + */ + restart_mountd(); + } + } + if (oldfd != NULL) { + flock(fileno(oldfd), LOCK_UN); + fclose(oldfd); + } + if (newfd != NULL) { + flock(fileno(newfd), LOCK_UN); + fclose(newfd); + } + return (error); +} + +/* + * Add the given mountpoint to the given exports file. + */ +int +fsshare(const char *file, const char *mountpoint, const char *shareopts) +{ + + return (fsshare_main(file, mountpoint, shareopts, 1)); +} + +/* + * Remove the given mountpoint from the given exports file. + */ +int +fsunshare(const char *file, const char *mountpoint) +{ + + return (fsshare_main(file, mountpoint, NULL, 0)); +} diff --git a/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c b/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c new file mode 100644 index 000000000000..18b93fe27969 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c @@ -0,0 +1,432 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Portions Copyright 2005, 2010, Oracle and/or its affiliates. + * All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" +#include + +/* + * FreeBSD zfs_cmd compatibility with older binaries + * appropriately remap/extend the zfs_cmd_t structure + */ +void +zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) +{ + +} +#if 0 +static int +zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, + nvlist_t **nvp) +{ + char *packed; + int error; + nvlist_t *list = NULL; + + /* + * Read in and unpack the user-supplied nvlist. 
+ */ + if (size == 0) + return (EINVAL); + +#ifdef _KERNEL + packed = kmem_alloc(size, KM_SLEEP); + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { + kmem_free(packed, size); + return (error); + } +#else + packed = (void *)(uintptr_t)nvl; +#endif + + error = nvlist_unpack(packed, size, &list, 0); + +#ifdef _KERNEL + kmem_free(packed, size); +#endif + + if (error != 0) + return (error); + + *nvp = list; + return (0); +} + +static int +zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + int error = 0; + size_t size; + + VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + +#ifdef _KERNEL + packed = kmem_alloc(size, KM_SLEEP); + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); + + if (ddi_copyout(packed, + (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) + error = EFAULT; + kmem_free(packed, size); +#else + packed = (void *)(uintptr_t)zc->zc_nvlist_dst; + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + 0) == 0); +#endif + + zc->zc_nvlist_dst_size = size; + return (error); +} + +static void +zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) +{ + nvlist_t **child; + nvlist_t *nvroot = NULL; + vdev_stat_t *vs; + uint_t c, children, nelem; + + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + zfs_ioctl_compat_fix_stats_nvlist(child[c]); + } + } + + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0) + zfs_ioctl_compat_fix_stats_nvlist(nvroot); + if ((nvlist_lookup_uint64_array(nvl, "stats", + (uint64_t **)&vs, &nelem) == 0)) { + nvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)vs, nelem); + nvlist_remove(nvl, "stats", + DATA_TYPE_UINT64_ARRAY); + } +} + + +static int +zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) +{ + nvlist_t *nv, *nvp = NULL; + nvpair_t *elem; + int error; + + if ((error = 
zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) + return (error); + + if (nc == 5) { /* ZFS_IOC_POOL_STATS */ + elem = NULL; + while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { + if (nvpair_value_nvlist(elem, &nvp) == 0) + zfs_ioctl_compat_fix_stats_nvlist(nvp); + } + elem = NULL; + } else + zfs_ioctl_compat_fix_stats_nvlist(nv); + + error = zfs_ioctl_compat_put_nvlist(zc, nv); + + nvlist_free(nv); + + return (error); +} + +static int +zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) +{ + nvlist_t *nv, *nva = NULL; + int error; + + if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) + return (error); + + if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { + nvlist_add_nvlist(nv, "allocated", nva); + nvlist_remove(nv, "used", DATA_TYPE_NVLIST); + } + + if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { + nvlist_add_nvlist(nv, "free", nva); + nvlist_remove(nv, "available", DATA_TYPE_NVLIST); + } + + error = zfs_ioctl_compat_put_nvlist(zc, nv); + + nvlist_free(nv); + + return (error); +} +#endif + +#ifdef _KERNEL +int +zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag) +{ + int error = 0; + + /* are we creating a clone? 
*/ + if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') + *vec = ZFS_IOC_CLONE; + + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (*vec) { + + case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ + zc->zc_cookie = POOL_SCAN_SCRUB; + break; + } + } + + return (error); +} + +void +zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) +{ + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (vec) { + case ZFS_IOC_POOL_CONFIGS: + case ZFS_IOC_POOL_STATS: + case ZFS_IOC_POOL_TRYIMPORT: + zfs_ioctl_compat_fix_stats(zc, vec); + break; + case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ + zfs_ioctl_compat_pool_get_props(zc); + break; + } + } +} + +nvlist_t * +zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t *innvl, const int vec, + const int cflag) +{ + nvlist_t *nvl, *tmpnvl, *hnvl; + nvpair_t *elem; + char *poolname, *snapname; + int err; + + if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) + goto out; + + switch (vec) { + case ZFS_IOC_CREATE: + nvl = fnvlist_alloc(); + fnvlist_add_int32(nvl, "type", zc->zc_objset_type); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_CLONE: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "origin", zc->zc_value); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_SNAPSHOT: + if (innvl == NULL) + goto out; + nvl = fnvlist_alloc(); + fnvlist_add_nvlist(nvl, "props", innvl); + tmpnvl = fnvlist_alloc(); + snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + fnvlist_add_boolean(tmpnvl, snapname); + kmem_free(snapname, strlen(snapname) + 1); + /* check if we are doing a recursive snapshot */ + if (zc->zc_cookie) + dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, + tmpnvl); + fnvlist_add_nvlist(nvl, "snaps", tmpnvl); +
fnvlist_free(tmpnvl); + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_SPACE_SNAPS: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "firstsnap", zc->zc_value); + if (innvl != NULL) + nvlist_free(innvl); + return (nvl); + break; + case ZFS_IOC_DESTROY_SNAPS: + if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) + goto out; + nvl = fnvlist_alloc(); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "snaps", innvl); + } else { + /* + * We are probably called by even older binaries, + * allocate and populate nvlist with recursive + * snapshots + */ + if (zfs_component_namecheck(zc->zc_value, NULL, + NULL) == 0) { + tmpnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, tmpnvl) == 0) + fnvlist_add_nvlist(nvl, "snaps", + tmpnvl); + nvlist_free(tmpnvl); + } + } + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_HOLD: + nvl = fnvlist_alloc(); + tmpnvl = fnvlist_alloc(); + if (zc->zc_cleanup_fd != -1) + fnvlist_add_int32(nvl, "cleanup_fd", + (int32_t)zc->zc_cleanup_fd); + if (zc->zc_cookie) { + hnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, hnvl) == 0) { + elem = NULL; + while ((elem = nvlist_next_nvpair(hnvl, + elem)) != NULL) { + nvlist_add_string(tmpnvl, + nvpair_name(elem), zc->zc_string); + } + } + nvlist_free(hnvl); + } else { + snapname = kmem_asprintf("%s@%s", zc->zc_name, + zc->zc_value); + nvlist_add_string(tmpnvl, snapname, zc->zc_string); + kmem_free(snapname, strlen(snapname) + 1); + } + fnvlist_add_nvlist(nvl, "holds", tmpnvl); + nvlist_free(tmpnvl); + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_RELEASE: + nvl =
fnvlist_alloc(); + tmpnvl = fnvlist_alloc(); + if (zc->zc_cookie) { + hnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, hnvl) == 0) { + elem = NULL; + while ((elem = nvlist_next_nvpair(hnvl, + elem)) != NULL) { + fnvlist_add_boolean(tmpnvl, + zc->zc_string); + fnvlist_add_nvlist(nvl, + nvpair_name(elem), tmpnvl); + } + } + nvlist_free(hnvl); + } else { + snapname = kmem_asprintf("%s@%s", zc->zc_name, + zc->zc_value); + fnvlist_add_boolean(tmpnvl, zc->zc_string); + fnvlist_add_nvlist(nvl, snapname, tmpnvl); + kmem_free(snapname, strlen(snapname) + 1); + } + nvlist_free(tmpnvl); + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + } +out: + return (innvl); +} + +nvlist_t * +zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t *outnvl, const int vec, + const int cflag) +{ + nvlist_t *tmpnvl; + + if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) + return (outnvl); + + switch (vec) { + case ZFS_IOC_SPACE_SNAPS: + (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); + (void) nvlist_lookup_uint64(outnvl, "compressed", + &zc->zc_objset_type); + (void) nvlist_lookup_uint64(outnvl, "uncompressed", + &zc->zc_perm_action); + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + case ZFS_IOC_CREATE: + case ZFS_IOC_CLONE: + case ZFS_IOC_HOLD: + case ZFS_IOC_RELEASE: + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + } + + return (outnvl); +} +#endif /* _KERNEL */ diff --git a/lib/libzfs/os/freebsd/libzfs_zmount.c b/lib/libzfs/os/freebsd/libzfs_zmount.c new file mode 100644 index 000000000000..8ff24f446bdc --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -0,0
+1,147 @@ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file implements Solaris compatible zmount() function. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void +build_iovec(struct iovec **iov, int *iovlen, const char *name, void *val, + size_t len) +{ + int i; + + if (*iovlen < 0) + return; + i = *iovlen; + *iov = realloc(*iov, sizeof (**iov) * (i + 2)); + if (*iov == NULL) { + *iovlen = -1; + return; + } + (*iov)[i].iov_base = strdup(name); + (*iov)[i].iov_len = strlen(name) + 1; + i++; + (*iov)[i].iov_base = val; + if (len == (size_t)-1) { + if (val != NULL) + len = strlen(val) + 1; + else + len = 0; + } + (*iov)[i].iov_len = (int)len; + *iovlen = ++i; +} + +static int +do_mount_(const char *spec, const char *dir, int mflag, char *fstype, + char *dataptr, int datalen, char *optptr, int optlen) +{ + struct iovec *iov; + char *optstr, *p, *tofree; + int iovlen, rv; + + assert(spec != NULL); + assert(dir != NULL); + assert(fstype != NULL); + assert(strcmp(fstype, MNTTYPE_ZFS) == 0); + assert(dataptr == NULL); + assert(datalen == 0); + assert(optptr != NULL); + assert(optlen > 0); + + tofree = optstr = strdup(optptr); + assert(optstr != NULL); + + iov = NULL; + iovlen = 0; + if (strstr(optstr, MNTOPT_REMOUNT) != NULL) + build_iovec(&iov, &iovlen, "update", NULL, 0); + if (strstr(optstr, MNTOPT_NOXATTR) == NULL && + strstr(optstr, MNTOPT_XATTR) == NULL && + strstr(optstr, MNTOPT_SAXATTR) == NULL && + strstr(optstr, MNTOPT_DIRXATTR) == NULL) + build_iovec(&iov, &iovlen, "xattr", NULL, 0); + if (mflag & MS_RDONLY) + build_iovec(&iov, &iovlen, "ro", NULL, 0); + build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1); + build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir), + (size_t)-1); + build_iovec(&iov, &iovlen, "from", __DECONST(char *, spec), (size_t)-1); + while ((p = strsep(&optstr, ",/")) != NULL) + build_iovec(&iov, &iovlen, p, NULL, (size_t)-1); + rv = nmount(iov, iovlen, 0); + free(tofree); + if (rv < 0) + return (errno); + return (rv); +} + +int 
+do_mount(const char *src, const char *mntpt, char *opts, int flags) +{ + /* optlen is only sanity-checked by do_mount_(); pass opts' length. */ + return (do_mount_(src, mntpt, flags, MNTTYPE_ZFS, NULL, 0, opts, + opts != NULL ? (int)strlen(opts) + 1 : 0)); +} + +int +do_unmount(const char *mntpt, int flags) +{ + + return (unmount(mntpt, flags)); +} + +int +zfs_mount_delegation_check(void) +{ + return (0); +} + +/* + * Check if we are doing an overlay mount. + * Returns B_TRUE if the mount would overlay, otherwise B_FALSE. + */ +boolean_t +zfs_mount_overlay_check(const char *mountpoint) +{ + /* FreeBSD always allows overlay mounts. */ + return (B_FALSE); +} diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am index dca81e01ab37..617b1cf3295b 100644 --- a/lib/libzfs_core/Makefile.am +++ b/lib/libzfs_core/Makefile.am @@ -12,6 +12,10 @@ libzfs_core_la_LIBADD = \ $(top_builddir)/lib/libuutil/libuutil.la \ $(top_builddir)/lib/libzutil/libzutil.la +if BUILD_FREEBSD +libzfs_core_la_LDFLAGS = -version-info 3:0:0 +libzfs_core_la_LIBADD += -lutil -lgeom +else libzfs_core_la_LDFLAGS = -version-info 1:0:0 - +endif EXTRA_DIST = $(USER_C) diff --git a/lib/libzfs_core/libzfs_core_compat.h b/lib/libzfs_core/libzfs_core_compat.h new file mode 100644 index 000000000000..6527c4b2576f --- /dev/null +++ b/lib/libzfs_core/libzfs_core_compat.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Martin Matuska . All rights reserved. + */ + +#ifndef _LIBZFS_CORE_COMPAT_H +#define _LIBZFS_CORE_COMPAT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int lzc_compat_pre(zfs_cmd_t *, zfs_ioc_t *, nvlist_t **); +void lzc_compat_post(zfs_cmd_t *, const zfs_ioc_t); +int lzc_compat_outnvl(zfs_cmd_t *, const zfs_ioc_t, nvlist_t **); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_CORE_COMPAT_H */ diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 4225246b968c..a9396105bc6b 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -199,8 +199,13 @@ libzpool_la_LIBADD = \ $(top_builddir)/lib/libunicode/libunicode.la \ $(top_builddir)/lib/libzutil/libzutil.la +if BUILD_FREEBSD +libzpool_la_LIBADD += $(ZLIB) -ldl -lgeom +libzpool_la_LDFLAGS = -pthread -version-info 4:0:0 +else libzpool_la_LIBADD += $(ZLIB) -ldl libzpool_la_LDFLAGS = -pthread -version-info 2:0:0 +endif EXTRA_DIST = $(USER_C) diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am index e5c6a340d282..092312e0dcd0 100644 --- a/lib/libzutil/Makefile.am +++ b/lib/libzutil/Makefile.am @@ -21,13 +21,24 @@ USER_C += \ os/linux/zutil_compat.c endif +if BUILD_FREEBSD +USER_C += \ + os/freebsd/zutil_device_path_os.c \ + os/freebsd/zutil_import_os.c \ + os/freebsd/zutil_compat.c +endif + nodist_libzutil_la_SOURCES = $(USER_C) libzutil_la_LIBADD = \ $(top_builddir)/lib/libavl/libavl.la \ - $(top_builddir)/lib/libefi/libefi.la \ $(top_builddir)/lib/libtpool/libtpool.la +if BUILD_LINUX +libzutil_la_LIBADD += \ + $(top_builddir)/lib/libefi/libefi.la +endif + libzutil_la_LIBADD += -lm $(LIBBLKID) $(LIBUDEV) EXTRA_DIST = $(USER_C) diff --git 
a/lib/libzutil/os/freebsd/zutil_compat.c b/lib/libzutil/os/freebsd/zutil_compat.c new file mode 100644 index 000000000000..5b0fe85485db --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_compat.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include +#include +#include + +/* + * Issue the ZFS ioctl 'request' on fd. The zfs_cmd_t is not handed to + * the kernel directly: it is described by a zfs_iocparm_t holding its + * address, its size and the ZFS_IOCVER_ZOF ioctl version. Returns the + * ioctl(2) result. + * + * Only ZFS_CMD_COMPAT_NONE is supported; any other cflag calls abort(). + */ +static int +zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) +{ + unsigned long ncmd; + zfs_iocparm_t zp; + + switch (cflag) { + case ZFS_CMD_COMPAT_NONE: + ncmd = _IOWR('Z', request, zfs_iocparm_t); + zp.zfs_cmd = (uint64_t)(uintptr_t)zc; + zp.zfs_cmd_size = sizeof (zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_ZOF; + return (ioctl(fd, ncmd, &zp)); + default: + abort(); + return (EINVAL); + } +} + +/* + * This is FreeBSD version of ioctl, because Solaris' ioctl() updates + * zc_nvlist_dst_size even if an error is returned, on FreeBSD if an + * error is returned zc_nvlist_dst_size won't be updated.
+ */ +int +zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +{ + size_t oldsize; + int ret, cflag = ZFS_CMD_COMPAT_NONE; + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, request, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} diff --git a/lib/libzutil/os/freebsd/zutil_device_path_os.c b/lib/libzutil/os/freebsd/zutil_device_path_os.c new file mode 100644 index 000000000000..71c936005242 --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_device_path_os.c @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +/* + * We don't strip/append partitions on FreeBSD. + */ + +/* + * Note: The caller must free the returned string. + */ +char * +zfs_strip_partition(char *dev) +{ + return (strdup(dev)); +} + +int +zfs_append_partition(char *path, size_t max_len) +{ + return (strnlen(path, max_len)); +} + +/* + * Strip the path from a device name. + * On FreeBSD we only want to remove "/dev/" from the beginning of + * paths if present. 
+ */ +char * +zfs_strip_path(char *path) +{ + if (strncmp(path, _PATH_DEV, sizeof (_PATH_DEV) - 1) == 0) + return (path + sizeof (_PATH_DEV) - 1); + else + return (path); +} + +char * +zfs_get_underlying_path(const char *dev_name) +{ + + if (dev_name == NULL) + return (NULL); + + return (realpath(dev_name, NULL)); +} + +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + int fd; + + fd = g_open(dev_name, 0); + if (fd >= 0) { + g_close(fd); + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Wait up to timeout_ms for the device node at path to appear. The path + * is polled with stat64(), sleeping between attempts. Returns 0 once a + * stat64() succeeds at least settle_ms (50ms) after the first success, + * the errno of any stat64() failure other than ENOENT, or ENODEV when + * timeout_ms expires first. No udev/libudev calls are made here; the + * node is simply polled. + */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} + +/* ARGSUSED */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c new file mode 100644 index 000000000000..7272a1e0ef0a --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_import_os.c @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright 2016 Nexenta Systems, Inc. + */ + +/* + * Pool import support functions. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include "zutil_import.h" + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * On FreeBSD we currently just strip devid and phys_path to avoid confusion. + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); +} + +/* + * Device names under /dev/ that are skipped; each is matched as a prefix. + */ +static const char * const blacklist_devs[] = { + "nfslock", + "sequencer", + "zfs", +}; +#define BLACKLIST_DIR "/dev/" +#define BLACKLIST_DIR_LEN 5 + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + size_t i; + int num_labels; + int fd; + off_t mediasize = 0; + + /* + * Skip /dev/ entries whose name begins with a blacklisted name. + */ + if (strncmp(rn->rn_name, BLACKLIST_DIR, BLACKLIST_DIR_LEN) == 0) { + char *name = rn->rn_name + BLACKLIST_DIR_LEN; + for (i = 0; i < nitems(blacklist_devs); ++i) { + const char *badname = blacklist_devs[i]; + size_t len = strlen(badname); + if (strncmp(name, badname, len) == 0) { + return; + } + } + } + + /* + * O_NONBLOCK so we don't hang trying to open things like serial ports. + */ + if ((fd = open(rn->rn_name, O_RDONLY|O_NONBLOCK)) < 0) + return; + + /* + * Ignore failed stats. + */ + if (fstat64(fd, &statbuf) != 0) + goto out; + /* + * We only want regular files, character devs and block devs. 
+ */ + if (S_ISREG(statbuf.st_mode)) { + /* Check if this file is too small to hold a zpool. */ + if (statbuf.st_size < SPA_MINDEVSIZE) { + goto out; + } + } else if (S_ISCHR(statbuf.st_mode) || S_ISBLK(statbuf.st_mode)) { + /* Check if this device is too small to hold a zpool. */ + if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) != 0 || + mediasize < SPA_MINDEVSIZE) { + goto out; + } + } else { + goto out; + } + + if (zpool_read_label(fd, &config, &num_labels) != 0) + goto out; + if (num_labels == 0) { + nvlist_free(config); + goto out; + } + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* TODO: Reuse labelpaths logic from Linux? */ +out: + (void) close(fd); +} + +static const char * +zpool_default_import_path[] = { + "/dev" +}; + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = nitems(zpool_default_import_path); + return (zpool_default_import_path); +} + +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + char *end, path[MAXPATHLEN]; + rdsk_node_t *slice; + struct gmesh mesh; + struct gclass *mp; + struct ggeom *gp; + struct gprovider *pp; + avl_index_t where; + size_t pathleft; + int error; + + end = stpcpy(path, "/dev/"); + pathleft = &path[sizeof (path)] - end; + + error = geom_gettree(&mesh); + if (error != 0) + return (error); + + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + + LIST_FOREACH(mp, &mesh.lg_class, lg_class) { + LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { + LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { + strlcpy(end, pp->lg_name, pathleft); + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_FALSE; + slice->rn_order = IMPORT_ORDER_DEFAULT; + + 
pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + } + } + + geom_deletetree(&mesh); + + return (0); +} + +int +zfs_dev_flush(int fd __unused) +{ + return (0); +} diff --git a/module/.gitignore b/module/.gitignore index 45e5f9922235..5f3d70487296 100644 --- a/module/.gitignore +++ b/module/.gitignore @@ -2,6 +2,8 @@ *.ko.unsigned *.ko.out *.ko.out.sig +*.ko.debug +*.ko.full *.dwo .*.cmd .*.d @@ -11,5 +13,13 @@ /.tmp_versions /Module.markers /Module.symvers +/vnode_if* +/bus_if.h +/device_if.h +/opt_global.h + +/export_syms +/machine +/x86 !Makefile.in diff --git a/module/Makefile.bsd b/module/Makefile.bsd new file mode 100644 index 000000000000..6d76796f51ed --- /dev/null +++ b/module/Makefile.bsd @@ -0,0 +1,381 @@ +.if !defined(WITH_CTF) +WITH_CTF=1 +.endif + +.include + +SRCDIR= ${.CURDIR} +INCDIR=${.CURDIR:H}/include + +KMOD= openzfs + +.PATH: ${SRCDIR}/avl \ + ${SRCDIR}/lua \ + ${SRCDIR}/nvpair \ + ${SRCDIR}/os/freebsd/spl \ + ${SRCDIR}/os/freebsd/zfs \ + ${SRCDIR}/unicode \ + ${SRCDIR}/zcommon \ + ${SRCDIR}/zfs + + +CFLAGS+= -I${INCDIR} +CFLAGS+= -I${INCDIR}/spl +CFLAGS+= -I${INCDIR}/os/freebsd +CFLAGS+= -I${INCDIR}/os/freebsd/spl +CFLAGS+= -I${INCDIR}/os/freebsd/zfs +CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h + +CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 +CFLAGS+= -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ +CFLAGS+= -D_SYS_VMEM_H_ -D_MACHINE_ENDIAN_H_ -DKDTRACE_HOOKS -DSMP + +.if ${MACHINE_ARCH} == "amd64" +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F +.endif + +.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" +CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS +.else +CFLAGS += -DNDEBUG +.endif + +.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" +# kernel must also be built with 
this option for this to work +CFLAGS+= -DDEBUG_VFS_LOCKS +.endif + +.if defined(WITH_GCOV) && ${WITH_GCOV} == "true" +CFLAGS+= -fprofile-arcs -ftest-coverage +.endif + +DEBUG_FLAGS=-g + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +CFLAGS+= -DBITS_PER_LONG=32 +.else +CFLAGS+= -DBITS_PER_LONG=64 +.endif + +SRCS= vnode_if.h device_if.h bus_if.h + +# avl +SRCS+= avl.c + +#lua +SRCS+= lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +#nvpair +SRCS+= nvpair.c \ + fnvpair.c \ + nvpair_alloc_spl.c \ + nvpair_alloc_fixed.c + +#os/freebsd/spl +SRCS+= acl_common.c \ + btree.c \ + callb.c \ + list.c \ + spl_acl.c \ + spl_cmn_err.c \ + spl_dtrace.c \ + spl_kmem.c \ + spl_kstat.c \ + spl_misc.c \ + spl_policy.c \ + spl_string.c \ + spl_sunddi.c \ + spl_sysevent.c \ + spl_taskq.c \ + spl_uio.c \ + spl_vfs.c \ + spl_vm.c \ + spl_zone.c \ + sha256c.c \ + sha512c.c \ + spl_procfs_list.c \ + spl_zlib.c + + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +SRCS+= spl_atomic.c +.endif + +#os/freebsd/zfs +SRCS+= abd.c \ + crypto_os.c \ + dmu_os.c \ + hkdf.c \ + kmod_core.c \ + spa_os.c \ + sysctl_os.c \ + vdev_file.c \ + vdev_label_os.c \ + vdev_geom.c \ + zfs_acl.c \ + zfs_ctldir.c \ + zfs_dir.c \ + zfs_ioctl_os.c \ + zfs_log.c \ + zfs_replay.c \ + zfs_vfsops.c \ + zfs_vnops.c \ + zfs_znode.c \ + zio_crypt.c \ + zvol_os.c + +#unicode +SRCS+= uconv.c \ + u8_textprep.c + +#zcommon +SRCS+= zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zpool_prop.c \ 
+ zprop_common.c + +#zfs +SRCS+= aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + bptree.c \ + bqueue.c \ + dataset_kstats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_bookmark.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_destroy.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_userhold.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_indirect.c \ + vdev_indirect_births.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_file_os.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_fuid_os.c \ + zfs_ioctl.c \ + zfs_onexit.c \ + zfs_quota.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zfs_sa.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c \ + zvol.c + +beforeinstall: +.if ${MK_DEBUG_FILES} != "no" + 
mtree -eu \ + -f /etc/mtree/BSD.debug.dist \ + -p ${DESTDIR}/usr/lib +.endif + +.include + + +CFLAGS.gcc+= -Wno-pointer-to-int-cast + +CFLAGS.lapi.c= -Wno-cast-qual +CFLAGS.lcompat.c= -Wno-cast-qual -Wno-missing-prototypes +CFLAGS.lobject.c= -Wno-cast-qual +CFLAGS.ltable.c= -Wno-cast-qual +CFLAGS.lvm.c= -Wno-cast-qual +CFLAGS.nvpair.c= -Wno-cast-qual +CFLAGS.acl_common.c= -Wno-strict-prototypes -Wno-missing-prototypes +CFLAGS.callb.c= -Wno-strict-prototypes -Wno-missing-prototypes +CFLAGS.spl_kstat.c= -Wno-missing-prototypes +CFLAGS.spl_string.c= -Wno-cast-qual +CFLAGS.spl_vm.c= -Wno-cast-qual -Wno-missing-prototypes +CFLAGS.spl_zlib.c= -Wno-cast-qual +CFLAGS.abd.c= -Wno-cast-qual +CFLAGS.freebsd_dmu.c= -Wno-missing-prototypes +CFLAGS.freebsd_kmod.c= -Wno-missing-prototypes +CFLAGS.vdev_geom.c= -Wno-missing-prototypes +CFLAGS.zfs_acl.c= -Wno-missing-prototypes +CFLAGS.zfs_ctldir.c= -Wno-missing-prototypes -Wno-strict-prototypes +CFLAGS.zfs_log.c= -Wno-cast-qual +CFLAGS.zfs_vfsops.c= -Wno-missing-prototypes +CFLAGS.zfs_vnops.c= -Wno-missing-prototypes -Wno-strict-prototypes -Wno-pointer-arith +CFLAGS.zfs_znode.c= -Wno-missing-prototypes +CFLAGS.zvol.c= -Wno-missing-prototypes +CFLAGS.u8_textprep.c= -Wno-cast-qual +CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zprop_common.c= -Wno-cast-qual +CFLAGS.arc.c= -Wno-missing-prototypes +CFLAGS.blkptr.c= -Wno-missing-prototypes +CFLAGS.dbuf.c= -Wno-missing-prototypes +CFLAGS.dbuf_stats.c= -Wno-missing-prototypes +CFLAGS.ddt.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.dmu.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.dmu_object.c= -Wno-missing-prototypes +CFLAGS.dmu_objset.c= -Wno-missing-prototypes +CFLAGS.dmu_traverse.c= -Wno-cast-qual +CFLAGS.dsl_dir.c= -Wno-missing-prototypes -Wno-cast-qual 
+CFLAGS.dsl_crypt.c= -Wno-missing-prototypes +CFLAGS.dsl_deadlist.c= -Wno-cast-qual +CFLAGS.dsl_pool.c= -Wno-missing-prototypes +CFLAGS.dsl_prop.c= -Wno-cast-qual +CFLAGS.dsl_scan.c= -Wno-missing-prototypes +CFLAGS.fm.c= -Wno-cast-qual +CFLAGS.gzip.c= -Wno-missing-prototypes +CFLAGS.lzjb.c= -Wno-missing-prototypes +CFLAGS.lz4.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.metaslab.c= -Wno-missing-prototypes +CFLAGS.sa.c= -Wno-missing-prototypes +CFLAGS.sha256.c= -Wno-missing-prototypes +CFLAGS.skein_zfs.c= -Wno-missing-prototypes +CFLAGS.spa.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.spa_boot.c= -Wno-missing-prototypes +CFLAGS.spa_misc.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.space_map.c= -Wno-missing-prototypes +CFLAGS.vdev.c= -Wno-missing-prototypes +CFLAGS.vdev_indirect.c= -Wno-missing-prototypes +CFLAGS.vdev_label.c= -Wno-missing-prototypes +CFLAGS.vdev_queue.c= -Wno-missing-prototypes +CFLAGS.vdev_raidz.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual -Wno-missing-prototypes +CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_leaf.c= -Wno-cast-qual +CFLAGS.zap_micro.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.zcp.c= -Wno-cast-qual +CFLAGS.zcp_get.c= -Wno-missing-prototypes +CFLAGS.zfs_debug.c= -Wno-missing-prototypes +CFLAGS.zfs_fm.c= -Wno-cast-qual +CFLAGS.zfs_ioctl.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.zil.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.zio.c= -Wno-missing-prototypes -Wno-cast-qual +CFLAGS.zio_checksum.c= -Wno-missing-prototypes +CFLAGS.zle.c= -Wno-missing-prototypes +CFLAGS.zrlock.c= -Wno-missing-prototypes -Wno-cast-qual diff --git a/module/Makefile.in b/module/Makefile.in index 24af81aa771a..39acdac20b15 100644 --- a/module/Makefile.in +++ 
b/module/Makefile.in @@ -12,7 +12,7 @@ obj-m += os/linux/zfs/ INSTALL_MOD_DIR ?= extra ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement -ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h ZFS_MODULE_CFLAGS += -I@abs_top_srcdir@/include/os/linux/kernel ZFS_MODULE_CFLAGS += -I@abs_top_srcdir@/include/os/linux/spl @@ -40,6 +40,11 @@ modules-Linux: done $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules +# Only pass down gmake -j flag, if used. +modules-FreeBSD: + flags="$$(echo $$MAKEFLAGS | awk -v RS=' ' /^-j/)"; \ + env MAKEFLAGS="" make $${flags} -f Makefile.bsd + modules-unknown: @true @@ -55,6 +60,10 @@ clean-Linux: find . -name '*.ur-safe' -type f -print | xargs $(RM) +clean-FreeBSD: + flags="$$(echo $$MAKEFLAGS | awk -v RS=' ' /^-j/)"; \ + env MAKEFLAGS="" make $${flags} -f Makefile.bsd clean + clean: clean-@ac_system@ modules_install-Linux: @@ -100,6 +109,11 @@ cscopelist-am: $(am__tagged_files) fi; \ done >> $(top_builddir)/cscope.files +modules_install-FreeBSD: + @# Install the kernel modules + flags="$$(echo $$MAKEFLAGS | awk -v RS=' ' /^-j/)"; \ + env MAKEFLAGS="" make $${flags} -f Makefile.bsd install + modules_install: modules_install-@ac_system@ modules_uninstall-Linux: @@ -109,11 +123,16 @@ modules_uninstall-Linux: $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ done +modules_uninstall-FreeBSD: + @false + modules_uninstall: modules_uninstall-@ac_system@ distdir: list='$(obj-m)'; for objdir in $$list; do \ - (cd @top_srcdir@/module && find $$objdir \ - -name '*.c' -o -name '*.h' -o -name '*.S' | \ - xargs -r cp --parents -t @abs_top_builddir@/module/$$distdir); \ + (cd @top_srcdir@/module && find $$objdir -name '*.[chS]' | \ + while read path; do \ + mkdir -p @abs_top_builddir@/module/$$distdir/$${path%/*}; \ + cp $$path @abs_top_builddir@/module/$$distdir/$$path; \ + done); \ done diff --git 
a/module/os/freebsd/spl/acl_common.c b/module/os/freebsd/spl/acl_common.c new file mode 100644 index 000000000000..8eea4695eb08 --- /dev/null +++ b/module/os/freebsd/spl/acl_common.c @@ -0,0 +1,1731 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#if defined(_KERNEL) +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define ASSERT assert +#endif + +#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ + ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ + ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) + + +#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 +#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 +#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 +#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 + +#define ACL_WRITE_OWNER_SET_DENY 0x0000010 +#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 +#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 +#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 + +#define ACL_DELETE_SET_DENY 0x0000100 +#define ACL_DELETE_SET_ALLOW 0x0000200 +#define ACL_DELETE_ERR_DENY 0x0000400 +#define ACL_DELETE_ERR_ALLOW 0x0000800 + +#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 +#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 +#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 +#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 + +#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 +#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 +#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 +#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 + +#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 +#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 +#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 +#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 + +#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 +#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 +#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 +#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 + + +#define ACE_VALID_MASK_BITS (\ + ACE_READ_DATA | \ + ACE_LIST_DIRECTORY | \ + ACE_WRITE_DATA | \ + ACE_ADD_FILE | \ + ACE_APPEND_DATA | \ + ACE_ADD_SUBDIRECTORY | \ + ACE_READ_NAMED_ATTRS | \ + ACE_WRITE_NAMED_ATTRS | \ + ACE_EXECUTE | \ + 
ACE_DELETE_CHILD | \ + ACE_READ_ATTRIBUTES | \ + ACE_WRITE_ATTRIBUTES | \ + ACE_DELETE | \ + ACE_READ_ACL | \ + ACE_WRITE_ACL | \ + ACE_WRITE_OWNER | \ + ACE_SYNCHRONIZE) + +#define ACE_MASK_UNDEFINED 0x80000000 + +#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ + ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ + ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) + +/* + * ACL conversion helpers + */ + +typedef enum { + ace_unused, + ace_user_obj, + ace_user, + ace_group, /* includes GROUP and GROUP_OBJ */ + ace_other_obj +} ace_to_aent_state_t; + +typedef struct acevals { + uid_t key; + avl_node_t avl; + uint32_t mask; + uint32_t allowed; + uint32_t denied; + int aent_type; +} acevals_t; + +typedef struct ace_list { + acevals_t user_obj; + avl_tree_t user; + int numusers; + acevals_t group_obj; + avl_tree_t group; + int numgroups; + acevals_t other_obj; + uint32_t acl_mask; + int hasmask; + int dfacl_flag; + ace_to_aent_state_t state; + int seen; /* bitmask of all aclent_t a_type values seen */ +} ace_list_t; + +/* + * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. 
+ * v = Ptr to array/vector of objs + * n = # objs in the array + * s = size of each obj (must be multiples of a word size) + * f = ptr to function to compare two objs + * returns (-1 = less than, 0 = equal, 1 = greater than + */ +void +ksort(caddr_t v, int n, int s, int (*f)()) +{ + int g, i, j, ii; + unsigned int *p1, *p2; + unsigned int tmp; + + /* No work to do */ + if (v == NULL || n <= 1) + return; + + /* Sanity check on arguments */ + ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); + ASSERT(s > 0); + for (g = n / 2; g > 0; g /= 2) { + for (i = g; i < n; i++) { + for (j = i - g; j >= 0 && + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { + p1 = (void *)(v + j * s); + p2 = (void *)(v + (j + g) * s); + for (ii = 0; ii < s / 4; ii++) { + tmp = *p1; + *p1++ = *p2; + *p2++ = tmp; + } + } + } + } +} + +/* + * Compare two acls, all fields. Returns: + * -1 (less than) + * 0 (equal) + * +1 (greater than) + */ +int +cmp2acls(void *a, void *b) +{ + aclent_t *x = (aclent_t *)a; + aclent_t *y = (aclent_t *)b; + + /* Compare types */ + if (x->a_type < y->a_type) + return (-1); + if (x->a_type > y->a_type) + return (1); + /* Equal types; compare id's */ + if (x->a_id < y->a_id) + return (-1); + if (x->a_id > y->a_id) + return (1); + /* Equal ids; compare perms */ + if (x->a_perm < y->a_perm) + return (-1); + if (x->a_perm > y->a_perm) + return (1); + /* Totally equal */ + return (0); +} + +static int +cacl_malloc(void **ptr, size_t size) +{ + *ptr = kmem_zalloc(size, KM_SLEEP); + return (0); +} + + +#if !defined(_KERNEL) +acl_t * +acl_alloc(enum acl_type type) +{ + acl_t *aclp; + + if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) + return (NULL); + + aclp->acl_aclp = NULL; + aclp->acl_cnt = 0; + + switch (type) { + case ACE_T: + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + break; + case ACLENT_T: + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + break; + default: + acl_free(aclp); + aclp = NULL; + } + return 
(aclp); +} + +/* + * Free acl_t structure + */ +void +acl_free(acl_t *aclp) +{ + int acl_size; + + if (aclp == NULL) + return; + + if (aclp->acl_aclp) { + acl_size = aclp->acl_cnt * aclp->acl_entry_size; + cacl_free(aclp->acl_aclp, acl_size); + } + + cacl_free(aclp, sizeof (acl_t)); +} + +static uint32_t +access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) +{ + uint32_t access_mask = 0; + int acl_produce; + int synchronize_set = 0, write_owner_set = 0; + int delete_set = 0, write_attrs_set = 0; + int read_named_set = 0, write_named_set = 0; + + acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_WRITER_SET_DENY); + + if (isallow) { + synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; + write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; + delete_set = ACL_DELETE_SET_ALLOW; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + } else { + + synchronize_set = ACL_SYNCHRONIZE_SET_DENY; + write_owner_set = ACL_WRITE_OWNER_SET_DENY; + delete_set = ACL_DELETE_SET_DENY; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_DENY; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_DENY; + else + /* + * If the entity is not the owner and does not + * have write permissions ACE_WRITE_ATTRIBUTES will + * always go in the DENY ACE. 
+ */ + access_mask |= ACE_WRITE_ATTRIBUTES; + } + + if (acl_produce & synchronize_set) + access_mask |= ACE_SYNCHRONIZE; + if (acl_produce & write_owner_set) + access_mask |= ACE_WRITE_OWNER; + if (acl_produce & delete_set) + access_mask |= ACE_DELETE; + if (acl_produce & write_attrs_set) + access_mask |= ACE_WRITE_ATTRIBUTES; + if (acl_produce & read_named_set) + access_mask |= ACE_READ_NAMED_ATTRS; + if (acl_produce & write_named_set) + access_mask |= ACE_WRITE_NAMED_ATTRS; + + return (access_mask); +} + +/* + * Given a mode_t (only the S_I?OTH bits are examined), build the access_mask + * used by nfsace, assuming aclent_t -> nfsace semantics. + */ +static uint32_t +mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) +{ + uint32_t access = 0; + int haswriteperm = 0; + int hasreadperm = 0; + + if (isallow) { + haswriteperm = (mode & S_IWOTH); + hasreadperm = (mode & S_IROTH); + } else { + haswriteperm = !(mode & S_IWOTH); + hasreadperm = !(mode & S_IROTH); + } + + /* + * The following call takes care of correctly setting the following + * mask bits in the access_mask: + * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, + * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS + */ + access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); + + if (isallow) { + access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; + if (isowner) + access |= ACE_WRITE_ACL; + } else { + if (! isowner) + access |= ACE_WRITE_ACL; + } + + /* read */ + if (mode & S_IROTH) { + access |= ACE_READ_DATA; + } + /* write */ + if (mode & S_IWOTH) { + access |= ACE_WRITE_DATA | + ACE_APPEND_DATA; + if (isdir) + access |= ACE_DELETE_CHILD; + } + /* exec */ + if (mode & S_IXOTH) { + access |= ACE_EXECUTE; + } + + return (access); +} + +/* + * Given an nfsace (presumably an ALLOW entry), make a + * corresponding DENY entry at the address given. 
+ */ +static void +ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) +{ + (void) memcpy(deny, allow, sizeof (ace_t)); + + deny->a_who = allow->a_who; + + deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; + if (isdir) + deny->a_access_mask ^= ACE_DELETE_CHILD; + + deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | + ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | + ACE_WRITE_NAMED_ATTRS); + deny->a_access_mask |= access_mask_set((allow->a_access_mask & + ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, + B_FALSE); +} +/* + * Make an initial pass over an array of aclent_t's. Gather + * information such as an ACL_MASK (if any), number of users, + * number of groups, and whether the array needs to be sorted. + */ +static int +ln_aent_preprocess(aclent_t *aclent, int n, + int *hasmask, mode_t *mask, + int *numuser, int *numgroup, int *needsort) +{ + int error = 0; + int i; + int curtype = 0; + + *hasmask = 0; + *mask = 07; + *needsort = 0; + *numuser = 0; + *numgroup = 0; + + for (i = 0; i < n; i++) { + if (aclent[i].a_type < curtype) + *needsort = 1; + else if (aclent[i].a_type > curtype) + curtype = aclent[i].a_type; + if (aclent[i].a_type & USER) + (*numuser)++; + if (aclent[i].a_type & (GROUP | GROUP_OBJ)) + (*numgroup)++; + if (aclent[i].a_type & CLASS_OBJ) { + if (*hasmask) { + error = EINVAL; + goto out; + } else { + *hasmask = 1; + *mask = aclent[i].a_perm; + } + } + } + + if ((! *hasmask) && (*numuser + *numgroup > 1)) { + error = EINVAL; + goto out; + } + +out: + return (error); +} + +/* + * Convert an array of aclent_t into an array of nfsace entries, + * following POSIX draft -> nfsv4 conversion semantics as outlined in + * the IETF draft. 
+ */ +static int +ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) +{ + int error = 0; + mode_t mask; + int numuser, numgroup, needsort; + int resultsize = 0; + int i, groupi = 0, skip; + ace_t *acep, *result = NULL; + int hasmask; + + error = ln_aent_preprocess(aclent, n, &hasmask, &mask, + &numuser, &numgroup, &needsort); + if (error != 0) + goto out; + + /* allow + deny for each aclent */ + resultsize = n * 2; + if (hasmask) { + /* + * stick extra deny on the group_obj and on each + * user|group for the mask (the group_obj was added + * into the count for numgroup) + */ + resultsize += numuser + numgroup; + /* ... and don't count the mask itself */ + resultsize -= 2; + } + + /* sort the source if necessary */ + if (needsort) + ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); + + if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) + goto out; + + acep = result; + + for (i = 0; i < n; i++) { + /* + * don't process CLASS_OBJ (mask); mask was grabbed in + * ln_aent_preprocess() + */ + if (aclent[i].a_type & CLASS_OBJ) + continue; + + /* If we need an ACL_MASK emulator, prepend it now */ + if ((hasmask) && + (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { + acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + acep->a_flags = 0; + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= + (ACE_IDENTIFIER_GROUP|ACE_GROUP); + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + } else { + acep->a_who = aclent[i].a_id; + acep->a_flags |= ACE_IDENTIFIER_GROUP; + } + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + /* + * Set the access mask for the prepended deny + * ace. To do this, we invert the mask (found + * in ln_aent_preprocess()) then convert it to an + * DENY ace access_mask. 
+ */ + acep->a_access_mask = mode_to_ace_access((mask ^ 07), + isdir, 0, 0); + acep += 1; + } + + /* handle a_perm -> access_mask */ + acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, + isdir, aclent[i].a_type & USER_OBJ, 1); + + /* emulate a default aclent */ + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + + /* + * handle a_perm and a_id + * + * this must be done last, since it involves the + * corresponding deny aces, which are handled + * differently for each different a_type. + */ + if (aclent[i].a_type & USER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_OWNER; + ace_make_deny(acep, acep + 1, isdir, B_TRUE); + acep += 2; + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_GROUP; + } else { + acep->a_who = aclent[i].a_id; + } + acep->a_flags |= ACE_IDENTIFIER_GROUP; + /* + * Set the corresponding deny for the group ace. + * + * The deny aces go after all of the groups, unlike + * everything else, where they immediately follow + * the allow ace. + * + * We calculate "skip", the number of slots to + * skip ahead for the deny ace, here. + * + * The pattern is: + * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 + * thus, skip is + * (2 * numgroup) - 1 - groupi + * (2 * numgroup) to account for MD + A + * - 1 to account for the fact that we're on the + * access (A), not the mask (MD) + * - groupi to account for the fact that we have + * passed up groupi number of MD's. + */ + skip = (2 * numgroup) - 1 - groupi; + ace_make_deny(acep, acep + skip, isdir, B_FALSE); + /* + * If we just did the last group, skip acep past + * all of the denies; else, just move ahead one. 
+ */ + if (++groupi >= numgroup) + acep += numgroup + 1; + else + acep += 1; + } else if (aclent[i].a_type & OTHER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_EVERYONE; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else { + error = EINVAL; + goto out; + } + } + + *acepp = result; + *rescount = resultsize; + +out: + if (error != 0) { + if ((result != NULL) && (resultsize > 0)) { + cacl_free(result, resultsize * sizeof (ace_t)); + } + } + + return (error); +} + +static int +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, + ace_t **retacep, int *retacecnt) +{ + ace_t *acep; + ace_t *dfacep; + int acecnt = 0; + int dfacecnt = 0; + int dfaclstart = 0; + int dfaclcnt = 0; + aclent_t *aclp; + int i; + int error; + int acesz, dfacesz; + + ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); + + for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { + if (aclp->a_type & ACL_DEFAULT) + break; + } + + if (i < aclcnt) { + dfaclstart = i; + dfaclcnt = aclcnt - i; + } + + if (dfaclcnt && !isdir) { + return (EINVAL); + } + + error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); + if (error) + return (error); + + if (dfaclcnt) { + error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, + &dfacep, &dfacecnt, isdir); + if (error) { + if (acep) { + cacl_free(acep, acecnt * sizeof (ace_t)); + } + return (error); + } + } + + if (dfacecnt != 0) { + acesz = sizeof (ace_t) * acecnt; + dfacesz = sizeof (ace_t) * dfacecnt; + acep = cacl_realloc(acep, acesz, acesz + dfacesz); + if (acep == NULL) + return (ENOMEM); + if (dfaclcnt) { + (void) memcpy(acep + acecnt, dfacep, dfacesz); + } + } + if (dfaclcnt) + cacl_free(dfacep, dfacecnt * sizeof (ace_t)); + + *retacecnt = acecnt + dfacecnt; + *retacep = acep; + return (0); +} + +static int +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + int error = 0; + o_mode_t mode = 0; + uint32_t bits, wantbits; + + /* read */ + if (mask & ACE_READ_DATA) + mode |= 
S_IROTH; + + /* write */ + wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); + if (isdir) + wantbits |= ACE_DELETE_CHILD; + bits = mask & wantbits; + if (bits != 0) { + if (bits != wantbits) { + error = ENOTSUP; + goto out; + } + mode |= S_IWOTH; + } + + /* exec */ + if (mask & ACE_EXECUTE) { + mode |= S_IXOTH; + } + + *modep = mode; + +out: + return (error); +} + +static void +acevals_init(acevals_t *vals, uid_t key) +{ + bzero(vals, sizeof (*vals)); + vals->allowed = ACE_MASK_UNDEFINED; + vals->denied = ACE_MASK_UNDEFINED; + vals->mask = ACE_MASK_UNDEFINED; + vals->key = key; +} + +static void +ace_list_init(ace_list_t *al, int dfacl_flag) +{ + acevals_init(&al->user_obj, 0); + acevals_init(&al->group_obj, 0); + acevals_init(&al->other_obj, 0); + al->numusers = 0; + al->numgroups = 0; + al->acl_mask = 0; + al->hasmask = 0; + al->state = ace_unused; + al->seen = 0; + al->dfacl_flag = dfacl_flag; +} + +/* + * Find or create an acevals holder for a given id and avl tree. + * + * Note that only one thread will ever touch these avl trees, so + * there is no need for locking. + */ +static acevals_t * +acevals_find(ace_t *ace, avl_tree_t *avl, int *num) +{ + acevals_t key, *rc; + avl_index_t where; + + key.key = ace->a_who; + rc = avl_find(avl, &key, &where); + if (rc != NULL) + return (rc); + + /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ + if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) + return (NULL); + + acevals_init(rc, ace->a_who); + avl_insert(avl, rc, where); + (*num)++; + + return (rc); +} + +static int +access_mask_check(ace_t *acep, int mask_bit, int isowner) +{ + int set_deny, err_deny; + int set_allow, err_allow; + int acl_consume; + int haswriteperm, hasreadperm; + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; + } else { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 
1 : 0; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0; + } + + acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | + ACL_DELETE_ERR_DENY | + ACL_WRITE_OWNER_ERR_DENY | + ACL_WRITE_OWNER_ERR_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_ERR_DENY | + ACL_WRITE_ATTRS_WRITER_SET_DENY | + ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | + ACL_WRITE_NAMED_WRITER_ERR_DENY | + ACL_READ_NAMED_READER_ERR_DENY); + + if (mask_bit == ACE_SYNCHRONIZE) { + set_deny = ACL_SYNCHRONIZE_SET_DENY; + err_deny = ACL_SYNCHRONIZE_ERR_DENY; + set_allow = ACL_SYNCHRONIZE_SET_ALLOW; + err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_OWNER) { + set_deny = ACL_WRITE_OWNER_SET_DENY; + err_deny = ACL_WRITE_OWNER_ERR_DENY; + set_allow = ACL_WRITE_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_OWNER_ERR_ALLOW; + } else if (mask_bit == ACE_DELETE) { + set_deny = ACL_DELETE_SET_DENY; + err_deny = ACL_DELETE_ERR_DENY; + set_allow = ACL_DELETE_SET_ALLOW; + err_allow = ACL_DELETE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { + if (isowner) { + set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; + } else if (haswriteperm) { + set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; + } else { + if ((acep->a_access_mask & mask_bit) && + (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { + return (ENOTSUP); + } + return (0); + } + } else if (mask_bit == ACE_READ_NAMED_ATTRS) { + if (!hasreadperm) + return (0); + + set_deny = ACL_READ_NAMED_READER_SET_DENY; + err_deny = ACL_READ_NAMED_READER_ERR_DENY; + set_allow = ACL_READ_NAMED_READER_SET_ALLOW; + err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { + if (!haswriteperm) + return (0); + + set_deny = 
ACL_WRITE_NAMED_WRITER_SET_DENY; + err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; + set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; + } else { + return (EINVAL); + } + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + if (acl_consume & set_deny) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_deny) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } else { + /* ACE_ACCESS_ALLOWED_ACE_TYPE */ + if (acl_consume & set_allow) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_allow) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } + return (0); +} + +static int +ace_to_aent_legal(ace_t *acep) +{ + int error = 0; + int isowner; + + /* only ALLOW or DENY */ + if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && + (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid flags */ + if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { + error = EINVAL; + goto out; + } + + /* some flags are illegal */ + if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | + ACE_FAILED_ACCESS_ACE_FLAG | + ACE_NO_PROPAGATE_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid masks */ + if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { + error = EINVAL; + goto out; + } + + if ((acep->a_flags & ACE_OWNER)) { + isowner = 1; + } else { + isowner = 0; + } + + error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_OWNER, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_DELETE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); + if (error) + goto out; + + error = access_mask_check(acep, 
ACE_WRITE_NAMED_ATTRS, isowner); + if (error) + goto out; + + /* more detailed checking of masks */ + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (! (acep->a_access_mask & ACE_APPEND_DATA))) { + error = ENOTSUP; + goto out; + } + if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && + (acep->a_access_mask & ACE_APPEND_DATA)) { + error = ENOTSUP; + goto out; + } + } + + /* ACL enforcement */ + if ((acep->a_access_mask & ACE_READ_ACL) && + (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + if (acep->a_access_mask & ACE_WRITE_ACL) { + if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && + (isowner)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && + (! isowner)) { + error = ENOTSUP; + goto out; + } + } + +out: + return (error); +} + +static int +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ + if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != + (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { + return (ENOTSUP); + } + + return (ace_mask_to_mode(mask, modep, isdir)); +} + +static int +acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error; + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + + if (isdir) + flips |= ACE_DELETE_CHILD; + if (vals->allowed != (vals->denied ^ flips)) { + error = ENOTSUP; + goto out; + } + if ((list->hasmask) && (list->acl_mask != vals->mask) && + (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { + error = ENOTSUP; + goto out; + } + error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); + if (error != 0) + goto out; + dest->a_type = vals->aent_type; + if (dest->a_type & (USER | GROUP)) { + dest->a_id = vals->key; + } else if (dest->a_type & USER_OBJ) { + dest->a_id = owner; 
+ } else if (dest->a_type & GROUP_OBJ) { + dest->a_id = group; + } else if (dest->a_type & OTHER_OBJ) { + dest->a_id = 0; + } else { + error = EINVAL; + goto out; + } + +out: + return (error); +} + + +static int +ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error = 0; + aclent_t *aent, *result = NULL; + acevals_t *vals; + int resultcount; + + if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != + (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { + error = ENOTSUP; + goto out; + } + if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { + error = ENOTSUP; + goto out; + } + + resultcount = 3 + list->numusers + list->numgroups; + /* + * This must be the same condition as below, when we add the CLASS_OBJ + * (aka ACL mask) + */ + if ((list->hasmask) || (! list->dfacl_flag)) + resultcount += 1; + + if (cacl_malloc((void **)&result, + resultcount * sizeof (aclent_t)) != 0) { + error = ENOMEM; + goto out; + } + aent = result; + + /* USER_OBJ */ + if (!(list->user_obj.aent_type & USER_OBJ)) { + error = EINVAL; + goto out; + } + + error = acevals_to_aent(&list->user_obj, aent, list, owner, group, + isdir); + + if (error != 0) + goto out; + ++aent; + /* USER */ + vals = NULL; + for (vals = avl_first(&list->user); vals != NULL; + vals = AVL_NEXT(&list->user, vals)) { + if (!(vals->aent_type & USER)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* GROUP_OBJ */ + if (!(list->group_obj.aent_type & GROUP_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->group_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + /* GROUP */ + vals = NULL; + for (vals = avl_first(&list->group); vals != NULL; + vals = AVL_NEXT(&list->group, vals)) { + if (!(vals->aent_type & GROUP)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, 
owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* + * CLASS_OBJ (aka ACL_MASK) + * + * An ACL_MASK is not fabricated if the ACL is a default ACL. + * This is to follow UFS's behavior. + */ + if ((list->hasmask) || (! list->dfacl_flag)) { + if (list->hasmask) { + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + if (isdir) + flips |= ACE_DELETE_CHILD; + error = ace_mask_to_mode(list->acl_mask ^ flips, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } else { + /* fabricate the ACL_MASK from the group permissions */ + error = ace_mask_to_mode(list->group_obj.allowed, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } + aent->a_id = 0; + aent->a_type = CLASS_OBJ | list->dfacl_flag; + ++aent; + } + /* OTHER_OBJ */ + if (!(list->other_obj.aent_type & OTHER_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->other_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + + *aclentp = result; + *aclcnt = resultcount; + +out: + if (error != 0) { + if (result != NULL) + cacl_free(result, resultcount * sizeof (aclent_t)); + } + + return (error); +} + + +/* + * free all data associated with an ace_list + */ +static void +ace_list_free(ace_list_t *al) +{ + acevals_t *node; + void *cookie; + + if (al == NULL) + return; + + cookie = NULL; + while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + cookie = NULL; + while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + + avl_destroy(&al->user); + avl_destroy(&al->group); + + /* free the container itself */ + cacl_free(al, sizeof (ace_list_t)); +} + +static int +acevals_compare(const void *va, const void *vb) +{ + const acevals_t *a = va, *b = vb; + + if (a->key == b->key) + return (0); + + if (a->key > b->key) + return (1); + + else + return (-1); +} + +/* + * Convert a list of ace_t entries to equivalent regular and default + * aclent_t lists. 
Return error (ENOTSUP) when conversion is not possible. + */ +static int +ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, + aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, + boolean_t isdir) +{ + int error = 0; + ace_t *acep; + uint32_t bits; + int i; + ace_list_t *normacl = NULL, *dfacl = NULL, *acl; + acevals_t *vals; + + *aclentp = NULL; + *aclcnt = 0; + *dfaclentp = NULL; + *dfaclcnt = 0; + + /* we need at least user_obj, group_obj, and other_obj */ + if (n < 6) { + error = ENOTSUP; + goto out; + } + if (ace == NULL) { + error = EINVAL; + goto out; + } + + error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + + ace_list_init(normacl, 0); + + error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + ace_list_init(dfacl, ACL_DEFAULT); + + /* process every ace_t... */ + for (i = 0; i < n; i++) { + acep = &ace[i]; + + /* rule out certain cases quickly */ + error = ace_to_aent_legal(acep); + if (error != 0) + goto out; + + /* + * Turn off these bits in order to not have to worry about + * them when doing the checks for compliments. 
+ */ + acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | + ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); + + /* see if this should be a regular or default acl */ + bits = acep->a_flags & + (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE); + if (bits != 0) { + /* all or nothing on these inherit bits */ + if (bits != (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + acl = dfacl; + } else { + acl = normacl; + } + + if ((acep->a_flags & ACE_OWNER)) { + if (acl->state > ace_user_obj) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user_obj; + acl->seen |= USER_OBJ; + vals = &acl->user_obj; + vals->aent_type = USER_OBJ | acl->dfacl_flag; + } else if ((acep->a_flags & ACE_EVERYONE)) { + acl->state = ace_other_obj; + acl->seen |= OTHER_OBJ; + vals = &acl->other_obj; + vals->aent_type = OTHER_OBJ | acl->dfacl_flag; + } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { + if (acl->state > ace_group) { + error = ENOTSUP; + goto out; + } + if ((acep->a_flags & ACE_GROUP)) { + acl->seen |= GROUP_OBJ; + vals = &acl->group_obj; + vals->aent_type = GROUP_OBJ | acl->dfacl_flag; + } else { + acl->seen |= GROUP; + vals = acevals_find(acep, &acl->group, + &acl->numgroups); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = GROUP | acl->dfacl_flag; + } + acl->state = ace_group; + } else { + if (acl->state > ace_user) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user; + acl->seen |= USER; + vals = acevals_find(acep, &acl->user, + &acl->numusers); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = USER | acl->dfacl_flag; + } + + if (!(acl->state > ace_unused)) { + error = EINVAL; + goto out; + } + + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + /* no more than one allowed per aclent_t */ + if (vals->allowed != ACE_MASK_UNDEFINED) { + error = ENOTSUP; + goto out; + } + 
vals->allowed = acep->a_access_mask; + } else { + /* + * it's a DENY; if there was a previous DENY, it + * must have been an ACL_MASK. + */ + if (vals->denied != ACE_MASK_UNDEFINED) { + /* ACL_MASK is for USER and GROUP only */ + if ((acl->state != ace_user) && + (acl->state != ace_group)) { + error = ENOTSUP; + goto out; + } + + if (! acl->hasmask) { + acl->hasmask = 1; + acl->acl_mask = vals->denied; + /* check for mismatched ACL_MASK emulations */ + } else if (acl->acl_mask != vals->denied) { + error = ENOTSUP; + goto out; + } + vals->mask = vals->denied; + } + vals->denied = acep->a_access_mask; + } + } + + /* done collating; produce the aclent_t lists */ + if (normacl->state != ace_unused) { + error = ace_list_to_aent(normacl, aclentp, aclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + if (dfacl->state != ace_unused) { + error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + +out: + if (normacl != NULL) + ace_list_free(normacl); + if (dfacl != NULL) + ace_list_free(dfacl); + + return (error); +} + +static int +convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, + uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) +{ + int error = 0; + aclent_t *aclentp, *dfaclentp; + int aclcnt, dfaclcnt; + int aclsz, dfaclsz; + + error = ln_ace_to_aent(acebufp, acecnt, owner, group, + &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); + + if (error) + return (error); + + + if (dfaclcnt != 0) { + /* + * Slap aclentp and dfaclentp into a single array. 
+ */ + aclsz = sizeof (aclent_t) * aclcnt; + dfaclsz = sizeof (aclent_t) * dfaclcnt; + aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); + if (aclentp != NULL) { + (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); + } else { + error = ENOMEM; + } + } + + if (aclentp) { + *retaclentp = aclentp; + *retaclcnt = aclcnt + dfaclcnt; + } + + if (dfaclentp) + cacl_free(dfaclentp, dfaclsz); + + return (error); +} + + +int +acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, + gid_t group) +{ + int aclcnt; + void *acldata; + int error; + + /* + * See if we need to translate + */ + if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || + (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACLENT_T)) + return (0); + + if (target_flavor == -1) { + error = EINVAL; + goto out; + } + + if (target_flavor == _ACL_ACE_ENABLED && + aclp->acl_type == ACLENT_T) { + error = convert_aent_to_ace(aclp->acl_aclp, + aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); + if (error) + goto out; + + } else if (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACE_T) { + error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, + isdir, owner, group, (aclent_t **)&acldata, &aclcnt); + if (error) + goto out; + } else { + error = ENOTSUP; + goto out; + } + + /* + * replace old acl with newly translated acl + */ + cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); + aclp->acl_aclp = acldata; + aclp->acl_cnt = aclcnt; + if (target_flavor == _ACL_ACE_ENABLED) { + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + } else { + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + } + return (0); + +out: + +#if !defined(_KERNEL) + errno = error; + return (-1); +#else + return (error); +#endif +} +#endif /* !_KERNEL */ + +#define SET_ACE(acl, index, who, mask, type, flags) { \ + acl[0][index].a_who = (uint32_t)who; \ + acl[0][index].a_type = type; \ + acl[0][index].a_flags = flags; \ + 
acl[0][index++].a_access_mask = mask; \ +} + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + (void) isdir; /* will need this later */ + + masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + 
+int +acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) +{ + int index = 0; + int error; + trivial_acl_t masks; + + *count = 3; + acl_trivial_access_masks(mode, isdir, &masks); + + if (masks.allow0) + (*count)++; + if (masks.deny1) + (*count)++; + if (masks.deny2) + (*count)++; + + if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) + return (error); + + if (masks.allow0) { + SET_ACE(acl, index, -1, masks.allow0, + ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny1) { + SET_ACE(acl, index, -1, masks.deny1, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny2) { + SET_ACE(acl, index, -1, masks.deny2, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); + } + + SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_OWNER); + SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_IDENTIFIER_GROUP|ACE_GROUP); + SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_EVERYONE); + + return (0); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. 
+ */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. + */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permissions are never set by default + */ + if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + return (0); +} + +uint64_t +ace_walk(void *datap, uint64_t cookie, int aclcnt, uint16_t *flags, + uint16_t *type, uint32_t *mask) +{ + ace_t *acep = datap; + + if (cookie >= aclcnt) + return (0); + + *flags = acep[cookie].a_flags; + *type = acep[cookie].a_type; + *mask = acep[cookie++].a_access_mask; + + return (cookie); +} + +int +ace_trivial(ace_t *acep, int aclcnt) +{ + return (ace_trivial_common(acep, aclcnt, ace_walk)); +} diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c new file mode 100644 index 000000000000..d4a0e141cfda --- /dev/null +++ b/module/os/freebsd/spl/callb.c @@ -0,0 +1,372 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the 
"License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for delay() */ +#include /* For TASKQ_NAMELEN */ +#include + +#define CB_MAXNAME TASKQ_NAMELEN + +/* + * The callb mechanism provides generic event scheduling/echoing. + * A callb function is registered and called on behalf of the event. + */ +typedef struct callb { + struct callb *c_next; /* next in class or on freelist */ + kthread_id_t c_thread; /* ptr to caller's thread struct */ + char c_flag; /* info about the callb state */ + uchar_t c_class; /* this callb's class */ + kcondvar_t c_done_cv; /* signal callb completion */ + boolean_t (*c_func)(); /* cb function: returns true if ok */ + void *c_arg; /* arg to c_func */ + char c_name[CB_MAXNAME+1]; /* debug:max func name length */ +} callb_t; + +/* + * callb c_flag bitmap definitions + */ +#define CALLB_FREE 0x0 +#define CALLB_TAKEN 0x1 +#define CALLB_EXECUTING 0x2 + +/* + * Basic structure for a callb table. + * All callbs are organized into different class groups described + * by ct_class array. 
+ * The callbs within a class are single-linked and normally run by a + * serial execution. + */ +typedef struct callb_table { + kmutex_t ct_lock; /* protect all callb states */ + callb_t *ct_freelist; /* free callb structures */ + int ct_busy; /* != 0 prevents additions */ + kcondvar_t ct_busy_cv; /* to wait for not busy */ + int ct_ncallb; /* num of callbs allocated */ + callb_t *ct_first_cb[NCBCLASS]; /* ptr to 1st callb in a class */ +} callb_table_t; + +int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC; + +static callb_id_t callb_add_common(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); + +static callb_table_t callb_table; /* system level callback table */ +static callb_table_t *ct = &callb_table; +static kmutex_t callb_safe_mutex; +callb_cpr_t callb_cprinfo_safe = { + &callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} }; + +/* + * Init all callb tables in the system. + */ +void +callb_init(void *dummy __unused) +{ + callb_table.ct_busy = 0; /* mark table open for additions */ + mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +callb_fini(void *dummy __unused) +{ + callb_t *cp; + int i; + + mutex_enter(&ct->ct_lock); + for (i = 0; i < 16; i++) { + while ((cp = ct->ct_freelist) != NULL) { + ct->ct_freelist = cp->c_next; + ct->ct_ncallb--; + kmem_free(cp, sizeof (callb_t)); + } + if (ct->ct_ncallb == 0) + break; + /* Not all callbacks finished, waiting for the rest. */ + mutex_exit(&ct->ct_lock); + tsleep(ct, 0, "callb", hz / 4); + mutex_enter(&ct->ct_lock); + } + if (ct->ct_ncallb > 0) + printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb); + mutex_exit(&ct->ct_lock); + mutex_destroy(&callb_safe_mutex); + mutex_destroy(&callb_table.ct_lock); +} + +/* + * callout_add() is called to register func() be called later. 
+ */ +static callb_id_t +callb_add_common(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + callb_t *cp; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + while (ct->ct_busy) + cv_wait(&ct->ct_busy_cv, &ct->ct_lock); + if ((cp = ct->ct_freelist) == NULL) { + ct->ct_ncallb++; + cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP); + } + ct->ct_freelist = cp->c_next; + cp->c_thread = t; + cp->c_func = func; + cp->c_arg = arg; + cp->c_class = (uchar_t)class; + cp->c_flag |= CALLB_TAKEN; +#ifdef DEBUG + if (strlen(name) > CB_MAXNAME) + cmn_err(CE_WARN, "callb_add: name of callback function '%s' " + "too long -- truncated to %d chars", + name, CB_MAXNAME); +#endif + (void) strncpy(cp->c_name, name, CB_MAXNAME); + cp->c_name[CB_MAXNAME] = '\0'; + + /* + * Insert the new callb at the head of its class list. + */ + cp->c_next = ct->ct_first_cb[class]; + ct->ct_first_cb[class] = cp; + + mutex_exit(&ct->ct_lock); + return ((callb_id_t)cp); +} + +/* + * The default function to add an entry to the callback table. Since + * it uses curthread as the thread identifier to store in the table, + * it should be used for the normal case of a thread which is calling + * to add ITSELF to the table. + */ +callb_id_t +callb_add(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name) +{ + return (callb_add_common(func, arg, class, name, curthread)); +} + +/* + * A special version of callb_add() above for use by threads which + * might be adding an entry to the table on behalf of some other + * thread (for example, one which is constructed but not yet running). + * In this version the thread id is an argument. 
+ */ +callb_id_t +callb_add_thread(boolean_t (*func)(void *arg, int code), + void *arg, int class, char *name, kthread_id_t t) +{ + return (callb_add_common(func, arg, class, name, t)); +} + +/* + * callout_delete() is called to remove an entry identified by id + * that was originally placed there by a call to callout_add(). + * return -1 if fail to delete a callb entry otherwise return 0. + */ +int +callb_delete(callb_id_t id) +{ + callb_t **pp; + callb_t *me = (callb_t *)id; + + mutex_enter(&ct->ct_lock); + + for (;;) { + pp = &ct->ct_first_cb[me->c_class]; + while (*pp != NULL && *pp != me) + pp = &(*pp)->c_next; + +#ifdef DEBUG + if (*pp != me) { + cmn_err(CE_WARN, "callb delete bogus entry 0x%p", + (void *)me); + mutex_exit(&ct->ct_lock); + return (-1); + } +#endif /* DEBUG */ + + /* + * It is not allowed to delete a callb in the middle of + * executing otherwise, the callb_execute() will be confused. + */ + if (!(me->c_flag & CALLB_EXECUTING)) + break; + + cv_wait(&me->c_done_cv, &ct->ct_lock); + } + /* relink the class list */ + *pp = me->c_next; + + /* clean up myself and return the free callb to the head of freelist */ + me->c_flag = CALLB_FREE; + me->c_next = ct->ct_freelist; + ct->ct_freelist = me; + + mutex_exit(&ct->ct_lock); + return (0); +} + +/* + * class: indicates to execute all callbs in the same class; + * code: optional argument for the callb functions. 
+ * return: = 0: success + * != 0: ptr to string supplied when callback was registered + */ +void * +callb_execute_class(int class, int code) +{ + callb_t *cp; + void *ret = NULL; + + ASSERT(class < NCBCLASS); + + mutex_enter(&ct->ct_lock); + + for (cp = ct->ct_first_cb[class]; + cp != NULL && ret == 0; cp = cp->c_next) { + while (cp->c_flag & CALLB_EXECUTING) + cv_wait(&cp->c_done_cv, &ct->ct_lock); + /* + * cont if the callb is deleted while we're sleeping + */ + if (cp->c_flag == CALLB_FREE) + continue; + cp->c_flag |= CALLB_EXECUTING; + +#ifdef CALLB_DEBUG + printf("callb_execute: name=%s func=%p arg=%p\n", + cp->c_name, (void *)cp->c_func, (void *)cp->c_arg); +#endif /* CALLB_DEBUG */ + + mutex_exit(&ct->ct_lock); + /* If callback function fails, pass back client's name */ + if (!(*cp->c_func)(cp->c_arg, code)) + ret = cp->c_name; + mutex_enter(&ct->ct_lock); + + cp->c_flag &= ~CALLB_EXECUTING; + cv_broadcast(&cp->c_done_cv); + } + mutex_exit(&ct->ct_lock); + return (ret); +} + +/* + * callers make sure no recursive entries to this func. + * dp->cc_lockp is registered by callb_add to protect callb_cpr_t structure. + * + * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we + * use a cv_timedwait() in case the kernel thread is blocked. + * + * Note that this is a generic callback handler for daemon CPR and + * should NOT be changed to accommodate any specific requirement in a daemon. + * Individual daemons that require changes to the handler shall write + * callback routines in their own daemon modules. + */ +boolean_t +callb_generic_cpr(void *arg, int code) +{ + callb_cpr_t *cp = (callb_cpr_t *)arg; + clock_t ret = 0; /* assume success */ + + mutex_enter(cp->cc_lockp); + + switch (code) { + case CB_CODE_CPR_CHKPT: + cp->cc_events |= CALLB_CPR_START; +#ifdef CPR_NOT_THREAD_SAFE + while (!(cp->cc_events & CALLB_CPR_SAFE)) + /* cv_timedwait() returns -1 if it times out. 
*/
+			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
+			    cp->cc_lockp, (callb_timeout_sec * hz),
+			    TR_CLOCK_TICK)) == -1)
+				break;
+#endif
+		break;
+
+	case CB_CODE_CPR_RESUME:
+		/* resuming: drop the start request and signal cc_stop_cv */
+		cp->cc_events &= ~CALLB_CPR_START;
+		cv_signal(&cp->cc_stop_cv);
+		break;
+	}
+	mutex_exit(cp->cc_lockp);
+	/* B_FALSE only when the timed wait above returned -1 */
+	return (ret != -1);
+}
+
+/*
+ * The generic callback function associated with kernel threads which
+ * are always considered safe.
+ *
+ * Both arguments are ignored; the result is always B_TRUE.
+ */
+/* ARGSUSED */
+boolean_t
+callb_generic_cpr_safe(void *arg, int code)
+{
+	return (B_TRUE);
+}
+/*
+ * Prevent additions to callback table.
+ *
+ * Sets ct_busy under ct_lock and asserts that it was not already set.
+ * NOTE(review): presumably the add path sleeps on ct_busy_cv while
+ * ct_busy is set -- that code is not visible here, confirm.
+ */
+void
+callb_lock_table(void)
+{
+	mutex_enter(&ct->ct_lock);
+	ASSERT(ct->ct_busy == 0);
+	ct->ct_busy = 1;
+	mutex_exit(&ct->ct_lock);
+}
+
+/*
+ * Allow additions to callback table.
+ *
+ * Clears ct_busy under ct_lock (asserting that it was set) and wakes
+ * every thread sleeping on ct_busy_cv.
+ */
+void
+callb_unlock_table(void)
+{
+	mutex_enter(&ct->ct_lock);
+	ASSERT(ct->ct_busy != 0);
+	ct->ct_busy = 0;
+	cv_broadcast(&ct->ct_busy_cv);
+	mutex_exit(&ct->ct_lock);
+}
+
+/* Register callb_init()/callb_fini() at SI_SUB_DRIVERS, SI_ORDER_FIRST. */
+SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
+SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
diff --git a/module/os/freebsd/spl/list.c b/module/os/freebsd/spl/list.c
new file mode 100644
index 000000000000..e8db13a5cf68
--- /dev/null
+++ b/module/os/freebsd/spl/list.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Generic doubly-linked list implementation
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Conversions between an object and the list_node_t embedded in it at
+ * byte offset (a)->list_offset, plus the emptiness test: a list is empty
+ * when its head node points back at itself.
+ */
+#define	list_d2l(a, obj)	\
+	((list_node_t *)(((char *)(obj)) + (a)->list_offset))
+#define	list_object(a, node)	\
+	((void *)(((char *)(node)) - (a)->list_offset))
+#define	list_empty(a) ((a)->list_head.list_next == &(a)->list_head)
+
+/*
+ * The statement macros below are wrapped in do { } while (0) so that each
+ * expands to exactly one statement and stays correct under an unbraced
+ * if/else.  "node" is evaluated more than once: pass a plain pointer
+ * expression without side effects.
+ */
+
+/* Link the node embedded in object directly after node. */
+#define	list_insert_after_node(list, node, object) do {	\
+	list_node_t *lnew = list_d2l(list, object);	\
+	lnew->list_prev = (node);			\
+	lnew->list_next = (node)->list_next;		\
+	(node)->list_next->list_prev = lnew;		\
+	(node)->list_next = lnew;			\
+} while (0)
+
+/* Link the node embedded in object directly before node. */
+#define	list_insert_before_node(list, node, object) do {	\
+	list_node_t *lnew = list_d2l(list, object);	\
+	lnew->list_next = (node);			\
+	lnew->list_prev = (node)->list_prev;		\
+	(node)->list_prev->list_next = lnew;		\
+	(node)->list_prev = lnew;			\
+} while (0)
+
+/* Unlink node from its neighbours and mark it inactive (NULL links). */
+#define	list_remove_node(node) do {				\
+	(node)->list_prev->list_next = (node)->list_next;	\
+	(node)->list_next->list_prev = (node)->list_prev;	\
+	(node)->list_next = (node)->list_prev = NULL;		\
+} while (0)
+
+/*
+ * Initialize an empty list.
+ *
+ * size:   size of the objects that will be linked on the list
+ * offset: byte offset of the list_node_t inside each object; the node must
+ *         fit inside the object (asserted below)
+ */
+void
+list_create(list_t *list, size_t size, size_t offset)
+{
+	ASSERT(list);
+	ASSERT(size > 0);
+	ASSERT(size >= offset + sizeof (list_node_t));
+
+	list->list_size = size;
+	list->list_offset = offset;
+	/* an empty list is a head node linked to itself */
+	list->list_head.list_next = list->list_head.list_prev =
+	    &list->list_head;
+}
+
+/*
+ * Tear down a list.  The list must already be empty (asserted); the head
+ * links are set to NULL.
+ */
+void
+list_destroy(list_t *list)
+{
+	list_node_t *node = &list->list_head;
+
+	ASSERT(list);
+	ASSERT(list->list_head.list_next == node);
+	ASSERT(list->list_head.list_prev == node);
+
+	node->list_next = node->list_prev = NULL;
+}
+
+void
+list_insert_after(list_t *list, void *object,
void *nobject)
+{
+	if (object == NULL) {
+		list_insert_head(list, nobject);
+	} else {
+		list_node_t *lold = list_d2l(list, object);
+		list_insert_after_node(list, lold, nobject);
+	}
+}
+
+/*
+ * Insert nobject immediately before object.  With a NULL object the new
+ * element is added at the tail of the list.
+ */
+void
+list_insert_before(list_t *list, void *object, void *nobject)
+{
+	list_node_t *anchor;
+
+	if (object == NULL) {
+		list_insert_tail(list, nobject);
+		return;
+	}
+
+	anchor = list_d2l(list, object);
+	list_insert_before_node(list, anchor, nobject);
+}
+
+/* Make object the first element: link it right after the head node. */
+void
+list_insert_head(list_t *list, void *object)
+{
+	list_insert_after_node(list, &list->list_head, object);
+}
+
+/* Make object the last element: link it right before the head node. */
+void
+list_insert_tail(list_t *list, void *object)
+{
+	list_insert_before_node(list, &list->list_head, object);
+}
+
+/*
+ * Unlink object from the list.  The list must not be empty and the
+ * object's node must be active (both asserted).
+ */
+void
+list_remove(list_t *list, void *object)
+{
+	list_node_t *victim = list_d2l(list, object);
+
+	ASSERT(!list_empty(list));
+	ASSERT(victim->list_next != NULL);
+	list_remove_node(victim);
+}
+
+/* Unlink and return the first object, or NULL if the list is empty. */
+void *
+list_remove_head(list_t *list)
+{
+	list_node_t *first = list->list_head.list_next;
+
+	if (first != &list->list_head) {
+		list_remove_node(first);
+		return (list_object(list, first));
+	}
+	return (NULL);
+}
+
+/* Unlink and return the last object, or NULL if the list is empty. */
+void *
+list_remove_tail(list_t *list)
+{
+	list_node_t *last = list->list_head.list_prev;
+
+	if (last != &list->list_head) {
+		list_remove_node(last);
+		return (list_object(list, last));
+	}
+	return (NULL);
+}
+
+/* Return the first object without unlinking it, or NULL if empty. */
+void *
+list_head(list_t *list)
+{
+	return (list_empty(list) ? NULL :
+	    list_object(list, list->list_head.list_next));
+}
+
+/* Return the last object without unlinking it, or NULL if empty. */
+void *
+list_tail(list_t *list)
+{
+	return (list_empty(list) ? NULL :
+	    list_object(list, list->list_head.list_prev));
+}
+
+/* Return the object following object, or NULL if object is the last one. */
+void *
+list_next(list_t *list, void *object)
+{
+	list_node_t *succ = list_d2l(list, object)->list_next;
+
+	if (succ == &list->list_head)
+		return (NULL);
+
+	return (list_object(list, succ));
+}
+
+void *
+list_prev(list_t *list, void *object)
+{
+	list_node_t *node = list_d2l(list, object);
+
+	if (node->list_prev != &list->list_head)
+		return (list_object(list,
node->list_prev));
+
+	return (NULL);
+}
+
+/*
+ * Append every node of src, in order, to the tail of dst and leave src
+ * empty.  Both lists must use the same object size and node offset
+ * (asserted).  Nothing happens when src is empty.
+ */
+void
+list_move_tail(list_t *dst, list_t *src)
+{
+	list_node_t *dstnode = &dst->list_head;
+	list_node_t *srcnode = &src->list_head;
+
+	ASSERT(dst->list_size == src->list_size);
+	ASSERT(dst->list_offset == src->list_offset);
+
+	if (list_empty(src))
+		return;
+
+	/* old dst tail now points forward at the first src node ... */
+	dstnode->list_prev->list_next = srcnode->list_next;
+	/* ... and the first src node points back at the old dst tail */
+	srcnode->list_next->list_prev = dstnode->list_prev;
+	/* the last src node becomes the new dst tail ... */
+	dstnode->list_prev = srcnode->list_prev;
+	/* ... and closes the ring back to the dst head */
+	srcnode->list_prev->list_next = dstnode;
+
+	/* empty src list */
+	srcnode->list_next = srcnode->list_prev = srcnode;
+}
+
+/*
+ * Put lnew in the position lold occupies and unlink lold.  lold must be
+ * active and lnew inactive (both asserted); afterwards lold's links are
+ * NULL.
+ */
+void
+list_link_replace(list_node_t *lold, list_node_t *lnew)
+{
+	ASSERT(list_link_active(lold));
+	ASSERT(!list_link_active(lnew));
+
+	lnew->list_next = lold->list_next;
+	lnew->list_prev = lold->list_prev;
+	lold->list_prev->list_next = lnew;
+	lold->list_next->list_prev = lnew;
+	lold->list_next = lold->list_prev = NULL;
+}
+
+/* Mark a node as not being on any list (both links NULL). */
+void
+list_link_init(list_node_t *link)
+{
+	link->list_next = NULL;
+	link->list_prev = NULL;
+}
+
+/* Non-zero when the node's next link is set, i.e. not NULL. */
+int
+list_link_active(list_node_t *link)
+{
+	return (link->list_next != NULL);
+}
+
+/* Function form of the list_empty() macro: non-zero for an empty list. */
+int
+list_is_empty(list_t *list)
+{
+	return (list_empty(list));
+}
diff --git a/module/os/freebsd/spl/sha224.h b/module/os/freebsd/spl/sha224.h
new file mode 100644
index 000000000000..0abd43068708
--- /dev/null
+++ b/module/os/freebsd/spl/sha224.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA224_H_ +#define _SHA224_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA224_BLOCK_LENGTH 64 +#define SHA224_DIGEST_LENGTH 28 +#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA224Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA224_BLOCK_LENGTH]; +} SHA224_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA224_Init +#define SHA224_Init _libmd_SHA224_Init +#endif +#ifndef SHA224_Update +#define SHA224_Update _libmd_SHA224_Update +#endif +#ifndef SHA224_Final +#define SHA224_Final _libmd_SHA224_Final +#endif +#ifndef SHA224_End +#define SHA224_End _libmd_SHA224_End +#endif +#ifndef SHA224_Fd +#define SHA224_Fd _libmd_SHA224_Fd +#endif +#ifndef SHA224_FdChunk +#define SHA224_FdChunk _libmd_SHA224_FdChunk +#endif +#ifndef SHA224_File +#define SHA224_File _libmd_SHA224_File +#endif +#ifndef SHA224_FileChunk +#define SHA224_FileChunk 
_libmd_SHA224_FileChunk +#endif +#ifndef SHA224_Data +#define SHA224_Data _libmd_SHA224_Data +#endif + +#ifndef SHA224_version +#define SHA224_version _libmd_SHA224_version +#endif + +void SHA224_Init(SHA224_CTX *); +void SHA224_Update(SHA224_CTX *, const void *, size_t); +void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)], + SHA224_CTX *); +#ifndef _KERNEL +char *SHA224_End(SHA224_CTX *, char *); +char *SHA224_Data(const void *, unsigned int, char *); +char *SHA224_Fd(int, char *); +char *SHA224_FdChunk(int, char *, off_t, off_t); +char *SHA224_File(const char *, char *); +char *SHA224_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA224_H_ */ diff --git a/module/os/freebsd/spl/sha256.h b/module/os/freebsd/spl/sha256.h new file mode 100644 index 000000000000..193c0c025120 --- /dev/null +++ b/module/os/freebsd/spl/sha256.h @@ -0,0 +1,99 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA256_BLOCK_LENGTH 64 +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA256Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA256_BLOCK_LENGTH]; +} SHA256_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA256_Init +#define SHA256_Init _libmd_SHA256_Init +#endif +#ifndef SHA256_Update +#define SHA256_Update _libmd_SHA256_Update +#endif +#ifndef SHA256_Final +#define SHA256_Final _libmd_SHA256_Final +#endif +#ifndef SHA256_End +#define SHA256_End _libmd_SHA256_End +#endif +#ifndef SHA256_Fd +#define SHA256_Fd _libmd_SHA256_Fd +#endif +#ifndef SHA256_FdChunk +#define SHA256_FdChunk _libmd_SHA256_FdChunk +#endif +#ifndef SHA256_File +#define SHA256_File _libmd_SHA256_File +#endif +#ifndef SHA256_FileChunk +#define SHA256_FileChunk _libmd_SHA256_FileChunk +#endif +#ifndef SHA256_Data +#define SHA256_Data _libmd_SHA256_Data +#endif + +#ifndef SHA256_Transform +#define SHA256_Transform _libmd_SHA256_Transform +#endif +#ifndef SHA256_version +#define SHA256_version _libmd_SHA256_version +#endif + +void SHA256_Init(SHA256_CTX *); +void SHA256_Update(SHA256_CTX *, const void *, size_t); +void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)], + SHA256_CTX *); +#ifndef _KERNEL +char 
*SHA256_End(SHA256_CTX *, char *); +char *SHA256_Data(const void *, unsigned int, char *); +char *SHA256_Fd(int, char *); +char *SHA256_FdChunk(int, char *, off_t, off_t); +char *SHA256_File(const char *, char *); +char *SHA256_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA256_H_ */ diff --git a/module/os/freebsd/spl/sha256c.c b/module/os/freebsd/spl/sha256c.c new file mode 100644 index 000000000000..241cf8c9ae76 --- /dev/null +++ b/module/os/freebsd/spl/sha256c.c @@ -0,0 +1,378 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+
+#ifdef _KERNEL
+#include
+#else
+#include
+#endif
+
+
+#include
+#include
+#include "sha224.h"
+#include "sha256.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint32_t into a vector of bytes */
+#define	be32enc_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint32_t */
+#define	be32dec_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Store len/4 words from src into dst as len bytes, each word written
+ * big-endian.  len is expected to be a multiple of 4; a remainder of
+ * fewer than 4 bytes is not written.
+ */
+static void
+be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len)
+{
+	const size_t nwords = len / 4;
+	size_t w;
+
+	for (w = 0; w < nwords; w++, dst += 4)
+		be32enc(dst, src[w]);
+}
+
+/*
+ * Load len/4 words into dst from the len big-endian bytes at src.
+ * len is expected to be a multiple of 4; a remainder of fewer than
+ * 4 bytes is not read.
+ */
+static void
+be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len)
+{
+	const size_t nwords = len / 4;
+	size_t w;
+
+	for (w = 0; w < nwords; w++, src += 4)
+		dst[w] = be32dec(src);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA256 round constants.
*/
+static const uint32_t K[64] = {
+	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+/*
+ * Elementary functions used by SHA256
+ *
+ * Ch picks, bit by bit, y where x is 1 and z where x is 0; Maj is the
+ * bitwise majority of its three inputs.  ROTR is a rotate right and is
+ * written for 32-bit operands (it shifts left by 32 - n).  S0/S1 and
+ * s0/s1 are the xor-of-rotations/shifts mixes used by the round function
+ * and by the message schedule respectively.
+ *
+ * None of these macros parenthesize their arguments, so pass plain
+ * variables or array elements only.
+ */
+#define	Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define	Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define	SHR(x, n)	(x >> n)
+#define	ROTR(x, n)	((x >> n) | (x << (32 - n)))
+#define	S0(x)		(ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define	S1(x)		(ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define	s0(x)		(ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3))
+#define	s1(x)		(ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10))
+
+/*
+ * SHA256 round function
+ *
+ * Expands to three statements that update d and h in place; k is the
+ * schedule word plus round constant for this round.
+ */
+#define	RND(a, b, c, d, e, f, g, h, k)	\
+	h += S1(e) + Ch(e, f, g) + k;	\
+	d += h;				\
+	h += S0(a) + Maj(a, b, c);
+
+/*
+ * Adjusted round function for rotating state
+ *
+ * Instead of shifting the eight working variables after every round, the
+ * (64 - i) % 8 style indices select which S[] slot plays each of a..h in
+ * round i.  ii is the offset of the current group of 16 rounds.
+ */
+#define	RNDr(S, W, i, ii)			\
+	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
+	    S[(66 - i) % 8], S[(67 - i) % 8],	\
+	    S[(68 - i) % 8], S[(69 - i) % 8],	\
+	    S[(70 - i) % 8], S[(71 - i) % 8],	\
+	    W[i + ii] + K[i + ii])
+
+/*
+ * Message schedule computation
+ *
+ * Fills W[i + ii + 16] from four earlier schedule words, i.e. it writes
+ * 16 entries ahead of the position it is called for.
+ */
+#define	MSCH(W, ii, i)				\
+	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] +	\
+		s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA256 block compression function.  The 256-bit state is transformed via
+ * the 512-bit input block to produce a new state.
+ */
+static void
+SHA256_Transform(uint32_t *state, const unsigned char block[64])
+{
+	uint32_t W[64];
+	uint32_t S[8];
+	int i;
+
+	/* 1. Prepare the first part of the message schedule W. */
+	be32dec_vect(W, block, 64);
+
+	/* 2. Initialize working variables. */
+	memcpy(S, state, 32);
+
+	/*
+	 * 3. Mix.  Each pass runs 16 rounds and then computes the 16
+	 * schedule words needed by the next pass.
+	 */
+	for (i = 0; i < 64; i += 16) {
+		RNDr(S, W, 0, i);
+		RNDr(S, W, 1, i);
+		RNDr(S, W, 2, i);
+		RNDr(S, W, 3, i);
+		RNDr(S, W, 4, i);
+		RNDr(S, W, 5, i);
+		RNDr(S, W, 6, i);
+		RNDr(S, W, 7, i);
+		RNDr(S, W, 8, i);
+		RNDr(S, W, 9, i);
+		RNDr(S, W, 10, i);
+		RNDr(S, W, 11, i);
+		RNDr(S, W, 12, i);
+		RNDr(S, W, 13, i);
+		RNDr(S, W, 14, i);
+		RNDr(S, W, 15, i);
+
+		/*
+		 * After the last 16 rounds no further schedule words are
+		 * needed; MSCH would also write W[64] and beyond.
+		 */
+		if (i == 48)
+			break;
+		MSCH(W, 0, i);
+		MSCH(W, 1, i);
+		MSCH(W, 2, i);
+		MSCH(W, 3, i);
+		MSCH(W, 4, i);
+		MSCH(W, 5, i);
+		MSCH(W, 6, i);
+		MSCH(W, 7, i);
+		MSCH(W, 8, i);
+		MSCH(W, 9, i);
+		MSCH(W, 10, i);
+		MSCH(W, 11, i);
+		MSCH(W, 12, i);
+		MSCH(W, 13, i);
+		MSCH(W, 14, i);
+		MSCH(W, 15, i);
+	}
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+}
+
+/*
+ * Padding source: a single 1 bit (0x80) followed by zero bytes.  It is
+ * only ever read (as a memcpy source), so it is const.
+ */
+static const unsigned char PAD[64] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Add padding and terminating bit-count. */
+static void
+SHA256_Pad(SHA256_CTX * ctx)
+{
+	size_t r;
+
+	/* Figure out how many bytes we have buffered. */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/* Pad to 56 mod 64, transforming if we finish a block en route. */
+	if (r < 56) {
+		/* Pad to 56 mod 64. */
+		memcpy(&ctx->buf[r], PAD, 56 - r);
+	} else {
+		/* Finish the current block and mix. */
+		memcpy(&ctx->buf[r], PAD, 64 - r);
+		SHA256_Transform(ctx->state, ctx->buf);
+
+		/* The start of the final block is all zeroes. */
+		memset(&ctx->buf[0], 0, 56);
+	}
+
+	/* Add the terminating bit-count. */
+	be64enc(&ctx->buf[56], ctx->count);
+
+	/* Mix in the final block.
*/
+	SHA256_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-256 initialization.  Begins a SHA-256 operation. */
+void
+SHA256_Init(SHA256_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x6A09E667;
+	ctx->state[1] = 0xBB67AE85;
+	ctx->state[2] = 0x3C6EF372;
+	ctx->state[3] = 0xA54FF53A;
+	ctx->state[4] = 0x510E527F;
+	ctx->state[5] = 0x9B05688C;
+	ctx->state[6] = 0x1F83D9AB;
+	ctx->state[7] = 0x5BE0CD19;
+}
+
+/*
+ * Add bytes into the hash
+ *
+ * ctx: context set up by SHA256_Init()
+ * in:  len bytes of input; only read
+ * len: number of input bytes, may be 0
+ *
+ * Input is collected in ctx->buf until a full 64-byte block is available;
+ * complete blocks beyond the first are hashed straight from the caller's
+ * buffer.
+ */
+void
+SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len)
+{
+	uint64_t bitlen;
+	uint32_t r;
+	const unsigned char *src = in;
+
+	/* Number of bytes left in the buffer from previous updates */
+	r = (ctx->count >> 3) & 0x3f;
+
+	/*
+	 * Convert the length into a number of bits.  Widen before shifting:
+	 * where size_t is 32 bits wide, "len << 3" would drop the top three
+	 * bits of the length before the result reaches the 64-bit counter.
+	 */
+	bitlen = (uint64_t)len << 3;
+
+	/* Update number of bits */
+	ctx->count += bitlen;
+
+	/* Handle the case where we don't need to perform any transforms */
+	if (len < 64 - r) {
+		memcpy(&ctx->buf[r], src, len);
+		return;
+	}
+
+	/* Finish the current block */
+	memcpy(&ctx->buf[r], src, 64 - r);
+	SHA256_Transform(ctx->state, ctx->buf);
+	src += 64 - r;
+	len -= 64 - r;
+
+	/* Perform complete blocks */
+	while (len >= 64) {
+		SHA256_Transform(ctx->state, src);
+		src += 64;
+		len -= 64;
+	}
+
+	/* Copy left over data into buffer */
+	memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-256 finalization.  Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA256_Pad(ctx);
+
+	/* Write the hash */
+	be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-224: ******************************************************* */
+/*
+ * the SHA224 and SHA256 transforms are identical
+ */
+
+/* SHA-224 initialization.  Begins a SHA-224 operation.
*/
+void
+SHA224_Init(SHA224_CTX * ctx)
+{
+	/* Magic initialization constants */
+	static const uint32_t sha224_iv[8] = {
+		0xC1059ED8, 0x367CD507, 0x3070DD17, 0xF70E5939,
+		0xFFC00B31, 0x68581511, 0x64f98FA7, 0xBEFA4FA4
+	};
+	int w;
+
+	/* Zero bits processed so far */
+	ctx->count = 0;
+
+	for (w = 0; w < 8; w++)
+		ctx->state[w] = sha224_iv[w];
+}
+
+/*
+ * Add bytes into the SHA-224 hash.  The context is handed to the SHA-256
+ * update routine unchanged.
+ */
+void
+SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len)
+{
+	SHA256_CTX *ctx256 = (SHA256_CTX *)ctx;
+
+	SHA256_Update(ctx256, in, len);
+}
+
+/*
+ * SHA-224 finalization.  Pads the input data with the SHA-256 padding
+ * routine, exports the first SHA224_DIGEST_LENGTH bytes of the state as
+ * the hash value, and clears the context state.
+ */
+void
+SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx)
+{
+	SHA256_CTX *ctx256 = (SHA256_CTX *)ctx;
+
+	/* Add padding */
+	SHA256_Pad(ctx256);
+
+	/* Write the hash */
+	be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#ifdef WEAK_REFS
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */
+#undef SHA256_Init
+__weak_reference(_libmd_SHA256_Init, SHA256_Init);
+#undef SHA256_Update
+__weak_reference(_libmd_SHA256_Update, SHA256_Update);
+#undef SHA256_Final
+__weak_reference(_libmd_SHA256_Final, SHA256_Final);
+#undef SHA256_Transform
+__weak_reference(_libmd_SHA256_Transform, SHA256_Transform);
+
+#undef SHA224_Init
+__weak_reference(_libmd_SHA224_Init, SHA224_Init);
+#undef SHA224_Update
+__weak_reference(_libmd_SHA224_Update, SHA224_Update);
+#undef SHA224_Final
+__weak_reference(_libmd_SHA224_Final, SHA224_Final);
+#endif
diff --git a/module/os/freebsd/spl/sha384.h b/module/os/freebsd/spl/sha384.h
new file mode 100644
index 000000000000..67250cee0313
--- /dev/null
+++ b/module/os/freebsd/spl/sha384.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright 2005 Colin Percival
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SHA384_H_ +#define _SHA384_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA384_BLOCK_LENGTH 128 +#define SHA384_DIGEST_LENGTH 48 +#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA384Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA384_BLOCK_LENGTH]; +} SHA384_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA384_Init +#define SHA384_Init _libmd_SHA384_Init +#endif +#ifndef SHA384_Update +#define SHA384_Update _libmd_SHA384_Update +#endif +#ifndef SHA384_Final +#define SHA384_Final _libmd_SHA384_Final +#endif +#ifndef SHA384_End +#define SHA384_End _libmd_SHA384_End +#endif +#ifndef SHA384_Fd +#define SHA384_Fd _libmd_SHA384_Fd +#endif +#ifndef SHA384_FdChunk +#define SHA384_FdChunk _libmd_SHA384_FdChunk +#endif +#ifndef SHA384_File +#define SHA384_File _libmd_SHA384_File +#endif +#ifndef SHA384_FileChunk +#define SHA384_FileChunk _libmd_SHA384_FileChunk +#endif +#ifndef SHA384_Data +#define SHA384_Data _libmd_SHA384_Data +#endif + +#ifndef SHA384_version +#define SHA384_version _libmd_SHA384_version +#endif + +void SHA384_Init(SHA384_CTX *); +void SHA384_Update(SHA384_CTX *, const void *, size_t); +void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)], + SHA384_CTX *); +#ifndef _KERNEL +char *SHA384_End(SHA384_CTX *, char *); +char *SHA384_Data(const void *, unsigned int, char *); +char *SHA384_Fd(int, char *); +char *SHA384_FdChunk(int, char *, off_t, off_t); +char *SHA384_File(const char *, char *); +char *SHA384_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA384_H_ */ diff --git a/module/os/freebsd/spl/sha512.h b/module/os/freebsd/spl/sha512.h new file mode 100644 index 000000000000..b6fb733ca54e --- /dev/null +++ b/module/os/freebsd/spl/sha512.h @@ -0,0 +1,101 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SHA512_H_ +#define _SHA512_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA512_BLOCK_LENGTH 128 +#define SHA512_DIGEST_LENGTH 64 +#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA512Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA512_BLOCK_LENGTH]; +} SHA512_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#if 0 +#ifndef SHA512_Init +#define SHA512_Init _libmd_SHA512_Init +#endif +#ifndef SHA512_Update +#define SHA512_Update _libmd_SHA512_Update +#endif +#ifndef SHA512_Final +#define SHA512_Final _libmd_SHA512_Final +#endif +#endif +#ifndef SHA512_End +#define SHA512_End _libmd_SHA512_End +#endif +#ifndef SHA512_Fd +#define SHA512_Fd _libmd_SHA512_Fd +#endif +#ifndef SHA512_FdChunk +#define SHA512_FdChunk _libmd_SHA512_FdChunk +#endif +#ifndef SHA512_File +#define SHA512_File _libmd_SHA512_File +#endif +#ifndef SHA512_FileChunk +#define SHA512_FileChunk _libmd_SHA512_FileChunk +#endif +#ifndef SHA512_Data +#define SHA512_Data _libmd_SHA512_Data +#endif + +#ifndef SHA512_Transform +#define SHA512_Transform _libmd_SHA512_Transform +#endif +#ifndef SHA512_version +#define SHA512_version _libmd_SHA512_version +#endif + +void SHA512_Init(SHA512_CTX *); +void SHA512_Update(SHA512_CTX *, const void *, size_t); +void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_End(SHA512_CTX *, char *); +char *SHA512_Data(const void *, unsigned int, char *); +char *SHA512_Fd(int, char *); +char *SHA512_FdChunk(int, char *, off_t, off_t); +char *SHA512_File(const char *, char *); +char *SHA512_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512_H_ */ diff --git a/module/os/freebsd/spl/sha512c.c b/module/os/freebsd/spl/sha512c.c new file mode 100644 index 000000000000..146f338f0ed4 --- /dev/null +++ b/module/os/freebsd/spl/sha512c.c @@ -0,0 +1,508 @@ +/* + 
* Copyright 2005 Colin Percival + * Copyright (c) 2015 Allan Jude + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+
+#ifdef _KERNEL
+#include
+#else
+#include
+#endif
+
+#include "sha512.h"
+#include "sha512t.h"
+#include "sha384.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/*
+ * Copy a vector of big-endian uint64_t into a vector of bytes.  On a
+ * big-endian host the in-memory words already have the wire layout, so a
+ * plain copy of len bytes is enough (len need not be a multiple of 8).
+ */
+#define	be64enc_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint64_t */
+#define	be64dec_vect(dst, src, len)	\
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length (len + 7) / 8 vector of (uint64_t) into a length len
+ * vector of (unsigned char) in big-endian form.  Assumes len is a
+ * multiple of 4.
+ *
+ * When len is not a multiple of 8 (SHA512_224_DIGEST_LENGTH is 28) the
+ * last output word is only half used: its most significant 32 bits are
+ * written, which is what the memcpy() variant above produces on a
+ * big-endian host.  Without that step the final 4 digest bytes would be
+ * left unwritten.
+ */
+static void
+be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 8; i++)
+		be64enc(dst + i * 8, src[i]);
+	/* be32enc() is declared next to be64enc(). */
+	if (len % 8 == 4)
+		be32enc(dst + i * 8, (uint32_t)(src[i] >> 32));
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/8 vector of (uint64_t).  Assumes len is a multiple of 8.
+ */
+static void
+be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 8; i++)
+		dst[i] = be64dec(src + i * 8);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA512 round constants.
*/ +static const uint64_t K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL +}; + +/* Elementary functions used by 
SHA512 */
+#define	Ch(x, y, z)	((x & (y ^ z)) ^ z)
+#define	Maj(x, y, z)	((x & (y | z)) | (y & z))
+#define	SHR(x, n)	(x >> n)
+#define	ROTR(x, n)	((x >> n) | (x << (64 - n)))
+#define	S0(x)		(ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39))
+#define	S1(x)		(ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41))
+#define	s0(x)		(ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7))
+#define	s1(x)		(ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6))
+
+/*
+ * SHA512 round function.  Updates d and h in place from the other six
+ * working variables and the round input k.  The arguments are expanded
+ * without parentheses, so a..h must be plain lvalues and k a plain sum,
+ * which is how RNDr invokes it.
+ */
+#define	RND(a, b, c, d, e, f, g, h, k)			\
+	h += S1(e) + Ch(e, f, g) + k;			\
+	d += h;						\
+	h += S0(a) + Maj(a, b, c);
+
+/*
+ * Adjusted round function for rotating state: instead of shuffling the
+ * eight working variables after every round, round i addresses them
+ * through indices shifted by i modulo 8.  ii is the base of the current
+ * group of 16 rounds, so the round uses W[i + ii] and K[i + ii].
+ */
+#define	RNDr(S, W, i, ii)			\
+	RND(S[(80 - i) % 8], S[(81 - i) % 8],	\
+	    S[(82 - i) % 8], S[(83 - i) % 8],	\
+	    S[(84 - i) % 8], S[(85 - i) % 8],	\
+	    S[(86 - i) % 8], S[(87 - i) % 8],	\
+	    W[i + ii] + K[i + ii])
+
+/*
+ * Message schedule computation: derives W[i + ii + 16] from four earlier
+ * schedule words.  Note the argument order (W, ii, i); the callers pass
+ * the position within the group first and the group base second.
+ */
+#define	MSCH(W, ii, i)				\
+	W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] +	\
+	    s0(W[i + ii + 1]) + W[i + ii]
+
+/*
+ * SHA512 block compression function.  The 512-bit state (eight 64-bit
+ * words) is transformed via the 1024-bit input block
+ * (SHA512_BLOCK_LENGTH bytes) to produce a new state.
+ */
+static void
+SHA512_Transform(uint64_t *state,
+    const unsigned char block[SHA512_BLOCK_LENGTH])
+{
+	uint64_t W[80];
+	uint64_t S[8];
+	int i;
+
+	/*
+	 * 1. Prepare the first part of the message schedule W: the block
+	 * is decoded from big-endian bytes into W[0..15].
+	 */
+	be64dec_vect(W, block, SHA512_BLOCK_LENGTH);
+
+	/* 2. Initialize working variables with a copy of the state. */
+	memcpy(S, state, SHA512_DIGEST_LENGTH);
+
+	/*
+	 * 3. Mix.  80 rounds are run in five groups of 16; after every
+	 * group but the last, the next 16 schedule words are computed.
+	 */
+	for (i = 0; i < 80; i += 16) {
+		RNDr(S, W, 0, i);
+		RNDr(S, W, 1, i);
+		RNDr(S, W, 2, i);
+		RNDr(S, W, 3, i);
+		RNDr(S, W, 4, i);
+		RNDr(S, W, 5, i);
+		RNDr(S, W, 6, i);
+		RNDr(S, W, 7, i);
+		RNDr(S, W, 8, i);
+		RNDr(S, W, 9, i);
+		RNDr(S, W, 10, i);
+		RNDr(S, W, 11, i);
+		RNDr(S, W, 12, i);
+		RNDr(S, W, 13, i);
+		RNDr(S, W, 14, i);
+		RNDr(S, W, 15, i);
+
+		if (i == 64)	/* last group: W[80..95] is never needed */
+			break;
+		MSCH(W, 0, i);
+		MSCH(W, 1, i);
+		MSCH(W, 2, i);
+		MSCH(W, 3, i);
+		MSCH(W, 4, i);
+		MSCH(W, 5, i);
+		MSCH(W, 6, i);
+		MSCH(W, 7, i);
+		MSCH(W, 8, i);
+		MSCH(W, 9, i);
+		MSCH(W, 10, i);
+		MSCH(W, 11, i);
+		MSCH(W, 12, i);
+		MSCH(W, 13, i);
+		MSCH(W, 14, i);
+		MSCH(W, 15, i);
+	}
+
+	/* 4. Mix local working variables into global state */
+	for (i = 0; i < 8; i++)
+		state[i] += S[i];
+}
+
+static unsigned char PAD[SHA512_BLOCK_LENGTH] = {
+	0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/*
+ * Add padding and terminating bit-count.  Consumes whatever is buffered
+ * in ctx->buf and leaves the final hash words in ctx->state.
+ */
+static void
+SHA512_Pad(SHA512_CTX * ctx)
+{
+	size_t r;
+
+	/*
+	 * Figure out how many bytes we have buffered: count[1] holds the
+	 * low 64 bits of the bit count, so this is the byte count mod 128.
+	 */
+	r = (ctx->count[1] >> 3) & 0x7f;
+
+	/* Pad to 112 mod 128, transforming if we finish a block en route. */
+	if (r < 112) {
+		/* Pad to 112 mod 128. */
+		memcpy(&ctx->buf[r], PAD, 112 - r);
+	} else {
+		/* Finish the current block and mix. */
+		memcpy(&ctx->buf[r], PAD, 128 - r);
+		SHA512_Transform(ctx->state, ctx->buf);
+
+		/* The start of the final block is all zeroes. */
+		memset(&ctx->buf[0], 0, 112);
+	}
+
+	/*
+	 * Add the terminating bit-count: the two count words are written
+	 * big-endian into the last 16 bytes of the block.
+	 */
+	be64enc_vect(&ctx->buf[112], ctx->count, 16);
+
+	/* Mix in the final block. */
+	SHA512_Transform(ctx->state, ctx->buf);
+}
+
+/* SHA-512 initialization.  Begins a SHA-512 operation. */
+void
+SHA512_Init(SHA512_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x6a09e667f3bcc908ULL;
+	ctx->state[1] = 0xbb67ae8584caa73bULL;
+	ctx->state[2] = 0x3c6ef372fe94f82bULL;
+	ctx->state[3] = 0xa54ff53a5f1d36f1ULL;
+	ctx->state[4] = 0x510e527fade682d1ULL;
+	ctx->state[5] = 0x9b05688c2b3e6c1fULL;
+	ctx->state[6] = 0x1f83d9abfb41bd6bULL;
+	ctx->state[7] = 0x5be0cd19137e2179ULL;
+}
+
+/*
+ * Add bytes into the hash.  Input is buffered in ctx->buf until a full
+ * SHA512_BLOCK_LENGTH block is available; whole blocks after the first
+ * are hashed straight from the caller's buffer.
+ */
+void
+SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+	uint64_t bitlen[2];
+	uint64_t r;
+	const unsigned char *src = in;
+
+	/* Number of bytes left in the buffer from previous updates */
+	r = (ctx->count[1] >> 3) & 0x7f;
+
+	/* Convert the length into a number of bits (128-bit, high:low) */
+	bitlen[1] = ((uint64_t)len) << 3;
+	bitlen[0] = ((uint64_t)len) >> 61;
+
+	/* Update number of bits, carrying into count[0] on wrap-around */
+	if ((ctx->count[1] += bitlen[1]) < bitlen[1])
+		ctx->count[0]++;
+	ctx->count[0] += bitlen[0];
+
+	/* Handle the case where we don't need to perform any transforms */
+	if (len < SHA512_BLOCK_LENGTH - r) {
+		memcpy(&ctx->buf[r], src, len);
+		return;
+	}
+
+	/* Finish the current block */
+	memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r);
+	SHA512_Transform(ctx->state, ctx->buf);
+	src += SHA512_BLOCK_LENGTH - r;
+	len -= SHA512_BLOCK_LENGTH - r;
+
+	/* Perform complete blocks */
+	while (len >= SHA512_BLOCK_LENGTH) {
+		SHA512_Transform(ctx->state, src);
+		src += SHA512_BLOCK_LENGTH;
+		len -= SHA512_BLOCK_LENGTH;
+	}
+
+	/* Copy left over data into buffer */
+	memcpy(ctx->buf, src, len);
+}
+
+/*
+ * SHA-512 finalization. Pads the input data, exports the hash value,
+ * and clears the context state.
+ */
+void
+SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad(ctx);
+
+	/* Write the hash: all eight state words, big-endian */
+	be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH);
+
+	/* Clear the context state so no hash material is left behind */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* SHA-512t: ******************************************************** */
+/*
+ * The SHA-512/224 and SHA-512/256 variants below reuse SHA512_Update and
+ * SHA512_Pad unchanged.  They differ from SHA-512 only in the initial
+ * state and in how many bytes of the final state are written out.
+ */
+void
+SHA512_224_Init(SHA512_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x8c3d37c819544da2ULL;
+	ctx->state[1] = 0x73e1996689dcd4d6ULL;
+	ctx->state[2] = 0x1dfab7ae32ff9c82ULL;
+	ctx->state[3] = 0x679dd514582f9fcfULL;
+	ctx->state[4] = 0x0f6d2b697bd44da8ULL;
+	ctx->state[5] = 0x77e36f7304c48942ULL;
+	ctx->state[6] = 0x3f9d85a86a1d36c8ULL;
+	ctx->state[7] = 0x1112e6ad91d692a1ULL;
+}
+
+void
+SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH],
+    SHA512_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad(ctx);
+
+	/*
+	 * Write the hash, truncated to SHA512_224_DIGEST_LENGTH bytes.
+	 * NOTE(review): that length is 28, which is not a multiple of 8;
+	 * confirm be64enc_vect() emits the trailing 4 bytes on
+	 * little-endian builds.
+	 */
+	be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+void
+SHA512_256_Init(SHA512_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0x22312194fc2bf72cULL;
+	ctx->state[1] = 0x9f555fa3c84c64c2ULL;
+	ctx->state[2] = 0x2393b86b6f53b151ULL;
+	ctx->state[3] = 0x963877195940eabdULL;
+	ctx->state[4] = 0x96283ee2a88effe3ULL;
+	ctx->state[5] = 0xbe5e1e2553863992ULL;
+	ctx->state[6] = 0x2b0199fc2c85b8aaULL;
+	ctx->state[7] = 0x0eb72ddc81c52ca2ULL;
+}
+
+void
+SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA512_Update(ctx, in, len);
+}
+
+void
+SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH],
+    SHA512_CTX * ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad(ctx);
+
+	/* Write the hash, truncated to SHA512_256_DIGEST_LENGTH bytes */
+	be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+/* ** SHA-384: ******************************************************** */
+/*
+ * The SHA384 and SHA512 transforms are identical, so no separate SHA384
+ * transform is provided: SHA-384 reuses SHA512_Update and SHA512_Pad.
+ * The casts below assume SHA384_CTX has the same layout as SHA512_CTX
+ * (declared in sha384.h, not visible here -- TODO confirm).
+ */
+
+/* SHA-384 initialization.  Begins a SHA-384 operation. */
+void
+SHA384_Init(SHA384_CTX * ctx)
+{
+
+	/* Zero bits processed so far */
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/* Magic initialization constants */
+	ctx->state[0] = 0xcbbb9d5dc1059ed8ULL;
+	ctx->state[1] = 0x629a292a367cd507ULL;
+	ctx->state[2] = 0x9159015a3070dd17ULL;
+	ctx->state[3] = 0x152fecd8f70e5939ULL;
+	ctx->state[4] = 0x67332667ffc00b31ULL;
+	ctx->state[5] = 0x8eb44a8768581511ULL;
+	ctx->state[6] = 0xdb0c2e0d64f98fa7ULL;
+	ctx->state[7] = 0x47b5481dbefa4fa4ULL;
+}
+
+/* Add bytes into the SHA-384 hash */
+void
+SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len)
+{
+
+	SHA512_Update((SHA512_CTX *)ctx, in, len);
+}
+
+/*
+ * SHA-384 finalization.  Pads the input data, exports the hash value
+ * (the first SHA384_DIGEST_LENGTH bytes of the state), and clears the
+ * context state.
+ */
+void
+SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx)
+{
+
+	/* Add padding */
+	SHA512_Pad((SHA512_CTX *)ctx);
+
+	/* Write the hash */
+	be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH);
+
+	/* Clear the context state */
+	explicit_bzero(ctx, sizeof (*ctx));
+}
+
+#if 0
+/*
+ * When building libmd, provide weak references. Note: this is not
+ * activated in the context of compiling these sources for internal
+ * use in libcrypt.
+ */ +#undef SHA512_Init +__weak_reference(_libmd_SHA512_Init, SHA512_Init); +#undef SHA512_Update +__weak_reference(_libmd_SHA512_Update, SHA512_Update); +#undef SHA512_Final +__weak_reference(_libmd_SHA512_Final, SHA512_Final); +#undef SHA512_Transform +__weak_reference(_libmd_SHA512_Transform, SHA512_Transform); + +#undef SHA512_224_Init +__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init); +#undef SHA512_224_Update +__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update); +#undef SHA512_224_Final +__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final); + +#undef SHA512_256_Init +__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init); +#undef SHA512_256_Update +__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update); +#undef SHA512_256_Final +__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final); + +#undef SHA384_Init +__weak_reference(_libmd_SHA384_Init, SHA384_Init); +#undef SHA384_Update +__weak_reference(_libmd_SHA384_Update, SHA384_Update); +#undef SHA384_Final +__weak_reference(_libmd_SHA384_Final, SHA384_Final); +#endif diff --git a/module/os/freebsd/spl/sha512t.h b/module/os/freebsd/spl/sha512t.h new file mode 100644 index 000000000000..703867fc0288 --- /dev/null +++ b/module/os/freebsd/spl/sha512t.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 Allan Jude + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA512T_H_ +#define _SHA512T_H_ + +#include "sha512.h" + +#ifndef _KERNEL +#include +#endif + +#define SHA512_224_DIGEST_LENGTH 28 +#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1) +#define SHA512_256_DIGEST_LENGTH 32 +#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1) + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA512_224_Init +#define SHA512_224_Init _libmd_SHA512_224_Init +#endif +#ifndef SHA512_224_Update +#define SHA512_224_Update _libmd_SHA512_224_Update +#endif +#ifndef SHA512_224_Final +#define SHA512_224_Final _libmd_SHA512_224_Final +#endif +#ifndef SHA512_224_End +#define SHA512_224_End _libmd_SHA512_224_End +#endif +#ifndef SHA512_224_Fd +#define SHA512_224_Fd _libmd_SHA512_224_Fd +#endif +#ifndef SHA512_224_FdChunk +#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk +#endif +#ifndef SHA512_224_File +#define SHA512_224_File _libmd_SHA512_224_File +#endif +#ifndef SHA512_224_FileChunk +#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk +#endif +#ifndef SHA512_224_Data +#define SHA512_224_Data _libmd_SHA512_224_Data +#endif + +#ifndef SHA512_224_Transform 
+#define SHA512_224_Transform _libmd_SHA512_224_Transform +#endif +#ifndef SHA512_224_version +#define SHA512_224_version _libmd_SHA512_224_version +#endif + +#ifndef SHA512_256_Init +#define SHA512_256_Init _libmd_SHA512_256_Init +#endif +#ifndef SHA512_256_Update +#define SHA512_256_Update _libmd_SHA512_256_Update +#endif +#ifndef SHA512_256_Final +#define SHA512_256_Final _libmd_SHA512_256_Final +#endif +#ifndef SHA512_256_End +#define SHA512_256_End _libmd_SHA512_256_End +#endif +#ifndef SHA512_256_Fd +#define SHA512_256_Fd _libmd_SHA512_256_Fd +#endif +#ifndef SHA512_256_FdChunk +#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk +#endif +#ifndef SHA512_256_File +#define SHA512_256_File _libmd_SHA512_256_File +#endif +#ifndef SHA512_256_FileChunk +#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk +#endif +#ifndef SHA512_256_Data +#define SHA512_256_Data _libmd_SHA512_256_Data +#endif + +#ifndef SHA512_256_Transform +#define SHA512_256_Transform _libmd_SHA512_256_Transform +#endif +#ifndef SHA512_256_version +#define SHA512_256_version _libmd_SHA512_256_version +#endif + +void SHA512_224_Init(SHA512_CTX *); +void SHA512_224_Update(SHA512_CTX *, const void *, size_t); +void SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_224_End(SHA512_CTX *, char *); +char *SHA512_224_Data(const void *, unsigned int, char *); +char *SHA512_224_Fd(int, char *); +char *SHA512_224_FdChunk(int, char *, off_t, off_t); +char *SHA512_224_File(const char *, char *); +char *SHA512_224_FileChunk(const char *, char *, off_t, off_t); +#endif +void SHA512_256_Init(SHA512_CTX *); +void SHA512_256_Update(SHA512_CTX *, const void *, size_t); +void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_256_End(SHA512_CTX *, char *); +char *SHA512_256_Data(const void *, unsigned int, char *); +char *SHA512_256_Fd(int, char *); +char 
*SHA512_256_FdChunk(int, char *, off_t, off_t); +char *SHA512_256_File(const char *, char *); +char *SHA512_256_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512T_H_ */ diff --git a/module/os/freebsd/spl/spl_acl.c b/module/os/freebsd/spl/spl_acl.c new file mode 100644 index 000000000000..bb4c30728d56 --- /dev/null +++ b/module/os/freebsd/spl/spl_acl.c @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2008, 2009 Edward Tomasz Napierała + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/* One row of a ZFS <-> FreeBSD bit translation table. */
+struct zfs2bsd {
+	uint32_t	zb_zfs;
+	int		zb_bsd;
+};
+
+/* Access-mask bits; the all-zero row terminates the table. */
+struct zfs2bsd perms[] = {
+	{ACE_READ_DATA, ACL_READ_DATA},
+	{ACE_WRITE_DATA, ACL_WRITE_DATA},
+	{ACE_EXECUTE, ACL_EXECUTE},
+	{ACE_APPEND_DATA, ACL_APPEND_DATA},
+	{ACE_DELETE_CHILD, ACL_DELETE_CHILD},
+	{ACE_DELETE, ACL_DELETE},
+	{ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES},
+	{ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES},
+	{ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS},
+	{ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS},
+	{ACE_READ_ACL, ACL_READ_ACL},
+	{ACE_WRITE_ACL, ACL_WRITE_ACL},
+	{ACE_WRITE_OWNER, ACL_WRITE_OWNER},
+	{ACE_SYNCHRONIZE, ACL_SYNCHRONIZE},
+	{0, 0}
+};
+
+/* Inheritance and audit flag bits; the all-zero row terminates the table. */
+struct zfs2bsd flags[] = {
+	{ACE_FILE_INHERIT_ACE, ACL_ENTRY_FILE_INHERIT},
+	{ACE_DIRECTORY_INHERIT_ACE, ACL_ENTRY_DIRECTORY_INHERIT},
+	{ACE_NO_PROPAGATE_INHERIT_ACE, ACL_ENTRY_NO_PROPAGATE_INHERIT},
+	{ACE_INHERIT_ONLY_ACE, ACL_ENTRY_INHERIT_ONLY},
+	{ACE_INHERITED_ACE, ACL_ENTRY_INHERITED},
+	{ACE_SUCCESSFUL_ACCESS_ACE_FLAG, ACL_ENTRY_SUCCESSFUL_ACCESS},
+	{ACE_FAILED_ACCESS_ACE_FLAG, ACL_ENTRY_FAILED_ACCESS},
+	{0, 0}
+};
+
+/*
+ * Translate a ZFS bit mask into the FreeBSD bits that the given table
+ * pairs with it.  The walk stops at the first row whose ZFS side is zero.
+ */
+static int
+_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table)
+{
+	int result = 0;
+
+	while (table->zb_zfs != 0) {
+		if (zfs & table->zb_zfs)
+			result |= table->zb_bsd;
+		table++;
+	}
+
+	return (result);
+}
+
+/*
+ * Translate a FreeBSD bit mask into the ZFS bits that the given table
+ * pairs with it.  The walk stops at the first row whose BSD side is zero.
+ */
+static uint32_t
+_zfs_from_bsd(int bsd, const struct zfs2bsd *table)
+{
+	uint32_t result = 0;
+
+	while (table->zb_bsd != 0) {
+		if (bsd & table->zb_bsd)
+			result |= table->zb_zfs;
+		table++;
+	}
+
+	return (result);
+}
+
+/*
+ * Fill *aclp from an array of nentries ZFS ACEs.
+ *
+ * Returns 0 on success, or EINVAL when the array is empty or holds more
+ * than ACL_MAX_ENTRIES entries.  Panics on an unknown ACE type.
+ */
+int
+acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries)
+{
+	int n;
+
+	if (nentries < 1) {
+		printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n");
+		return (EINVAL);
+	}
+
+	/*
+	 * Believed to happen only when a pool is moved from SunOS to
+	 * FreeBSD.
+	 */
+	if (nentries > ACL_MAX_ENTRIES) {
+		printf("acl_from_aces: ZFS ACL too big to fit "
+		    "into 'struct acl'; returning EINVAL.\n");
+		return (EINVAL);
+	}
+
+	bzero(aclp, sizeof (*aclp));
+	aclp->acl_maxcnt = ACL_MAX_ENTRIES;
+	aclp->acl_cnt = nentries;
+
+	for (n = 0; n < nentries; n++) {
+		struct acl_entry *dst = &aclp->acl_entry[n];
+		const ace_t *src = &aces[n];
+
+		/* The first matching flag decides who the entry is for. */
+		if (src->a_flags & ACE_OWNER)
+			dst->ae_tag = ACL_USER_OBJ;
+		else if (src->a_flags & ACE_GROUP)
+			dst->ae_tag = ACL_GROUP_OBJ;
+		else if (src->a_flags & ACE_EVERYONE)
+			dst->ae_tag = ACL_EVERYONE;
+		else if (src->a_flags & ACE_IDENTIFIER_GROUP)
+			dst->ae_tag = ACL_GROUP;
+		else
+			dst->ae_tag = ACL_USER;
+
+		/* Only named users and groups carry an id. */
+		switch (dst->ae_tag) {
+		case ACL_USER:
+		case ACL_GROUP:
+			dst->ae_id = src->a_who;
+			break;
+		default:
+			dst->ae_id = ACL_UNDEFINED_ID;
+			break;
+		}
+
+		dst->ae_perm = _bsd_from_zfs(src->a_access_mask, perms);
+		dst->ae_flags = _bsd_from_zfs(src->a_flags, flags);
+
+		switch (src->a_type) {
+		case ACE_ACCESS_DENIED_ACE_TYPE:
+			dst->ae_entry_type = ACL_ENTRY_TYPE_DENY;
+			break;
+		case ACE_ACCESS_ALLOWED_ACE_TYPE:
+			dst->ae_entry_type = ACL_ENTRY_TYPE_ALLOW;
+			break;
+		case ACE_SYSTEM_ALARM_ACE_TYPE:
+			dst->ae_entry_type = ACL_ENTRY_TYPE_ALARM;
+			break;
+		case ACE_SYSTEM_AUDIT_ACE_TYPE:
+			dst->ae_entry_type = ACL_ENTRY_TYPE_AUDIT;
+			break;
+		default:
+			panic("acl_from_aces: a_type is 0x%x", src->a_type);
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Fill the aces array from *aclp and store the entry count in *nentries.
+ * The caller supplies room for aclp->acl_cnt entries.  Panics on an
+ * unknown entry type.
+ */
+void
+aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp)
+{
+	int n;
+
+	bzero(aces, sizeof (*aces) * aclp->acl_cnt);
+
+	*nentries = aclp->acl_cnt;
+
+	for (n = 0; n < aclp->acl_cnt; n++) {
+		const struct acl_entry *src = &aclp->acl_entry[n];
+		ace_t *dst = &aces[n];
+
+		dst->a_who = src->ae_id;
+
+		switch (src->ae_tag) {
+		case ACL_USER_OBJ:
+			dst->a_flags = ACE_OWNER;
+			break;
+		case ACL_GROUP_OBJ:
+			dst->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP);
+			break;
+		case ACL_GROUP:
+			dst->a_flags = ACE_IDENTIFIER_GROUP;
+			break;
+		case ACL_EVERYONE:
+			dst->a_flags = ACE_EVERYONE;
+			break;
+		default:	/* ACL_USER */
+			dst->a_flags = 0;
+			break;
+		}
+
+		dst->a_access_mask = _zfs_from_bsd(src->ae_perm, perms);
+		dst->a_flags |= _zfs_from_bsd(src->ae_flags, flags);
+
+		switch (src->ae_entry_type) {
+		case ACL_ENTRY_TYPE_DENY:
+			dst->a_type = ACE_ACCESS_DENIED_ACE_TYPE;
+			break;
+		case ACL_ENTRY_TYPE_ALLOW:
+			dst->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
+			break;
+		case ACL_ENTRY_TYPE_AUDIT:
+			dst->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE;
+			break;
+		case ACL_ENTRY_TYPE_ALARM:
+			dst->a_type = ACE_SYSTEM_ALARM_ACE_TYPE;
+			break;
+		default:
+			panic("aces_from_acl: ae_entry_type is 0x%x",
+			    src->ae_entry_type);
+		}
+	}
+}
diff --git a/module/os/freebsd/spl/spl_atomic.c b/module/os/freebsd/spl/spl_atomic.c
new file mode 100644
index 000000000000..e82fed847409
--- /dev/null
+++ b/module/os/freebsd/spl/spl_atomic.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+
+/*
+ * Every routine in this file emulates an atomic operation by doing a
+ * plain read-modify-write while holding the single lock atomic_mtx.
+ */
+#ifdef _KERNEL
+#include
+
+struct mtx atomic_mtx;
+MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF);
+#else
+#include
+
+#define	mtx_lock(lock)		pthread_mutex_lock(lock)
+#define	mtx_unlock(lock)	pthread_mutex_unlock(lock)
+
+static pthread_mutex_t atomic_mtx;
+
+/* Userland build: set the lock up from a constructor. */
+static __attribute__((constructor)) void
+atomic_init(void)
+{
+	pthread_mutex_init(&atomic_mtx, NULL);
+}
+#endif
+
+#if !defined(__LP64__) && !defined(__mips_n32) && \
+	!defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64)
+/* Add delta to *target. */
+void
+atomic_add_64(volatile uint64_t *target, int64_t delta)
+{
+
+	mtx_lock(&atomic_mtx);
+	*target = *target + delta;
+	mtx_unlock(&atomic_mtx);
+}
+
+/* Subtract one from *target. */
+void
+atomic_dec_64(volatile uint64_t *target)
+{
+
+	mtx_lock(&atomic_mtx);
+	*target = *target - 1;
+	mtx_unlock(&atomic_mtx);
+}
+#endif
+
+/* Add delta to *target and return the value that was stored. */
+uint64_t
+atomic_add_64_nv(volatile uint64_t *target, int64_t delta)
+{
+	uint64_t sum;
+
+	mtx_lock(&atomic_mtx);
+	sum = *target + delta;
+	*target = sum;
+	mtx_unlock(&atomic_mtx);
+	return (sum);
+}
+
+#if defined(__powerpc__) || defined(__arm__) || defined(__mips__)
+/* OR value into *target. */
+void
+atomic_or_8(volatile uint8_t *target, uint8_t value)
+{
+	mtx_lock(&atomic_mtx);
+	*target = *target | value;
+	mtx_unlock(&atomic_mtx);
+}
+#endif
+
+/* OR value into *target and return the value that was stored. */
+uint8_t
+atomic_or_8_nv(volatile uint8_t *target, uint8_t value)
+{
+	uint8_t merged;
+
+	mtx_lock(&atomic_mtx);
+	merged = *target | value;
+	*target = merged;
+	mtx_unlock(&atomic_mtx);
+	return (merged);
+}
+
+/*
+ * Store newval in *target if it currently equals cmp.  Either way, the
+ * value found in *target before the call is returned.
+ */
+uint64_t
+atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval)
+{
+	uint64_t seen;
+
+	mtx_lock(&atomic_mtx);
+	if ((seen = *target) == cmp)
+		*target = newval;
+	mtx_unlock(&atomic_mtx);
+	return (seen);
+}
+
+/* 32-bit counterpart of atomic_cas_64(). */
+uint32_t
+atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval)
+{
+	uint32_t seen;
+
+	mtx_lock(&atomic_mtx);
+	if ((seen = *target) == cmp)
+		*target = newval;
+	mtx_unlock(&atomic_mtx);
+	return (seen);
+}
+
+/* Intentionally empty. */
+void
+membar_producer(void)
+{
+	/* nothing */
+}
diff --git a/module/os/freebsd/spl/spl_cmn_err.c b/module/os/freebsd/spl/spl_cmn_err.c
new file mode 100644
index 000000000000..23566603f5fa
--- /dev/null
+++ b/module/os/freebsd/spl/spl_cmn_err.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2007 John Birrell . All rights reserved.
+ * Copyright 2012 Martin Matuska . All rights reserved.
+ */
+
+#include
+
+/*
+ * Report a message at severity ce.  CE_IGNORE produces nothing, CE_PANIC
+ * panics with the formatted text, an unknown severity panics as well,
+ * and every other level prints the severity text, the formatted message
+ * and a newline.
+ */
+void
+vcmn_err(int ce, const char *fmt, va_list adx)
+{
+	const char *tag = NULL;
+	char msg[256];
+
+	/* Messages at this level are dropped without any output. */
+	if (ce == CE_IGNORE)
+		return;
+
+	/* Pick the text that introduces the message. */
+	if (ce == CE_CONT)
+		tag = "Solaris(cont): ";
+	else if (ce == CE_NOTE)
+		tag = "Solaris: NOTICE: ";
+	else if (ce == CE_WARN)
+		tag = "Solaris: WARNING: ";
+	else if (ce == CE_PANIC)
+		tag = "Solaris(panic): ";
+	else
+		panic("Solaris: unknown severity level");
+
+	/* The panic text is cut to what fits in the local buffer. */
+	if (ce == CE_PANIC) {
+		vsnprintf(msg, sizeof (msg), fmt, adx);
+		panic("%s%s", tag, msg);
+	}
+
+	printf("%s", tag);
+	vprintf(fmt, adx);
+	printf("\n");
+}
+
+/* Variadic front end for vcmn_err(). */
+void
+cmn_err(int type, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	vcmn_err(type, fmt, args);
+	va_end(args);
+}
diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c
new file mode 100644
index 000000000000..e7b2ff823094
--- /dev/null
+++ b/module/os/freebsd/spl/spl_dtrace.c
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2014 The FreeBSD Project.
+ * All rights reserved.
+ *
+ * This software was developed by Steven Hartland.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +/* CSTYLED */ +SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/module/os/freebsd/spl/spl_kmem.c b/module/os/freebsd/spl/spl_kmem.c new file mode 100644 index 000000000000..af3747c271fd --- /dev/null +++ b/module/os/freebsd/spl/spl_kmem.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#include
+#include
+#include
+#include
+
+#ifdef KMEM_DEBUG
+#include
+#include
+#endif
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris");
+#else
+/* Outside the kernel, map the three/two-argument forms onto libc. */
+#define	malloc(size, type, flags)	malloc(size)
+#define	free(addr, type)		free(addr)
+#endif
+
+#ifdef KMEM_DEBUG
+/* Header placed in front of every allocation when KMEM_DEBUG is defined. */
+struct kmem_item {
+	struct stack	stack;
+	LIST_ENTRY(kmem_item) next;
+};
+/* All live debug allocations, protected by kmem_items_mtx. */
+static LIST_HEAD(, kmem_item) kmem_items;
+static struct mtx kmem_items_mtx;
+MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF);
+#endif	/* KMEM_DEBUG */
+
+#include
+
+/*
+ * Allocate at least 16 bytes with malloc() under the M_SOLARIS type,
+ * passing kmflags through.  In a userland build a KM_SLEEP request is
+ * asserted to succeed.  With KMEM_DEBUG, a struct kmem_item holding the
+ * caller's stack is placed in front of the returned buffer and linked onto
+ * kmem_items.  Returns the usable buffer, or NULL if malloc() returned NULL.
+ */
+void *
+zfs_kmem_alloc(size_t size, int kmflags)
+{
+	void *p;
+#ifdef KMEM_DEBUG
+	struct kmem_item *i;
+
+	size += sizeof (struct kmem_item);
+#endif
+	p = malloc(MAX(size, 16), M_SOLARIS, kmflags);
+#ifndef _KERNEL
+	if (kmflags & KM_SLEEP)
+		assert(p != NULL);
+#endif
+#ifdef KMEM_DEBUG
+	if (p != NULL) {
+		i = p;
+		/* Hand the caller the bytes that follow the debug header. */
+		p = (uint8_t *)p + sizeof (struct kmem_item);
+		stack_save(&i->stack);
+		mtx_lock(&kmem_items_mtx);
+		LIST_INSERT_HEAD(&kmem_items, i, next);
+		mtx_unlock(&kmem_items_mtx);
+	}
+#endif
+	return (p);
+}
+
+/*
+ * Release a buffer obtained from zfs_kmem_alloc().  Without KMEM_DEBUG
+ * the size argument is not used.  With KMEM_DEBUG, freeing NULL is
+ * reported and ignored; otherwise the debug header is located, unlinked
+ * from kmem_items, and the memory is overwritten with 0xDC before it is
+ * freed.
+ */
+void
+zfs_kmem_free(void *buf, size_t size __unused)
+{
+#ifdef KMEM_DEBUG
+	if (buf == NULL) {
+		printf("%s: attempt to free NULL\n", __func__);
+		return;
+	}
+	struct kmem_item *i;
+
+	/* Step back to the debug header that precedes the user buffer. */
+	buf = (uint8_t *)buf - sizeof (struct kmem_item);
+	mtx_lock(&kmem_items_mtx);
+	LIST_FOREACH(i, &kmem_items, next) {
+		if (i == buf)
+			break;
+	}
+	ASSERT(i != NULL);
+	LIST_REMOVE(i, next);
+	mtx_unlock(&kmem_items_mtx);
+	memset(buf, 0xDC, MAX(size, 16));
+#endif
+	free(buf, M_SOLARIS);
+}
+
+/* Value reported by kmem_size(); computed once by kmem_size_init(). */
+static uint64_t kmem_size_val;
+
+/*
+ * SYSINIT hook: set kmem_size_val to the smaller of
+ * vm_cnt.v_page_count * PAGE_SIZE and vm_kmem_size.
+ */
+static void
+kmem_size_init(void *unused __unused)
+{
+
+	kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE;
+	if (kmem_size_val > vm_kmem_size)
+		kmem_size_val = vm_kmem_size;
+}
+SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL);
+
+/* Return the value computed by kmem_size_init(). */
+uint64_t
+kmem_size(void)
+{
+
+	return (kmem_size_val);
+}
+
+/*
+ * Adapter with the constructor signature passed to uma_zcreate(): forwards
+ * to the cache's kc_constructor with the cache's private pointer.
+ */
+static int
+kmem_std_constructor(void *mem, int size __unused, void *private, int flags)
+{
+	struct kmem_cache *cache = private;
+
+	return (cache->kc_constructor(mem, cache->kc_private, flags));
+}
+
+/*
+ * Adapter with the destructor signature passed to uma_zcreate(): forwards
+ * to the cache's kc_destructor with the cache's private pointer.
+ */
+static void
+kmem_std_destructor(void *mem, int size __unused, void *private)
+{
+	struct kmem_cache *cache = private;
+
+	cache->kc_destructor(mem, cache->kc_private);
+}
+
+/*
+ * Create an object cache.  vmp must be NULL and reclaim is not used.  In
+ * a kernel build without KMEM_DEBUG the cache is backed by a UMA zone
+ * (align is converted to the "align - 1" mask form, 0 staying 0);
+ * otherwise only the object size is recorded and objects come from
+ * kmem_alloc().
+ */
+kmem_cache_t *
+kmem_cache_create(char *name, size_t bufsize, size_t align,
+    int (*constructor)(void *, void *, int), void (*destructor)(void *, void *),
+    void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags)
+{
+	kmem_cache_t *cache;
+
+	ASSERT(vmp == NULL);
+
+	cache = kmem_alloc(sizeof (*cache), KM_SLEEP);
+	strlcpy(cache->kc_name, name, sizeof (cache->kc_name));
+	cache->kc_constructor = constructor;
+	cache->kc_destructor = destructor;
+	cache->kc_private = private;
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	/* Only install the adapters when the caller supplied callbacks. */
+	cache->kc_zone = uma_zcreate(cache->kc_name, bufsize,
+	    constructor != NULL ? kmem_std_constructor : NULL,
+	    destructor != NULL ? kmem_std_destructor : NULL,
+	    NULL, NULL, align > 0 ? align - 1 : 0, cflags);
+#else
+	cache->kc_size = bufsize;
+#endif
+
+	return (cache);
+}
+
+/* Destroy the backing UMA zone (when one exists) and free the cache. */
+void
+kmem_cache_destroy(kmem_cache_t *cache)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	uma_zdestroy(cache->kc_zone);
+#endif
+	kmem_free(cache, sizeof (*cache));
+}
+
+/*
+ * Allocate one object from the cache.  On the non-UMA path the
+ * constructor, if any, is run by hand on a successful allocation and its
+ * return value is not checked.
+ */
+void *
+kmem_cache_alloc(kmem_cache_t *cache, int flags)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	return (uma_zalloc_arg(cache->kc_zone, cache, flags));
+#else
+	void *p;
+
+	p = kmem_alloc(cache->kc_size, flags);
+	if (p != NULL && cache->kc_constructor != NULL)
+		kmem_std_constructor(p, cache->kc_size, cache, flags);
+	return (p);
+#endif
+}
+
+/*
+ * Return one object to the cache.  On the non-UMA path the destructor,
+ * if any, is run by hand before the memory is freed.
+ */
+void
+kmem_cache_free(kmem_cache_t *cache, void *buf)
+{
+#if defined(_KERNEL) && !defined(KMEM_DEBUG)
+	uma_zfree_arg(cache->kc_zone, buf, cache);
+#else
+	if (cache->kc_destructor != NULL)
+		kmem_std_destructor(buf, cache->kc_size, cache);
+	kmem_free(buf, cache->kc_size);
+#endif
+}
+
+/*
+ * Allow our caller to determine if there are running reaps.
+ *
+ * This call is very conservative and may return B_TRUE even when
+ * reaping activity isn't active. If it returns B_FALSE, then reaping
+ * activity is definitely inactive.
+ *
+ * This implementation always returns B_FALSE.
+ */
+boolean_t
+kmem_cache_reap_active(void)
+{
+
+	return (B_FALSE);
+}
+
+/*
+ * Reap (almost) everything soon.
+ *
+ * Note: this does not wait for the reap-tasks to complete. Caller
+ * should use kmem_cache_reap_active() (above) and/or moderation to
+ * avoid scheduling too many reap-tasks.
+ */
+#ifdef _KERNEL
+void
+kmem_cache_reap_soon(kmem_cache_t *cache)
+{
+#ifndef KMEM_DEBUG
+	/* The zone-drain entry point differs by FreeBSD version. */
+#if __FreeBSD_version >= 1300043
+	uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN);
+#else
+	zone_drain(cache->kc_zone);
+#endif
+#endif
+}
+
+/* Ask UMA to reclaim memory from all zones. */
+void
+kmem_reap(void)
+{
+#if __FreeBSD_version >= 1300043
+	uma_reclaim(UMA_RECLAIM_TRIM);
+#else
+	uma_reclaim();
+#endif
+}
+#else
+/* Userland build: there are no zones to reap. */
+void
+kmem_cache_reap_soon(kmem_cache_t *cache __unused)
+{
+}
+
+void
+kmem_reap(void)
+{
+}
+#endif
+
+/* Always reports 0. */
+int
+kmem_debugging(void)
+{
+	return (0);
+}
+
+/*
+ * Zeroed, non-sleeping allocation of n elements of s bytes each.
+ * Returns NULL when n * s does not fit in a size_t instead of silently
+ * allocating a truncated buffer.
+ */
+void *
+calloc(size_t n, size_t s)
+{
+	size_t total = n * s;
+
+	/* Detect wrap-around of the multiplication. */
+	if (s != 0 && total / s != n)
+		return (NULL);
+	return (kmem_zalloc(total, KM_NOSLEEP));
+}
+
+/*
+ * Format into a freshly allocated buffer sized exactly for the result
+ * (including the terminating NUL).  The allocation uses KM_SLEEP.
+ */
+char *
+kmem_vasprintf(const char *fmt, va_list adx)
+{
+	char *msg;
+	va_list adx2;
+	int size;
+
+	/* The argument list is consumed twice: once to size, once to format. */
+	va_copy(adx2, adx);
+	size = vsnprintf(NULL, 0, fmt, adx) + 1;
+	msg = kmem_alloc(size, KM_SLEEP);
+	(void) vsnprintf(msg, size, fmt, adx2);
+	va_end(adx2);
+
+	return (msg);
+}
+
+#include
+#include
+#ifdef KMEM_DEBUG
+#error "KMEM_DEBUG not currently supported"
+#endif
+
+/* Current item count of the cache's UMA zone, per uma_zone_get_cur(). */
+uint64_t
+spl_kmem_cache_inuse(kmem_cache_t *cache)
+{
+	return (uma_zone_get_cur(cache->kc_zone));
+}
+
+/* Item size recorded in the cache's UMA zone (uz_size). */
+uint64_t
+spl_kmem_cache_entry_size(kmem_cache_t *cache)
+{
+	return (cache->kc_zone->uz_size);
+}
+
+/*
+ * Register a move callback for cache defragmentation.
+ * XXX: Unimplemented but harmless to stub out for now.
+ */
+void
+spl_kmem_cache_set_move(kmem_cache_t *skc,
+    kmem_cbrc_t (move)(void *, void *, size_t, void *))
+{
+	/* Stub: only checks that a callback was supplied; skc is unused. */
+	ASSERT(move != NULL);
+}
+
+#ifdef KMEM_DEBUG
+/*
+ * SYSUNINIT hook: print every allocation still on kmem_items, with the
+ * stack saved at allocation time, or a message when the list is empty.
+ */
+void kmem_show(void *);
+void
+kmem_show(void *dummy __unused)
+{
+	struct kmem_item *i;
+
+	mtx_lock(&kmem_items_mtx);
+	if (LIST_EMPTY(&kmem_items))
+		printf("KMEM_DEBUG: No leaked elements.\n");
+	else {
+		printf("KMEM_DEBUG: Leaked elements:\n\n");
+		LIST_FOREACH(i, &kmem_items, next) {
+			printf("address=%p\n", i);
+			stack_print_ddb(&i->stack);
+			printf("\n");
+		}
+	}
+	mtx_unlock(&kmem_items_mtx);
+}
+
+SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL);
+#endif	/* KMEM_DEBUG */
diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c
new file mode 100644
index 000000000000..fda03a3d7881
--- /dev/null
+++ b/module/os/freebsd/spl/spl_kstat.c
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
+
+SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics");
+
+/* Record the raw-ops callbacks on the kstat; nothing is invoked here. */
+void
+__kstat_set_raw_ops(kstat_t *ksp,
+    int (*headers)(char *buf, size_t size),
+    int (*data)(char *buf, size_t size, void *data),
+    void *(*addr)(kstat_t *ksp, loff_t index))
+{
+	ksp->ks_raw_ops.headers = headers;
+	ksp->ks_raw_ops.data = data;
+	ksp->ks_raw_ops.addr = addr;
+}
+
+/* Default ks_update callback: reads succeed, writes fail with EACCES. */
+static int
+kstat_default_update(kstat_t *ksp, int rw)
+{
+	ASSERT(ksp != NULL);
+
+	if (rw == KSTAT_WRITE)
+		return (EACCES);
+
+	return (0);
+}
+
+/*
+ * Allocate a kstat and create its sysctl nodes.
+ *
+ * instance must be 0.  For KSTAT_TYPE_RAW, ks_ndata is taken as the data
+ * size in bytes; for the other types it is an element count.  Unless
+ * KSTAT_FLAG_VIRTUAL is set, a zeroed data area of ks_data_size bytes is
+ * allocated and owned by the kstat.
+ *
+ * Returns the new kstat, or NULL if the data area or any sysctl node
+ * could not be created; everything allocated so far is released first.
+ */
+kstat_t *
+__kstat_create(const char *module, int instance, const char *name,
+    const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags)
+{
+	struct sysctl_oid *root;
+	kstat_t *ksp;
+
+	KASSERT(instance == 0, ("instance=%d", instance));
+	if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
+		ASSERT(ks_ndata == 1);
+
+	/*
+	 * Allocate the main structure. We don't need to copy module/class/name
+	 * stuff in here, because it is only used for sysctl node creation
+	 * done in this function.
+	 */
+	ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO);
+
+	ksp->ks_crtime = gethrtime();
+	ksp->ks_snaptime = ksp->ks_crtime;
+	ksp->ks_instance = instance;
+	/* strlcpy() always NUL-terminates, unlike strncpy() on a full copy. */
+	strlcpy(ksp->ks_name, name, KSTAT_STRLEN);
+	strlcpy(ksp->ks_class, class, KSTAT_STRLEN);
+	ksp->ks_type = ks_type;
+	ksp->ks_flags = flags;
+	ksp->ks_update = kstat_default_update;
+
+	switch (ksp->ks_type) {
+	case KSTAT_TYPE_RAW:
+		ksp->ks_ndata = 1;
+		ksp->ks_data_size = ks_ndata;
+		break;
+	case KSTAT_TYPE_NAMED:
+		ksp->ks_ndata = ks_ndata;
+		ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
+		break;
+	case KSTAT_TYPE_INTR:
+		ksp->ks_ndata = ks_ndata;
+		ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
+		break;
+	case KSTAT_TYPE_IO:
+		ksp->ks_ndata = ks_ndata;
+		ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
+		break;
+	case KSTAT_TYPE_TIMER:
+		ksp->ks_ndata = ks_ndata;
+		ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
+		break;
+	default:
+		panic("Undefined kstat type %d\n", ksp->ks_type);
+	}
+
+	if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) {
+		ksp->ks_data = NULL;
+	} else {
+		ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
+		if (ksp->ks_data == NULL) {
+			/*
+			 * ksp came from malloc(M_KSTAT), so it must go back
+			 * through free(), and we must not touch it afterwards.
+			 */
+			free(ksp, M_KSTAT);
+			return (NULL);
+		}
+	}
+	/*
+	 * Create sysctl tree for those statistics:
+	 *
+	 *	kstat.<module>.<class>.<name>.
+	 */
+	sysctl_ctx_init(&ksp->ks_sysctl_ctx);
+	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
+	    SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0,
+	    "");
+	if (root == NULL) {
+		printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
+		goto fail;
+	}
+	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+	    OID_AUTO, class, CTLFLAG_RW, 0, "");
+	if (root == NULL) {
+		printf("%s: Cannot create kstat.%s.%s tree!\n", __func__,
+		    module, class);
+		goto fail;
+	}
+	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
+	    OID_AUTO, name, CTLFLAG_RW, 0, "");
+	if (root == NULL) {
+		printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__,
+		    module, class, name);
+		goto fail;
+	}
+	ksp->ks_sysctl_root = root;
+
+	return (ksp);
+
+fail:
+	sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+	/* ks_data is non-NULL here only if it was allocated above. */
+	if (ksp->ks_data != NULL)
+		kmem_free(ksp->ks_data, ksp->ks_data_size);
+	free(ksp, M_KSTAT);
+	return (NULL);
+}
+
+/*
+ * Sysctl handler shared by all named entries: copies the entry's 64-bit
+ * value to a local and reports it through sysctl_handle_64().
+ */
+static int
+kstat_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	kstat_named_t *ksent = arg1;
+	uint64_t val;
+
+	val = ksent->value.ui64;
+	return (sysctl_handle_64(oidp, &val, 0, req));
+}
+
+/*
+ * Publish each entry of the kstat's data area as a read-only sysctl leaf
+ * under the kstat's node.  The data area is walked as an array of
+ * kstat_named_t.  An entry whose data_type is 0 reuses the type and name
+ * of the most recent entry that had one.  Nothing is published when
+ * ks_ndata is UINT32_MAX or ks_data is NULL.
+ */
+void
+kstat_install(kstat_t *ksp)
+{
+	kstat_named_t *ksent;
+	char *namelast;
+	int typelast;
+
+	ksent = ksp->ks_data;
+	if (ksp->ks_ndata == UINT32_MAX) {
+#ifdef INVARIANTS
+		printf("can't handle raw ops yet!!!\n");
+#endif
+		return;
+	}
+	if (ksent == NULL) {
+		printf("%s ksp->ks_data == NULL!!!!\n", __func__);
+		return;
+	}
+	typelast = 0;
+	namelast = NULL;
+	for (int i = 0; i < ksp->ks_ndata; i++, ksent++) {
+		if (ksent->data_type != 0) {
+			typelast = ksent->data_type;
+			namelast = ksent->name;
+		}
+		switch (typelast) {
+		case KSTAT_DATA_INT32:
+			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+			    OID_AUTO, namelast,
+			    CTLTYPE_S32 | CTLFLAG_RD, ksent,
+			    sizeof (*ksent), kstat_sysctl, "I",
+			    namelast);
+			break;
+		case KSTAT_DATA_UINT32:
+			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+			    OID_AUTO, namelast,
+			    CTLTYPE_U32 | CTLFLAG_RD, ksent,
+			    sizeof (*ksent), kstat_sysctl, "IU",
+			    namelast);
+			break;
+		case KSTAT_DATA_INT64:
+			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+			    OID_AUTO, namelast,
+			    CTLTYPE_S64 | CTLFLAG_RD, ksent,
+			    sizeof (*ksent), kstat_sysctl, "Q",
+			    namelast);
+			break;
+		case KSTAT_DATA_UINT64:
+			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
+			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
+			    OID_AUTO, namelast,
+			    CTLTYPE_U64 | CTLFLAG_RD, ksent,
+			    sizeof (*ksent), kstat_sysctl, "QU",
+			    namelast);
+			break;
+		default:
+			panic("unsupported type: %d", typelast);
+		}
+
+	}
+}
+
+/*
+ * Tear down a kstat: remove its sysctl nodes, release the data area that
+ * __kstat_create() allocated (virtual kstats point at caller-owned data,
+ * which is left alone), then free the kstat itself.
+ */
+void
+kstat_delete(kstat_t *ksp)
+{
+
+	sysctl_ctx_free(&ksp->ks_sysctl_ctx);
+	if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL) && ksp->ks_data != NULL)
+		kmem_free(ksp->ks_data, ksp->ks_data_size);
+	free(ksp, M_KSTAT);
+}
+
+/*
+ * Copy src into the KSTAT_STRLEN-byte buffer dst, truncating if needed;
+ * the buffer is zeroed first so the result is always NUL-terminated.
+ */
+void
+kstat_set_string(char *dst, const char *src)
+{
+
+	bzero(dst, KSTAT_STRLEN);
+	(void) strncpy(dst, src, KSTAT_STRLEN - 1);
+}
+
+/* Set the name and data type of one named-kstat entry. */
+void
+kstat_named_init(kstat_named_t *knp, const char *name, uchar_t data_type)
+{
+
+	kstat_set_string(knp->name, name);
+	knp->data_type = data_type;
+}
+
+/*
+ * Account for a request entering the wait queue.  The time since the last
+ * update is charged to wlentime/wtime only if the queue was non-empty
+ * before this request arrived.
+ */
+void
+kstat_waitq_enter(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t wcnt;
+
+	new = gethrtime();
+	delta = new - kiop->wlastupdate;
+	kiop->wlastupdate = new;
+	wcnt = kiop->wcnt++;
+	if (wcnt != 0) {
+		kiop->wlentime += delta * wcnt;
+		kiop->wtime += delta;
+	}
+}
+
+/*
+ * Account for a request leaving the wait queue; the queue must have been
+ * non-empty.
+ */
+void
+kstat_waitq_exit(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t wcnt;
+
+	new = gethrtime();
+	delta = new - kiop->wlastupdate;
+	kiop->wlastupdate = new;
+	wcnt = kiop->wcnt--;
+	ASSERT((int)wcnt > 0);
+	kiop->wlentime += delta * wcnt;
+	kiop->wtime += delta;
+}
+
+/* Run-queue counterpart of kstat_waitq_enter(). */
+void
+kstat_runq_enter(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t rcnt;
+
+	new = gethrtime();
+	delta = new - kiop->rlastupdate;
+	kiop->rlastupdate = new;
+	rcnt = kiop->rcnt++;
+	if (rcnt != 0) {
+		kiop->rlentime += delta * rcnt;
+		kiop->rtime += delta;
+	}
+}
+
+/* Run-queue counterpart of kstat_waitq_exit(). */
+void
+kstat_runq_exit(kstat_io_t *kiop)
+{
+	hrtime_t new, delta;
+	ulong_t rcnt;
+
+	new = gethrtime();
+	delta = new - kiop->rlastupdate;
+	kiop->rlastupdate = new;
+	rcnt = kiop->rcnt--;
+	ASSERT((int)rcnt > 0);
+	kiop->rlentime += delta * rcnt;
+	kiop->rtime += delta;
+}
diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c
new file mode 100644
index 000000000000..ab4702574b08
--- /dev/null
+++ b/module/os/freebsd/spl/spl_misc.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+char hw_serial[11] = "0";
+
+/* Filled in by opensolaris_utsname_init(); handed out by utsname(). */
+static struct opensolaris_utsname hw_utsname = {
+	.machine = MACHINE
+};
+
+/*
+ * SYSINIT hook: populate hw_utsname from ostype, prison0's hostname,
+ * osrelease and osreldate.
+ */
+static void
+opensolaris_utsname_init(void *arg)
+{
+
+	hw_utsname.sysname = ostype;
+	hw_utsname.nodename = prison0.pr_hostname;
+	hw_utsname.release = osrelease;
+	snprintf(hw_utsname.version, sizeof (hw_utsname.version),
+	    "%d", osreldate);
+}
+
+/*
+ * Return a copy of s in a buffer of exactly strlen(s) + 1 bytes obtained
+ * from kmem_alloc() with KM_SLEEP.
+ */
+char *
+kmem_strdup(const char *s)
+{
+	char *buf;
+
+	buf = kmem_alloc(strlen(s) + 1, KM_SLEEP);
+	strcpy(buf, s);
+	return (buf);
+}
+
+/*
+ * Copy len bytes from 'from' to 'to'.  With FKIOCTL set this is a plain
+ * memcpy() and returns 0; otherwise the result of copyin() is returned.
+ */
+int
+ddi_copyin(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'from' is a kernel address */
+	if (flags & FKIOCTL) {
+		memcpy(to, from, len);
+		return (0);
+	}
+
+	return (copyin(from, to, len));
+}
+
+/*
+ * Copy len bytes from 'from' to 'to'.  With FKIOCTL set this is a plain
+ * memcpy() and returns 0; otherwise the result of copyout() is returned.
+ */
+int
+ddi_copyout(const void *from, void *to, size_t len, int flags)
+{
+	/* Fake ioctl() issued by kernel, 'to' is a kernel address */
+	if (flags & FKIOCTL) {
+		memcpy(to, from, len);
+		return (0);
+	}
+
+	return (copyout(from, to, len));
+}
+
+/*
+ * Panic with the formatted message.  The file, func and line arguments
+ * are accepted but not included in the message.  No value is returned
+ * after vpanic(); NOTE(review): this relies on vpanic() never returning --
+ * confirm it is declared noreturn on all supported versions.
+ */
+int
+spl_panic(const char *file, const char *func, int line, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	vpanic(fmt, ap);
+	va_end(ap);
+}
+
+/* Return the utsname structure filled in by opensolaris_utsname_init(). */
+utsname_t *
+utsname(void)
+{
+	return (&hw_utsname);
+}
+SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY,
+    opensolaris_utsname_init, NULL);
diff --git a/module/os/freebsd/spl/spl_policy.c b/module/os/freebsd/spl/spl_policy.c
new file mode 100644
index 000000000000..53732836ff9a
--- /dev/null
+++ b/module/os/freebsd/spl/spl_policy.c
@@ -0,0 +1,429 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/*
+ * Unless noted otherwise, the secpolicy_*() functions in this file return
+ * 0 when the operation is allowed and an errno value when it is not.
+ */
+
+/* Requires PRIV_NFS_DAEMON. */
+int
+secpolicy_nfs(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON));
+}
+
+/* Requires PRIV_VFS_MOUNT. */
+int
+secpolicy_zfs(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+/* Requires PRIV_ZFS_POOL_CONFIG; checkonly is not used. */
+int
+secpolicy_sys_config(cred_t *cr, int checkonly __unused)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG));
+}
+
+/* Requires PRIV_ZFS_INJECT. */
+int
+secpolicy_zinject(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT));
+}
+
+/* Requires PRIV_VFS_UNMOUNT; the mount itself is not examined. */
+int
+secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT));
+}
+
+/*
+ * Returns 0 only when zfs_super_owner is enabled and cr has the same uid
+ * and prison as the credential that mounted mp; EPERM otherwise.  Most
+ * checks below use this as a short-circuit that grants access.
+ */
+int
+secpolicy_fs_owner(struct mount *mp, cred_t *cr)
+{
+
+	if (zfs_super_owner) {
+		if (cr->cr_uid == mp->mnt_cred->cr_uid &&
+		    cr->cr_prison == mp->mnt_cred->cr_prison) {
+			return (0);
+		}
+	}
+	return (EPERM);
+}
+
+/*
+ * This check is done in kern_link(), so we could just return 0 here.
+ */
+extern int hardlink_check_uid;
+int
+secpolicy_basic_link(vnode_t *vp, cred_t *cr)
+{
+
+	if (!hardlink_check_uid)
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_LINK));
+}
+
+/* Always denied (EPERM). */
+int
+secpolicy_vnode_stky_modify(cred_t *cr)
+{
+
+	return (EPERM);
+}
+
+/* Allowed for the filesystem owner, otherwise requires PRIV_VFS_ADMIN. */
+int
+secpolicy_vnode_remove(vnode_t *vp, cred_t *cr)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+/*
+ * Check whether cr holds the privilege for each access bit set in
+ * accmode: PRIV_VFS_READ for VREAD, PRIV_VFS_WRITE for VWRITE, and for
+ * VEXEC either PRIV_VFS_LOOKUP (directories) or PRIV_VFS_EXEC (anything
+ * else).  Returns EACCES on the first missing privilege.  The owner
+ * argument is not consulted.
+ */
+int
+secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0)
+		return (EACCES);
+	if ((accmode & VWRITE) &&
+	    spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) {
+		return (EACCES);
+	}
+	if (accmode & VEXEC) {
+		if (vp->v_type == VDIR) {
+			if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0)
+				return (EACCES);
+		} else {
+			if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0)
+				return (EACCES);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ */
+int
+secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+    accmode_t curmode, accmode_t wantmode)
+{
+	accmode_t mode;
+
+	/* Only the wanted bits that are not already granted need checking. */
+	mode = ~curmode & wantmode;
+
+	if (mode == 0)
+		return (0);
+
+	return (secpolicy_vnode_access(cr, vp, owner, mode));
+}
+
+/*
+ * Succeeds if cr is the filesystem owner, is the file owner, or holds any
+ * one of the listed VFS privileges that applies to the vnode type
+ * (PRIV_VFS_EXEC is skipped for directories, PRIV_VFS_LOOKUP for
+ * non-directories).  Returns EPERM otherwise.
+ */
+int
+secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner)
+{
+	static int privs[] = {
+	    PRIV_VFS_ADMIN,
+	    PRIV_VFS_READ,
+	    PRIV_VFS_WRITE,
+	    PRIV_VFS_EXEC,
+	    PRIV_VFS_LOOKUP
+	};
+	int i;
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	/* Same as secpolicy_vnode_setdac */
+	if (owner == cr->cr_uid)
+		return (0);
+
+	for (i = 0; i < sizeof (privs)/sizeof (int); i++) {
+		int priv;
+
+		switch (priv = privs[i]) {
+		case PRIV_VFS_EXEC:
+			if (vp->v_type == VDIR)
+				continue;
+			break;
+		case PRIV_VFS_LOOKUP:
+			if (vp->v_type != VDIR)
+				continue;
+			break;
+		}
+		if (spl_priv_check_cred(cr, priv) == 0)
+			return (0);
+	}
+	return (EPERM);
+}
+
+/*
+ * Allowed for the file owner and the filesystem owner, otherwise requires
+ * PRIV_VFS_ADMIN.
+ */
+int
+secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+	if (owner == cr->cr_uid)
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN));
+}
+
+/*
+ * Validate an attribute change described by vap against the current
+ * attributes in ovap.  Each group of bits in vap->va_mask (size, mode,
+ * uid/gid, atime/mtime) is checked in turn and the first failure is
+ * returned.  unlocked_access(node, VWRITE, cr) is the caller-supplied
+ * write-access check.  vap may be modified: va_mode is set from ovap when
+ * AT_MODE is not requested, and set-id bits may be cleared on an
+ * ownership change (see secpolicy_setid_clear()).
+ */
+int
+secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+    const struct vattr *ovap, int flags,
+    int unlocked_access(void *, int, cred_t *), void *node)
+{
+	int mask = vap->va_mask;
+	int error;
+
+	if (mask & AT_SIZE) {
+		if (vp->v_type == VDIR)
+			return (EISDIR);
+		error = unlocked_access(node, VWRITE, cr);
+		if (error)
+			return (error);
+	}
+	if (mask & AT_MODE) {
+		/*
+		 * If not the owner of the file then check privilege
+		 * for two things: the privilege to set the mode at all
+		 * and, if we're setting setuid, we also need permissions
+		 * to add the set-uid bit, if we're not the owner.
+		 * In the specific case of creating a set-uid root
+		 * file, we need even more permissions.
+		 */
+		error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+		if (error)
+			return (error);
+		error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr);
+		if (error)
+			return (error);
+	} else {
+		vap->va_mode = ovap->va_mode;
+	}
+	if (mask & (AT_UID | AT_GID)) {
+		error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+		if (error)
+			return (error);
+
+		/*
+		 * To change the owner of a file, or change the group of
+		 * a file to a group of which we are not a member, the
+		 * caller must have privilege.
+		 */
+		if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+		    ((mask & AT_GID) && vap->va_gid != ovap->va_gid &&
+		    !groupmember(vap->va_gid, cr))) {
+			if (secpolicy_fs_owner(vp->v_mount, cr) != 0) {
+				error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN);
+				if (error)
+					return (error);
+			}
+		}
+
+		/* Any real ownership change may strip the set-id bits. */
+		if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) ||
+		    ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) {
+			secpolicy_setid_clear(vap, vp, cr);
+		}
+	}
+	if (mask & (AT_ATIME | AT_MTIME)) {
+		/*
+		 * From utimes(2):
+		 * If times is NULL, ... The caller must be the owner of
+		 * the file, have permission to write the file, or be the
+		 * super-user.
+		 * If times is non-NULL, ... The caller must be the owner of
+		 * the file or be the super-user.
+		 */
+		error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid);
+		if (error && (vap->va_vaflags & VA_UTIMES_NULL))
+			error = unlocked_access(node, VWRITE, cr);
+		if (error)
+			return (error);
+	}
+	return (0);
+}
+
+/* Always denied (EPERM). */
+int
+secpolicy_vnode_create_gid(cred_t *cr)
+{
+
+	return (EPERM);
+}
+
+/*
+ * Allowed for members of gid and for the filesystem owner, otherwise
+ * requires PRIV_VFS_SETGID.
+ */
+int
+secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid)
+{
+
+	if (groupmember(gid, cr))
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_SETGID));
+}
+
+/*
+ * Allowed for the filesystem owner, otherwise requires
+ * PRIV_VFS_RETAINSUGID; issuidroot is not used.
+ */
+int
+secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
+    boolean_t issuidroot __unused)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID));
+}
+
+/*
+ * If vap carries a set-uid or set-gid bit and cr is neither the
+ * filesystem owner nor holds PRIV_VFS_RETAINSUGID, clear both bits in
+ * vap->va_mode and mark AT_MODE in vap->va_mask.
+ */
+void
+secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return;
+
+	if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) {
+		if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) {
+			vap->va_mask |= AT_MODE;
+			vap->va_mode &= ~(S_ISUID|S_ISGID);
+		}
+	}
+}
+
+/*
+ * Check the sticky, set-gid and set-uid bits requested in vap->va_mode
+ * against the current attributes in ovap.  Despite the name, nothing is
+ * cleared here: the function only returns 0 or an error.
+ */
+int
+secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+    const struct vattr *ovap, cred_t *cr)
+{
+	int error;
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	/*
+	 * Privileged processes may set the sticky bit on non-directories,
+	 * as well as set the setgid bit on a file with a group that the process
+	 * is not a member of. Both of these are allowed in jail(8).
+	 */
+	if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) {
+		if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE))
+			return (EFTYPE);
+	}
+	/*
+	 * Check for privilege if attempting to set the
+	 * group-id bit.
+	 */
+	if ((vap->va_mode & S_ISGID) != 0) {
+		error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid);
+		if (error)
+			return (error);
+	}
+	/*
+	 * Deny setting setuid if we are not the file owner.
+	 */
+	if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) {
+		error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN);
+		if (error)
+			return (error);
+	}
+	return (0);
+}
+
+/* Requires PRIV_VFS_MOUNT; mvp and vfsp are not examined. */
+int
+secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT));
+}
+
+/*
+ * Allowed for the file owner and the filesystem owner, otherwise requires
+ * PRIV_VFS_MOUNT_OWNER.
+ */
+int
+secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+	if (owner == cr->cr_uid)
+		return (0);
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+
+	/* XXX: vfs_suser()? */
+	return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER));
+}
+
+/*
+ * Allowed for the filesystem owner, otherwise requires PRIV_VFS_CHOWN;
+ * the owner argument is not consulted.
+ */
+int
+secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN));
+}
+
+/*
+ * For credentials lacking PRIV_VFS_MOUNT_NONUSER, force the mount to be
+ * a nosetuid user mount: the flags and mount options are updated under
+ * the mount interlock.
+ */
+void
+secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp)
+{
+
+	if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) {
+		MNT_ILOCK(vfsp);
+		vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER;
+		vfs_clearmntopt(vfsp, MNTOPT_SETUID);
+		vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
+		MNT_IUNLOCK(vfsp);
+	}
+}
+
+/*
+ * Check privileges for setting xvattr attributes
+ */
+int
+secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+    vtype_t vtype)
+{
+
+	if (secpolicy_fs_owner(vp->v_mount, cr) == 0)
+		return (0);
+	return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS));
+}
+
+/* Requires PRIV_NETSMB. */
+int
+secpolicy_smb(cred_t *cr)
+{
+
+	return (spl_priv_check_cred(cr, PRIV_NETSMB));
+}
diff --git a/module/os/freebsd/spl/spl_procfs_list.c b/module/os/freebsd/spl/spl_procfs_list.c
new file mode 100644
index 000000000000..7b4ae9d0e357
--- /dev/null
+++ b/module/os/freebsd/spl/spl_procfs_list.c
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+/* No-op stub: the arguments are ignored and nothing is written. */
+void
+seq_printf(struct seq_file *m, const char *fmt, ...)
+{}
+
+/*
+ * Initialise the in-memory side of a procfs list: its lock, its list
+ * (sized and linked through the procfs_list_node_t embedded at
+ * procfs_list_node_off inside each element), the next node id (1) and the
+ * node offset.  module, name, mode and the show/show_header/clear
+ * callbacks are accepted but not used here.
+ */
+void
+procfs_list_install(const char *module,
+    const char *name,
+    mode_t mode,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off)
+{
+	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&procfs_list->pl_list,
+	    procfs_list_node_off + sizeof (procfs_list_node_t),
+	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+	procfs_list->pl_next_id = 1;
+	procfs_list->pl_node_offset = procfs_list_node_off;
+}
+
+/* No-op: procfs_list_install() registers nothing that needs removing. */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{}
+
+/* Destroy the list and its lock; the list must already be empty. */
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+	ASSERT(list_is_empty(&procfs_list->pl_list));
+	list_destroy(&procfs_list->pl_list);
+	mutex_destroy(&procfs_list->pl_lock);
+}
+
+/* The pln_id of the procfs_list_node_t embedded in element obj. */
+#define	NODE_ID(procfs_list, obj) \
+		(((procfs_list_node_t *)(((char *)obj) + \
+		(procfs_list)->pl_node_offset))->pln_id)
+
+/*
+ * Assign the next id to element p and append it to the list.  The caller
+ * must hold pl_lock.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+	list_insert_tail(&procfs_list->pl_list, p);
+}
diff --git a/module/os/freebsd/spl/spl_string.c b/module/os/freebsd/spl/spl_string.c
new file mode 100644
index 000000000000..14d816b5c326
--- /dev/null
+++ b/module/os/freebsd/spl/spl_string.c
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +char * +strpbrk(const char *s, const char *b) +{ + const char *p; + + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + + return (NULL); +} + +/* + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. + */ +void +strident_canon(char *s, size_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +/* + * Do not change the length of the returned string; it must be freed + * with strfree(). + */ +char * +kmem_asprintf(const char *fmt, ...) 
+{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +void +kmem_strfree(char *str) +{ + ASSERT(str != NULL); + kmem_free(str, strlen(str) + 1); +} diff --git a/module/os/freebsd/spl/spl_sunddi.c b/module/os/freebsd/spl/spl_sunddi.c new file mode 100644 index 000000000000..1fa4f56f1f8c --- /dev/null +++ b/module/os/freebsd/spl/spl_sunddi.c @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + + *result = strtol(str, nptr, base); + return (0); +} + +int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + + if (str == hw_serial) { + *result = prison0.pr_hostid; + return (0); + } + + *result = strtoul(str, nptr, base); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) +{ + + *result = (unsigned long long)strtouq(str, nptr, base); + return (0); +} + +int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + + *result = (long long)strtoq(str, nptr, base); + return (0); +} diff --git a/module/os/freebsd/spl/spl_sysevent.c b/module/os/freebsd/spl/spl_sysevent.c new file mode 100644 index 000000000000..d3748276a71c --- /dev/null +++ b/module/os/freebsd/spl/spl_sysevent.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +log_sysevent(nvlist_t *event) +{ + struct sbuf *sb; + const char *type; + char typestr[128]; + nvpair_t *elem = NULL; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + type = NULL; + + while ((elem = nvlist_next_nvpair(event, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + { + boolean_t value; + + (void) nvpair_value_boolean_value(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), + value ? 
"true" : "false"); + break; + } + case DATA_TYPE_UINT8: + { + uint8_t value; + + (void) nvpair_value_uint8(elem, &value); + sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); + break; + } + case DATA_TYPE_INT32: + { + int32_t value; + + (void) nvpair_value_int32(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT32: + { + uint32_t value; + + (void) nvpair_value_uint32(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_INT64: + { + int64_t value; + + (void) nvpair_value_int64(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT64: + { + uint64_t value; + + (void) nvpair_value_uint64(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_STRING: + { + char *value; + + (void) nvpair_value_string(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); + if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) + type = value; + break; + } + case DATA_TYPE_UINT8_ARRAY: + { + uint8_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint8_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%02hhx", value[ii]); + break; + } + case DATA_TYPE_UINT16_ARRAY: + { + uint16_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint16_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%04hx", value[ii]); + break; + } + case DATA_TYPE_UINT32_ARRAY: + { + uint32_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint32_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_INT64_ARRAY: + { + int64_t *value; + uint_t ii, nelem; + + (void) 
nvpair_value_int64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016lld", + (long long)value[ii]); + break; + } + case DATA_TYPE_UINT64_ARRAY: + { + uint64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_STRING_ARRAY: + { + char **strarr; + uint_t ii, nelem; + + (void) nvpair_value_string_array(elem, &strarr, &nelem); + + for (ii = 0; ii < nelem; ii++) { + if (strarr[ii] == NULL) { + sbuf_printf(sb, " "); + continue; + } + + sbuf_printf(sb, " %s", strarr[ii]); + if (strcmp(FM_CLASS, strarr[ii]) == 0) + type = strarr[ii]; + } + break; + } + case DATA_TYPE_NVLIST: + /* XXX - requires recursing in log_sysevent */ + break; + default: + printf("%s: type %d is not implemented\n", __func__, + nvpair_type(elem)); + break; + } + } + + if (sbuf_finish(sb) != 0) { + sbuf_delete(sb); + return (ENOMEM); + } + + if (type == NULL) + type = ""; + if (strncmp(type, "ESC_ZFS_", 8) == 0) { + snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8); + type = typestr; + } + devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); + sbuf_delete(sb); + + return (0); +} + +static void +sysevent_worker(void *arg __unused) +{ + zfs_zevent_t *ze; + nvlist_t *event; + uint64_t dropped = 0; + uint64_t dst_size; + int error; + + zfs_zevent_init(&ze); + for (;;) { + dst_size = 131072; + dropped = 0; + event = NULL; + error = zfs_zevent_next(ze, &event, + &dst_size, &dropped); + if (error) { + error = zfs_zevent_wait(ze); + if (error == ESHUTDOWN) + break; + } else { + VERIFY(event != NULL); + log_sysevent(event); + nvlist_free(event); + } + } + zfs_zevent_destroy(ze); + kthread_exit(); +} + +void +ddi_sysevent_init(void) +{ + kproc_kthread_add(sysevent_worker, NULL, &zfsproc, NULL, 0, 0, + "zfskern", "sysevent"); +} 
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c new file mode 100644 index 000000000000..b6a501f6773f --- /dev/null +++ b/module/os/freebsd/spl/spl_taskq.c @@ -0,0 +1,329 @@ +/* + * Copyright (c) 2009 Pawel Jakub Dawidek + * All rights reserved. + * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static uint_t taskq_tsd; +static uma_zone_t taskq_zone; + +taskq_t *system_taskq = NULL; +taskq_t *system_delay_taskq = NULL; +taskq_t *dynamic_taskq = NULL; + +extern int uma_align_cache; + +#define TQ_MASK uma_align_cache +#define TQ_PTR_MASK ~uma_align_cache + +#define TIMEOUT_TASK 1 +#define NORMAL_TASK 2 + +static int +taskqent_init(void *mem, int size, int flags) +{ + bzero(mem, sizeof (taskq_ent_t)); + return (0); +} + +static int +taskqent_ctor(void *mem, int size, void *arg, int flags) +{ + return (0); +} + +static void +taskqent_dtor(void *mem, int size, void *arg) +{ + taskq_ent_t *ent = mem; + + ent->tqent_gen = (ent->tqent_gen + 1) & TQ_MASK; +} + +static void +system_taskq_init(void *arg) +{ + + tsd_create(&taskq_tsd, NULL); + taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t), + taskqent_ctor, taskqent_dtor, taskqent_init, NULL, + UMA_ALIGN_CACHE, UMA_ZONE_NOFREE); + system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri, + 0, 0, 0); + system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus, + minclsyspri, 0, 0, 0); +} +SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, + NULL); + +/* + * SYSUNINIT hook: destroy both taskqs created by system_taskq_init() and + * only then release the UMA zone and the TSD key. The delay taskq must be + * destroyed as well, otherwise it is leaked when the module is torn down. + */ +static void +system_taskq_fini(void *arg) +{ + + taskq_destroy(system_delay_taskq); + taskq_destroy(system_taskq); + uma_zdestroy(taskq_zone); + tsd_destroy(&taskq_tsd); +} +SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, + NULL); + +static void +taskq_tsd_set(void *context) +{ + taskq_t *tq = context; + + tsd_set(taskq_tsd, tq); +} + +static taskq_t * +taskq_create_with_init(const char *name, int nthreads, pri_t pri, + int minalloc __unused, int maxalloc __unused, uint_t flags) +{ + taskq_t *tq; + + if ((flags & TASKQ_THREADS_CPU_PCT) != 0) + nthreads = MAX((mp_ncpus * nthreads) / 100, 1); + + tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_queue = taskqueue_create(name, 
M_WAITOK, + taskqueue_thread_enqueue, &tq->tq_queue); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, + taskq_tsd_set, tq); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, + taskq_tsd_set, NULL); + (void) taskqueue_start_threads(&tq->tq_queue, nthreads, pri, + "%s", name); + + return ((taskq_t *)tq); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, + int maxalloc __unused, uint_t flags) +{ + + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, + int maxalloc, proc_t *proc __unused, uint_t flags) +{ + + return (taskq_create_with_init(name, nthreads, pri, minalloc, maxalloc, + flags)); +} + +void +taskq_destroy(taskq_t *tq) +{ + + taskqueue_free(tq->tq_queue); + kmem_free(tq, sizeof (*tq)); +} + +int +taskq_member(taskq_t *tq, kthread_t *thread) +{ + + return (taskqueue_member(tq->tq_queue, thread)); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} + +int +taskq_cancel_id(taskq_t *tq, taskqid_t tid) +{ + uint32_t pend; + int rc; + taskq_ent_t *ent = (void*)(tid & TQ_PTR_MASK); + + if (ent == NULL) + return (0); + if ((tid & TQ_MASK) != ent->tqent_gen) + return (0); + if (ent->tqent_type == TIMEOUT_TASK) { + rc = taskqueue_cancel_timeout(tq->tq_queue, + &ent->tqent_timeout_task, &pend); + } else + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) + taskq_wait_id(tq, tid); + else + uma_zfree(taskq_zone, ent); + return (rc); +} + +static void +taskq_run(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + task->tqent_func(task->tqent_arg); + uma_zfree(taskq_zone, task); +} + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskq_ent_t *task; + taskqid_t tid; + clock_t timo; + int mflag; + + timo = expire_time - 
ddi_get_lbolt(); + if (timo <= 0) + return (taskq_dispatch(tq, func, arg, flags)); + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + tid = (uintptr_t)task; + MPASS((tid & TQ_MASK) == 0); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_type = TIMEOUT_TASK; + tid |= task->tqent_gen; + TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, + taskq_run, task); + + taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task, + timo); + return (tid); +} + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *task; + int mflag, prio; + taskqid_t tid; + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. + */ + prio = !!(flags & TQ_FRONT); + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + + tid = (uintptr_t)task; + MPASS((tid & TQ_MASK) == 0); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_type = NORMAL_TASK; + TASK_INIT(&task->tqent_task, prio, taskq_run, task); + tid |= task->tqent_gen; + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); + return (tid); +} + +static void +taskq_run_ent(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + task->tqent_func(task->tqent_arg); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, + taskq_ent_t *task) +{ + int prio; + + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. 
+ */ + prio = !!(flags & TQ_FRONT); + + task->tqent_func = func; + task->tqent_arg = arg; + + TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); +} + +void +taskq_wait(taskq_t *tq) +{ + taskqueue_quiesce(tq->tq_queue); +} + +/* + * Wait for the task identified by tid to finish. A zero id, or a stale id + * whose generation bits no longer match the entry, is ignored. Entries + * dispatched by taskq_dispatch_delay() are queued via tqent_timeout_task, + * so they are drained with taskqueue_drain_timeout(); all other entries + * are drained through tqent_task. + */ +void +taskq_wait_id(taskq_t *tq, taskqid_t tid) +{ + taskq_ent_t *ent = (void*)(tid & TQ_PTR_MASK); + + /* taskq_dispatch*() return 0 on allocation failure. */ + if (ent == NULL) + return; + if ((tid & TQ_MASK) != ent->tqent_gen) + return; + + if (ent->tqent_type == TIMEOUT_TASK) + taskqueue_drain_timeout(tq->tq_queue, + &ent->tqent_timeout_task); + else + taskqueue_drain(tq->tq_queue, &ent->tqent_task); +} + +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) +{ + taskqueue_drain_all(tq->tq_queue); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (t->tqent_task.ta_pending == 0); +} diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c new file mode 100644 index 000000000000..05dbfd06d79d --- /dev/null +++ b/module/os/freebsd/spl/spl_uio.c @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +/* + * $FreeBSD$ + */ + +#include +#include +#include + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +int +uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) +{ + struct iovec small_iovec[1]; + struct uio small_uio_clone; + struct uio *uio_clone; + int error; + + ASSERT3U(uio->uio_rw, ==, rw); + if (uio->uio_iovcnt == 1) { + small_uio_clone = *uio; + small_iovec[0] = *uio->uio_iov; + small_uio_clone.uio_iov = small_iovec; + uio_clone = &small_uio_clone; + } else { + uio_clone = cloneuio(uio); + } + + error = vn_io_fault_uiomove(p, n, uio_clone); + *cbytes = uio->uio_resid - uio_clone->uio_resid; + if (uio_clone != &small_uio_clone) + free(uio_clone, M_IOV); + return (error); +} + +/* + * Drop the next n chars out of *uiop. + */ +void +uioskip(uio_t *uio, size_t n) +{ + enum uio_seg segflg; + + /* For the full compatibility with illumos. */ + if (n > uio->uio_resid) + return; + + segflg = uio->uio_segflg; + uio->uio_segflg = UIO_NOCOPY; + uiomove(NULL, n, uio->uio_rw, uio); + uio->uio_segflg = segflg; +} diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c new file mode 100644 index 000000000000..99da8c976532 --- /dev/null +++ b/module/os/freebsd/spl/spl_vfs.c @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +MALLOC_DECLARE(M_MOUNT); + +void +vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused) +{ + struct vfsopt *opt; + size_t namesize; + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + + if (vfsp->mnt_opt == NULL) { + void *opts; + + MNT_IUNLOCK(vfsp); + opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK); + MNT_ILOCK(vfsp); + if (vfsp->mnt_opt == NULL) { + vfsp->mnt_opt = opts; + TAILQ_INIT(vfsp->mnt_opt); + } else { + free(opts, M_MOUNT); + } + } + + MNT_IUNLOCK(vfsp); + + opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK); + namesize = strlen(name) + 1; + opt->name = malloc(namesize, M_MOUNT, M_WAITOK); + strlcpy(opt->name, name, namesize); + opt->pos = -1; + opt->seen = 1; + if (arg == NULL) { + opt->value = NULL; + opt->len = 0; + } else { + opt->len = strlen(arg) + 1; + opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); + bcopy(arg, opt->value, opt->len); + } + + MNT_ILOCK(vfsp); + TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +void +vfs_clearmntopt(vfs_t *vfsp, const char *name) +{ + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + vfs_deleteopt(vfsp->mnt_opt, name); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +int +vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) +{ + struct vfsoptlist *opts = vfsp->mnt_optnew; + int error; + + if (opts == NULL) + return (0); + error = vfs_getopt(opts, opt, (void **)argp, NULL); + return (error != 0 ? 
0 : 1); +} + +int +mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, + char *fspec, int fsflags) +{ + struct vfsconf *vfsp; + struct mount *mp; + vnode_t *vp, *mvp; + struct ucred *cr; + int error; + + ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); + + vp = *vpp; + *vpp = NULL; + error = 0; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + error = ENAMETOOLONG; + if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) + error = ENODEV; + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; + /* + * We need vnode lock to protect v_mountedhere and vnode interlock + * to protect v_iflag. + */ + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; + VI_UNLOCK(vp); + } + if (error != 0) { + vput(vp); + return (error); + } + VOP_UNLOCK1(vp); + + /* + * Allocate and initialize the filesystem. + * We don't want regular user that triggered snapshot mount to be able + * to unmount it, so pass credentials of the parent mount. + */ + mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred); + + mp->mnt_optnew = NULL; + vfs_setmntopt(mp, "from", fspec, 0); + mp->mnt_optnew = mp->mnt_opt; + mp->mnt_opt = NULL; + + /* + * Set the mount level flags. + */ + mp->mnt_flag = fsflags & MNT_UPDATEMASK; + /* + * Snapshots are always read-only. + */ + mp->mnt_flag |= MNT_RDONLY; + /* + * We don't want snapshots to allow access to vulnerable setuid + * programs, so we turn off setuid when mounting snapshots. + */ + mp->mnt_flag |= MNT_NOSUID; + /* + * We don't want snapshots to be visible in regular + * mount(8) and df(1) output. + */ + mp->mnt_flag |= MNT_IGNORE; + /* + * XXX: This is evil, but we can't mount a snapshot as a regular user. 
+ * XXX: Is is safe when snapshot is mounted from within a jail? + */ + cr = td->td_ucred; + td->td_ucred = kcred; + error = VFS_MOUNT(mp); + td->td_ucred = cr; + + if (error != 0) { + /* + * Clear VI_MOUNT and decrement the use count "atomically", + * under the vnode lock. This is not strictly required, + * but makes it easier to reason about the life-cycle and + * ownership of the covered vnode. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + vput(vp); + vfs_unbusy(mp); + vfs_freeopts(mp->mnt_optnew); + mp->mnt_vnodecovered = NULL; + vfs_mount_destroy(mp); + return (error); + } + + if (mp->mnt_opt != NULL) + vfs_freeopts(mp->mnt_opt); + mp->mnt_opt = mp->mnt_optnew; + (void) VFS_STATFS(mp, &mp->mnt_stat); + + /* + * Prevent external consumers of mount options from reading + * mnt_optnew. + */ + mp->mnt_optnew = NULL; + + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); +#ifdef FREEBSD_NAMECACHE + cache_purge(vp); +#endif + VI_LOCK(vp); + vp->v_iflag &= ~VI_MOUNT; + VI_UNLOCK(vp); + + vp->v_mountedhere = mp; + /* Put the new filesystem on the mount list. */ + mtx_lock(&mountlist_mtx); + TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list); + mtx_unlock(&mountlist_mtx); + vfs_event_signal(NULL, VQ_MOUNT, 0); + if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp)) + panic("mount: lost mount"); + VOP_UNLOCK1(vp); +#if __FreeBSD_version >= 1300048 + vfs_op_exit(mp); +#endif + vfs_unbusy(mp); + *vpp = mvp; + return (0); +} + +/* + * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it + * asynchronously using a taskq. This can avoid deadlocks caused by re-entering + * the file system as a result of releasing the vnode. Note, file systems + * already have to handle the race where the vnode is incremented before the + * inactive routine is called and does its locking. + * + * Warning: Excessive use of this routine can lead to performance problems. + * This is because taskqs throttle back allocation if too many are created. 
+ */ +void +vn_rele_async(vnode_t *vp, taskq_t *taskq) +{ + VERIFY(vp->v_count > 0); + if (refcount_release_if_not_last(&vp->v_usecount)) { +#if __FreeBSD_version < 1300045 + vdrop(vp); +#endif + return; + } + VERIFY(taskq_dispatch((taskq_t *)taskq, + (task_func_t *)vrele, vp, TQ_SLEEP) != 0); +} diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c new file mode 100644 index 000000000000..cd18ebb7adfc --- /dev/null +++ b/module/os/freebsd/spl/spl_vm.c @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +const int zfs_vm_pagerret_bad = VM_PAGER_BAD; +const int zfs_vm_pagerret_error = VM_PAGER_ERROR; +const int zfs_vm_pagerret_ok = VM_PAGER_OK; +const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC; +const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL; + +void +zfs_vmobject_assert_wlocked(vm_object_t object) +{ + + /* + * This is not ideal because FILE/LINE used by assertions will not + * be too helpful, but it must be an hard function for + * compatibility reasons. + */ + VM_OBJECT_ASSERT_WLOCKED(object); +} + +void +zfs_vmobject_wlock(vm_object_t object) +{ + + VM_OBJECT_WLOCK(object); +} + +void +zfs_vmobject_wunlock(vm_object_t object) +{ + + VM_OBJECT_WUNLOCK(object); +} diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c new file mode 100644 index 000000000000..7549483d8bc5 --- /dev/null +++ b/module/os/freebsd/spl/spl_zlib.c @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#if __FreeBSD_version >= 1300041
+#include
+#else
+#include
+#endif
+#include
+
+
+/*
+ * zlib allocation callback: allocates items * size bytes (the product is
+ * computed in size_t) from M_SOLARIS with M_NOWAIT.  The opaque argument is
+ * unused.
+ */
+/*ARGSUSED*/
+static void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+
+	return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT));
+}
+
+/*
+ * zlib free callback matching zcalloc(); the opaque argument is unused.
+ */
+/*ARGSUSED*/
+static void
+zcfree(void *opaque, void *ptr)
+{
+
+	free(ptr, M_SOLARIS);
+}
+
+/*
+ * Install the allocator callbacks on the stream (clearing stream->opaque)
+ * and initialize it for compression at the given level.  Returns the
+ * deflateInit() result.
+ */
+static int
+zlib_deflateInit(z_stream *stream, int level)
+{
+
+	stream->zalloc = zcalloc;
+	stream->opaque = NULL;
+	stream->zfree = zcfree;
+
+	return (deflateInit(stream, level));
+}
+
+/* Thin wrapper around deflate(). */
+static int
+zlib_deflate(z_stream *stream, int flush)
+{
+	return (deflate(stream, flush));
+}
+
+/* Thin wrapper around deflateEnd(). */
+static int
+zlib_deflateEnd(z_stream *stream)
+{
+	return (deflateEnd(stream));
+}
+
+/*
+ * Install the allocator callbacks on the stream (clearing stream->opaque)
+ * and initialize it for decompression.  Returns the inflateInit() result.
+ */
+static int
+zlib_inflateInit(z_stream *stream)
+{
+	stream->zalloc = zcalloc;
+	stream->opaque = NULL;
+	stream->zfree = zcfree;
+
+	return (inflateInit(stream));
+}
+
+/*
+ * Run inflate on the stream.  FreeBSD versions before 1300024 use the
+ * _zlib104_inflate() entry point instead of inflate().
+ */
+static int
+zlib_inflate(z_stream *stream, int finish)
+{
+#if __FreeBSD_version >= 1300024
+	return (inflate(stream, finish));
+#else
+	return (_zlib104_inflate(stream, finish));
+#endif
+}
+
+
+/*
+ * Tear down a stream set up by zlib_inflateInit().  This must call
+ * inflateEnd(): calling inflateInit() here would set up a fresh inflate
+ * state instead of releasing the existing one.
+ */
+static int
+zlib_inflateEnd(z_stream *stream)
+{
+	return (inflateEnd(stream));
+}
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * that improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+	/* The kmem_cache-backed allocation is disabled; always NULL for now. */
+	// return (kmem_cache_alloc(zlib_workspace_cache, flags));
+	return (NULL);
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+	/* Nothing to release while zlib_workspace_alloc() returns NULL. */
+	// kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * z_compress_level returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+    size_t sourceLen, int level)
+{
+	z_stream stream;
+	int err;
+
+	bzero(&stream, sizeof (stream));
+	stream.next_in = (Byte *)source;
+	/* NOTE(review): sourceLen is narrowed to uInt without a range check. */
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+	stream.opaque = NULL;
+
+	/* Reject destination sizes that do not survive the cast to uInt. */
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+	if (!stream.opaque)
+		return (Z_MEM_ERROR);
+#endif
+	err = zlib_deflateInit(&stream, level);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.opaque);
+		return (err);
+	}
+
+	/* One-shot compression: anything but Z_STREAM_END is a failure. */
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_deflateEnd(&stream);
+		zlib_workspace_free(stream.opaque);
+		return (err == Z_OK ? Z_BUF_ERROR : err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_deflateEnd(&stream);
+	zlib_workspace_free(stream.opaque);
+	return (err);
+}
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed buffer.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * z_uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
+ */
+int
+z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen)
+{
+	z_stream stream;
+	int err;
+
+	bzero(&stream, sizeof (stream));
+
+	stream.next_in = (Byte *)source;
+	/* NOTE(review): sourceLen is narrowed to uInt without a range check. */
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	/* Reject destination sizes that do not survive the cast to uInt. */
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.opaque = zlib_workspace_alloc(KM_SLEEP);
+#if 0
+	if (!stream.opaque)
+		return (Z_MEM_ERROR);
+#endif
+	err = zlib_inflateInit(&stream);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.opaque);
+		return (err);
+	}
+
+	err = zlib_inflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_inflateEnd(&stream);
+		zlib_workspace_free(stream.opaque);
+
+		/*
+		 * A dictionary request, or running out of input before the
+		 * end of the stream, is reported as corrupted data.
+		 */
+		if (err == Z_NEED_DICT ||
+		    (err == Z_BUF_ERROR && stream.avail_in == 0))
+			return (Z_DATA_ERROR);
+
+		return (err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_inflateEnd(&stream);
+	zlib_workspace_free(stream.opaque);
+
+	return (err);
+}
+
+#if 0
+int
+spl_zlib_init(void)
+{
+	int size;
+
+	size = MAX(spl_zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+	    zlib_inflate_workspacesize());
+
+	zlib_workspace_cache = kmem_cache_create(
+	    "spl_zlib_workspace_cache",
+	    size, 0, NULL, NULL, NULL, NULL, NULL,
+	    KMC_VMEM | KMC_NOEMERGENCY);
+	if (!zlib_workspace_cache)
+		return (1);
+
+	return (0);
+}
+
+void
+spl_zlib_fini(void)
+{
+	kmem_cache_destroy(zlib_workspace_cache);
+	zlib_workspace_cache = NULL;
+}
+#endif
diff --git a/module/os/freebsd/spl/spl_zone.c b/module/os/freebsd/spl/spl_zone.c
new file mode 100644
index 000000000000..40f21934ef61
--- /dev/null
+++ b/module/os/freebsd/spl/spl_zone.c
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data");
+
+/*
+ * Structure to record list of ZFS datasets exported to a zone.
+ * zd_dataset is a trailing array holding the NUL-terminated dataset name;
+ * entries are allocated with room for the name.
+ */
+typedef struct zone_dataset {
+	LIST_ENTRY(zone_dataset) zd_next;
+	char	zd_dataset[0];
+} zone_dataset_t;
+
+LIST_HEAD(zone_dataset_head, zone_dataset);
+
+/* OSD slot returned by osd_jail_register(); keys the per-jail list head. */
+static int zone_slot;
+
+/*
+ * Record 'dataset' as exported to the jail identified by 'jailid'.
+ *
+ * Returns 0 on success, the spl_priv_check_cred() error if the caller lacks
+ * PRIV_ZFS_JAIL, ENOENT if the jail cannot be found, or EEXIST if the
+ * dataset is already on the jail's list.
+ */
+int
+zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd, *zd2;
+	struct prison *pr;
+	int dofree, error;
+
+	if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+		return (error);
+
+	/* Allocate memory before we grab prison's mutex. */
+	zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK);
+
+	sx_slock(&allprison_lock);
+	pr = prison_find(jailid);	/* Locks &pr->pr_mtx.
+ */
+	sx_sunlock(&allprison_lock);
+	if (pr == NULL) {
+		free(zd, M_ZONES);
+		return (ENOENT);
+	}
+
+	head = osd_jail_get(pr, zone_slot);
+	if (head != NULL) {
+		/* A list already exists: refuse duplicate entries. */
+		dofree = 0;
+		LIST_FOREACH(zd2, head, zd_next) {
+			if (strcmp(dataset, zd2->zd_dataset) == 0) {
+				free(zd, M_ZONES);
+				error = EEXIST;
+				goto end;
+			}
+		}
+	} else {
+		/*
+		 * First dataset for this jail.  Take a hold on the prison so
+		 * the mutex can be dropped around the M_WAITOK allocation of
+		 * the list head, then retake the mutex and install the head.
+		 */
+		dofree = 1;
+		prison_hold_locked(pr);
+		mtx_unlock(&pr->pr_mtx);
+		head = malloc(sizeof (*head), M_ZONES, M_WAITOK);
+		LIST_INIT(head);
+		mtx_lock(&pr->pr_mtx);
+		error = osd_jail_set(pr, zone_slot, head);
+		KASSERT(error == 0, ("osd_jail_set() failed (error=%d)",
+		    error));
+	}
+	strcpy(zd->zd_dataset, dataset);
+	LIST_INSERT_HEAD(head, zd, zd_next);
+end:
+	/*
+	 * dofree is set only when the hold above was taken; in that case the
+	 * exit goes through prison_free_locked() rather than mtx_unlock().
+	 */
+	if (dofree)
+		prison_free_locked(pr);
+	else
+		mtx_unlock(&pr->pr_mtx);
+	return (error);
+}
+
+/*
+ * Remove 'dataset' from the list of datasets exported to jail 'jailid'.
+ *
+ * Returns 0 on success, the spl_priv_check_cred() error if the caller lacks
+ * PRIV_ZFS_JAIL, or ENOENT if the jail, its list, or the dataset entry is
+ * not found.
+ */
+int
+zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd;
+	struct prison *pr;
+	int error;
+
+	if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0)
+		return (error);
+
+	sx_slock(&allprison_lock);
+	pr = prison_find(jailid);
+	sx_sunlock(&allprison_lock);
+	if (pr == NULL)
+		return (ENOENT);
+	head = osd_jail_get(pr, zone_slot);
+	if (head == NULL) {
+		error = ENOENT;
+		goto end;
+	}
+	/* zd is NULL after the loop when no entry matched. */
+	LIST_FOREACH(zd, head, zd_next) {
+		if (strcmp(dataset, zd->zd_dataset) == 0)
+			break;
+	}
+	if (zd == NULL)
+		error = ENOENT;
+	else {
+		LIST_REMOVE(zd, zd_next);
+		free(zd, M_ZONES);
+		/* Drop the OSD slot once the last dataset is gone. */
+		if (LIST_EMPTY(head))
+			osd_jail_del(pr, zone_slot);
+		error = 0;
+	}
+end:
+	mtx_unlock(&pr->pr_mtx);
+	return (error);
+}
+
+/*
+ * Returns true if the named dataset is visible in the current zone.
+ * The 'write' parameter is set to 1 if the dataset is also writable.
+ */
+int
+zone_dataset_visible(const char *dataset, int *write)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd;
+	struct prison *pr;
+	size_t len;
+	int ret = 0;
+
+	/* An empty name is never visible. */
+	if (dataset[0] == '\0')
+		return (0);
+	/* Outside a jail everything is visible and writable. */
+	if (INGLOBALZONE(curproc)) {
+		if (write != NULL)
+			*write = 1;
+		return (1);
+	}
+	pr = curthread->td_ucred->cr_prison;
+	mtx_lock(&pr->pr_mtx);
+	head = osd_jail_get(pr, zone_slot);
+	if (head == NULL)
+		goto end;
+
+	/*
+	 * Walk the list once, looking for datasets which match exactly, or
+	 * specify a dataset underneath an exported dataset. If found, return
+	 * true and note that it is writable.
+	 */
+	LIST_FOREACH(zd, head, zd_next) {
+		len = strlen(zd->zd_dataset);
+		if (strlen(dataset) >= len &&
+		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
+		    (dataset[len] == '\0' || dataset[len] == '/' ||
+		    dataset[len] == '@')) {
+			if (write)
+				*write = 1;
+			ret = 1;
+			goto end;
+		}
+	}
+
+	/*
+	 * Walk the list a second time, searching for datasets which are parents
+	 * of exported datasets. These should be visible, but read-only.
+	 *
+	 * Note that we also have to support forms such as 'pool/dataset/', with
+	 * a trailing slash.
+	 */
+	LIST_FOREACH(zd, head, zd_next) {
+		/* len >= 1 here: the empty name was rejected on entry. */
+		len = strlen(dataset);
+		if (dataset[len - 1] == '/')
+			len--;	/* Ignore trailing slash */
+		if (len < strlen(zd->zd_dataset) &&
+		    bcmp(dataset, zd->zd_dataset, len) == 0 &&
+		    zd->zd_dataset[len] == '/') {
+			if (write)
+				*write = 0;
+			ret = 1;
+			goto end;
+		}
+	}
+end:
+	mtx_unlock(&pr->pr_mtx);
+	return (ret);
+}
+
+/*
+ * Destructor registered with osd_jail_register(): frees every entry on the
+ * list and then the list head itself.
+ */
+static void
+zone_destroy(void *arg)
+{
+	struct zone_dataset_head *head;
+	zone_dataset_t *zd;
+
+	head = arg;
+	while ((zd = LIST_FIRST(head)) != NULL) {
+		LIST_REMOVE(zd, zd_next);
+		free(zd, M_ZONES);
+	}
+	free(head, M_ZONES);
+}
+
+/*
+ * Return the host id of the current thread's prison.  Only a NULL argument
+ * is supported.
+ */
+uint32_t
+zone_get_hostid(void *ptr)
+{
+
+	KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__));
+
+	return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid);
+}
+
+/*
+ * Returns true when the credentials of the first thread of 'p' are not
+ * jailed.
+ */
+boolean_t
+in_globalzone(struct proc *p)
+{
+	return (!jailed(FIRST_THREAD_IN_PROC((p))->td_ucred));
+}
+
+/* Register the jail OSD slot, with zone_destroy() as its destructor. */
+static void
+zone_sysinit(void *arg __unused)
+{
+
+	zone_slot = osd_jail_register(zone_destroy, NULL);
+}
+
+/* Release the jail OSD slot obtained in zone_sysinit(). */
+static void
+zone_sysuninit(void *arg __unused)
+{
+
+	osd_jail_deregister(zone_slot);
+}
+
+SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL);
+SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL);
diff --git a/module/os/freebsd/zfs/abd.c b/module/os/freebsd/zfs/abd.c
new file mode 100644
index 000000000000..888a113a4291
--- /dev/null
+++ b/module/os/freebsd/zfs/abd.c
@@ -0,0 +1,1134 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. 
The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+typedef struct abd_stats {
+	kstat_named_t abdstat_struct_size;
+	kstat_named_t abdstat_scatter_cnt;
+	kstat_named_t abdstat_scatter_data_size;
+	kstat_named_t abdstat_scatter_chunk_waste;
+	kstat_named_t abdstat_linear_cnt;
+	kstat_named_t abdstat_linear_data_size;
+} abd_stats_t;
+
+static abd_stats_t abd_stats = {
+	/* Amount of memory occupied by all of the abd_t struct allocations */
+	{ "struct_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt",			KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size",			KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste",		KSTAT_DATA_UINT64 },
+	/*
+	 * The number of linear ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset() and abd_get_from_buf()). If an
+	 * ABD takes ownership of its buf then it will become tracked.
+	 */
+	{ "linear_cnt",				KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size",			KSTAT_DATA_UINT64 },
+};
+
+/* Accessors for abd_stats; ABDSTAT_INCR() updates a counter atomically. */
+#define	ABDSTAT(stat)		(abd_stats.stat.value.ui64)
+#define	ABDSTAT_INCR(stat, val) \
+	atomic_add_64(&abd_stats.stat.value.ui64, (val))
+#define	ABDSTAT_BUMP(stat)	ABDSTAT_INCR(stat, 1)
+#define	ABDSTAT_BUMPDOWN(stat)	ABDSTAT_INCR(stat, -1)
+
+/*
+ * It is possible to make all future ABDs be linear by setting this to B_FALSE.
+ * Otherwise, ABDs are allocated scattered by default unless the caller uses
+ * abd_alloc_linear().
+ */
+boolean_t zfs_abd_scatter_enabled = B_TRUE;
+
+/*
+ * The size of the chunks ABD allocates. Because the sizes allocated from the
+ * kmem_cache can't change, this tunable can only be modified at boot. Changing
+ * it at runtime would cause ABD iteration to work incorrectly for ABDs which
+ * were allocated with the old size, so a safeguard has been put in place which
+ * will cause the machine to panic if you change it and try to access the data
+ * within a scattered ABD.
+ */
+size_t zfs_abd_chunk_size = 4096;
+
+#if defined(_KERNEL)
+SYSCTL_DECL(_vfs_zfs);
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN,
+	&zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers");
+SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN,
+	&zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates");
+#endif
+
+kmem_cache_t *abd_chunk_cache;
+static kstat_t *abd_ksp;
+
+extern inline boolean_t abd_is_linear(abd_t *abd);
+extern inline void abd_copy(abd_t *dabd, abd_t *sabd, size_t size);
+extern inline void abd_copy_from_buf(abd_t *abd, const void *buf, size_t size);
+extern inline void abd_copy_to_buf(void* buf, abd_t *abd, size_t size);
+extern inline int abd_cmp_buf(abd_t *abd, const void *buf, size_t size);
+extern inline void abd_zero(abd_t *abd, size_t size);
+
+/*
+ * Allocate one chunk from abd_chunk_cache.  Declared with a (void)
+ * parameter list so the definition is a proper prototype.
+ */
+static void *
+abd_alloc_chunk(void)
+{
+	void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE);
+	ASSERT3P(c, !=, NULL);
+	return (c);
+}
+
+/* Return a chunk obtained from abd_alloc_chunk() to abd_chunk_cache. */
+static void
+abd_free_chunk(void *c)
+{
+	kmem_cache_free(abd_chunk_cache, c);
+}
+
+/*
+ * Create the chunk cache (objects of zfs_abd_chunk_size bytes) and install
+ * the "abdstats" kstat.  A failed kstat_create() is tolerated.
+ */
+void
+abd_init(void)
+{
+	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+	    NULL, NULL, NULL, NULL, 0, KMC_NOTOUCH | KMC_NODEBUG);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		abd_ksp->ks_data = &abd_stats;
+		kstat_install(abd_ksp);
+	}
+}
+
+/* Undo abd_init(): delete the kstat (if any) and destroy the chunk cache. */
+void
+abd_fini(void)
+{
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	kmem_cache_destroy(abd_chunk_cache);
+	abd_chunk_cache = NULL;
+}
+
+/* Number of chunks needed to hold 'size' bytes, rounding up. */
+static inline size_t
+abd_chunkcnt_for_bytes(size_t size)
+{
+	return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size);
+}
+
+/*
+ * Number of chunks spanned by a scattered ABD, counting the leading
+ * abd_offset bytes of its first chunk.
+ */
+static inline size_t
+abd_scatter_chunkcnt(abd_t *abd)
+{
+	ASSERT(!abd_is_linear(abd));
+	return (abd_chunkcnt_for_bytes(
+	    abd->abd_u.abd_scatter.abd_offset + abd->abd_size));
+}
+
+/*
+ * Assert the invariants of an ABD: a sane size, only known flags, no
+ * ownership for ABDs that have a parent, META only together with OWNER, and
+ * non-NULL buffer/chunk pointers.
+ */
+static inline void
+abd_verify(abd_t *abd)
+{
+	ASSERT3U(abd->abd_size, >, 0);
+	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
+	    ABD_FLAG_OWNER | ABD_FLAG_META));
+	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
+	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
+	if (abd_is_linear(abd)) {
+		ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
+	} else {
+		ASSERT3U(abd->abd_u.abd_scatter.abd_offset, <,
+		    zfs_abd_chunk_size);
+		size_t n = abd_scatter_chunkcnt(abd);
+		/* size_t index: avoids a signed/unsigned comparison with n. */
+		for (size_t i = 0; i < n; i++) {
+			ASSERT3P(
+			    abd->abd_u.abd_scatter.abd_chunks[i], !=, NULL);
+		}
+	}
+}
+
+/*
+ * Allocate an abd_t with room for 'chunkcnt' chunk pointers and account
+ * for its size in abdstat_struct_size.
+ */
+static inline abd_t *
+abd_alloc_struct(size_t chunkcnt)
+{
+	size_t size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	abd_t *abd = kmem_alloc(size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, size);
+
+	return (abd);
+}
+
+/*
+ * Free an abd_t allocated by abd_alloc_struct(), recomputing its size from
+ * the ABD's current chunk count (zero for linear ABDs).
+ */
+static inline void
+abd_free_struct(abd_t *abd)
+{
+	size_t chunkcnt = abd_is_linear(abd) ? 0 : abd_scatter_chunkcnt(abd);
+	int size = offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	kmem_free(abd, size);
+	ABDSTAT_INCR(abdstat_struct_size, -size);
+}
+
+/*
+ * Allocate an ABD, along with its own underlying data buffers. Use this if you
+ * don't care whether the ABD is linear or not.
+ */
+abd_t *
+abd_alloc(size_t size, boolean_t is_metadata)
+{
+	/* Small requests, or scatter disabled: fall back to a linear ABD. */
+	if (!zfs_abd_scatter_enabled || size <= zfs_abd_chunk_size)
+		return (abd_alloc_linear(size, is_metadata));
+
+	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+	size_t n = abd_chunkcnt_for_bytes(size);
+	abd_t *abd = abd_alloc_struct(n);
+
+	abd->abd_flags = ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+	abd->abd_size = size;
+	abd->abd_parent = NULL;
+	zfs_refcount_create(&abd->abd_children);
+
+	abd->abd_u.abd_scatter.abd_offset = 0;
+	abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+	/* size_t index: avoids a signed/unsigned comparison with n. */
+	for (size_t i = 0; i < n; i++) {
+		void *c = abd_alloc_chunk();
+		ASSERT3P(c, !=, NULL);
+		abd->abd_u.abd_scatter.abd_chunks[i] = c;
+	}
+
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, size);
+	/* Waste is the unused tail of the last chunk. */
+	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+	    n * zfs_abd_chunk_size - size);
+
+	return (abd);
+}
+
+/*
+ * Free a scattered ABD: release every chunk, reverse the statistics
+ * recorded by abd_alloc(), and free the abd_t itself.
+ */
+static void
+abd_free_scatter(abd_t *abd)
+{
+	size_t n = abd_scatter_chunkcnt(abd);
+	/* size_t index: avoids a signed/unsigned comparison with n. */
+	for (size_t i = 0; i < n; i++) {
+		abd_free_chunk(abd->abd_u.abd_scatter.abd_chunks[i]);
+	}
+
+	zfs_refcount_destroy(&abd->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
+	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
+	    abd->abd_size - n * zfs_abd_chunk_size);
+
+	abd_free_struct(abd);
+}
+
+/*
+ * Allocate an ABD that must be linear, along with its own underlying data
+ * buffer. Only use this when it would be very annoying to write your ABD
+ * consumer with a scattered ABD.
+ */
+abd_t *
+abd_alloc_linear(size_t size, boolean_t is_metadata)
+{
+	abd_t *abd = abd_alloc_struct(0);
+
+	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+	abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+	abd->abd_size = size;
+	abd->abd_parent = NULL;
+	zfs_refcount_create(&abd->abd_children);
+
+	/* Metadata and data buffers come from different zio allocators. */
+	if (is_metadata) {
+		abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
+	} else {
+		abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
+	}
+
+	ABDSTAT_BUMP(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, size);
+
+	return (abd);
+}
+
+/*
+ * Free a linear ABD that owns its buffer: release the buffer through the
+ * allocator selected by ABD_FLAG_META, reverse the statistics, and free the
+ * abd_t itself.
+ */
+static void
+abd_free_linear(abd_t *abd)
+{
+	if (abd->abd_flags & ABD_FLAG_META) {
+		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+	} else {
+		zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
+	}
+
+	zfs_refcount_destroy(&abd->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+
+	abd_free_struct(abd);
+}
+
+/*
+ * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
+ * abd_alloc_linear().
+ */
+void
+abd_free(abd_t *abd)
+{
+	/* Freeing NULL is a no-op. */
+	if (abd == NULL)
+		return;
+
+	abd_verify(abd);
+	ASSERT3P(abd->abd_parent, ==, NULL);
+	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+	if (abd_is_linear(abd))
+		abd_free_linear(abd);
+	else
+		abd_free_scatter(abd);
+}
+
+/*
+ * Allocate an ABD of the same format (same metadata flag, same scatterize
+ * setting) as another ABD.
+ */
+abd_t *
+abd_alloc_sametype(abd_t *sabd, size_t size)
+{
+	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
+	if (abd_is_linear(sabd)) {
+		return (abd_alloc_linear(size, is_metadata));
+	} else {
+		return (abd_alloc(size, is_metadata));
+	}
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ *
+ * A size of 0 means "everything from off to the end of sabd".
+ */
+/* ARGSUSED */
+static inline abd_t *
+abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
+{
+	abd_t *abd;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	if (abd_is_linear(sabd)) {
+		abd = abd_alloc_struct(0);
+
+		/*
+		 * Even if this buf is filesystem metadata, we only track that
+		 * if we own the underlying data buffer, which is not true in
+		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+		 */
+		abd->abd_flags = ABD_FLAG_LINEAR;
+
+		abd->abd_u.abd_linear.abd_buf =
+		    (char *)sabd->abd_u.abd_linear.abd_buf + off;
+	} else {
+		/* Skip the whole chunks that lie entirely before new_offset. */
+		size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
+		size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+		    (new_offset / zfs_abd_chunk_size);
+
+		abd = abd_alloc_struct(chunkcnt);
+
+		/*
+		 * Even if this buf is filesystem metadata, we only track that
+		 * if we own the underlying data buffer, which is not true in
+		 * this case. Therefore, we don't ever use ABD_FLAG_META here.
+		 */
+		abd->abd_flags = 0;
+
+		abd->abd_u.abd_scatter.abd_offset =
+		    new_offset % zfs_abd_chunk_size;
+		abd->abd_u.abd_scatter.abd_chunk_size = zfs_abd_chunk_size;
+
+		/* Copy the scatterlist starting at the correct offset */
+		(void) memcpy(&abd->abd_u.abd_scatter.abd_chunks,
+		    &sabd->abd_u.abd_scatter.abd_chunks[new_offset /
+		    zfs_abd_chunk_size],
+		    chunkcnt * sizeof (void *));
+	}
+
+	if (size == 0)
+		abd->abd_size = sabd->abd_size - off;
+	else
+		abd->abd_size = size;
+	abd->abd_parent = sabd;
+	zfs_refcount_create(&abd->abd_children);
+	/* Charge the child's size to the parent; abd_put() removes it. */
+	(void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+
+	return (abd);
+}
+
+/* Offset ABD covering everything from 'off' to the end of sabd. */
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+
+	return (abd_get_offset_impl(sabd, off, 0));
+}
+
+/*
+ * Offset ABD covering 'size' bytes of sabd starting at 'off'.  Note that a
+ * size of 0 is passed through unchanged and therefore selects the rest of
+ * sabd.
+ */
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+	ASSERT3U(off + size, <=, sabd->abd_size);
+
+	return (abd_get_offset_impl(sabd, off, size));
+}
+
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+	abd_t *abd = abd_alloc_struct(0);
+
+	VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that if we
+	 * own the underlying data buffer, which is not true in this case.
+	 * Therefore, we don't ever use ABD_FLAG_META here.
+	 */
+	abd->abd_flags = ABD_FLAG_LINEAR;
+	abd->abd_size = size;
+	abd->abd_parent = NULL;
+	zfs_refcount_create(&abd->abd_children);
+
+	abd->abd_u.abd_linear.abd_buf = buf;
+
+	return (abd);
+}
+
+/*
+ * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
+ * free the underlying scatterlist or buffer.
+ */
+void
+abd_put(abd_t *abd)
+{
+	/* Putting NULL is a no-op. */
+	if (abd == NULL)
+		return;
+	abd_verify(abd);
+	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+
+	/* Drop the size this ABD charged to its parent, if it has one. */
+	if (abd->abd_parent != NULL) {
+		(void) zfs_refcount_remove_many(&abd->abd_parent->abd_children,
+		    abd->abd_size, abd);
+	}
+
+	zfs_refcount_destroy(&abd->abd_children);
+	abd_free_struct(abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+	ASSERT(abd_is_linear(abd));
+	abd_verify(abd);
+	return (abd->abd_u.abd_linear.abd_buf);
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+	void *buf;
+	abd_verify(abd);
+	ASSERT3U(abd->abd_size, >=, n);
+	if (abd_is_linear(abd)) {
+		buf = abd_to_buf(abd);
+	} else {
+		buf = zio_buf_alloc(n);
+	}
+	/* Record the loan on the ABD; abd_return_buf() removes it. */
+	(void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+
+	return (buf);
+}
+
+/*
+ * Like abd_borrow_buf(), but for a scattered ABD the first n bytes of the
+ * ABD are copied into the returned buffer.
+ */
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+	void *buf = abd_borrow_buf(abd, n);
+	if (!abd_is_linear(abd)) {
+		abd_copy_to_buf(buf, abd, n);
+	}
+	return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
+ */
+void
+abd_return_buf(abd_t *abd, void *buf, size_t n)
+{
+	abd_verify(abd);
+	ASSERT3U(abd->abd_size, >=, n);
+	if (abd_is_linear(abd)) {
+		ASSERT3P(buf, ==, abd_to_buf(abd));
+	} else {
+		ASSERT0(abd_cmp_buf(abd, buf, n));
+		zio_buf_free(buf, n);
+	}
+	(void) zfs_refcount_remove_many(&abd->abd_children, n, buf);
+}
+
+/*
+ * Like abd_return_buf(), but for a scattered ABD the first n bytes of buf
+ * are copied back into the ABD before the buffer is returned.
+ */
+void
+abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
+{
+	if (!abd_is_linear(abd)) {
+		abd_copy_from_buf(abd, buf, n);
+	}
+	abd_return_buf(abd, buf, n);
+}
+
+/*
+ * Give this ABD ownership of the buffer that it's storing. Can only be used on
+ * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
+ * with abd_alloc_linear() which subsequently released ownership of their buf
+ * with abd_release_ownership_of_buf().
+ */
+void
+abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
+	abd_verify(abd);
+
+	abd->abd_flags |= ABD_FLAG_OWNER;
+	if (is_metadata) {
+		abd->abd_flags |= ABD_FLAG_META;
+	}
+
+	/* An owned linear buffer is counted in the linear statistics. */
+	ABDSTAT_BUMP(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
+}
+
+/*
+ * Give up ownership of the buffer held by a linear ABD and stop counting it
+ * in the linear statistics.
+ */
+void
+abd_release_ownership_of_buf(abd_t *abd)
+{
+	ASSERT(abd_is_linear(abd));
+	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+	abd_verify(abd);
+
+	abd->abd_flags &= ~ABD_FLAG_OWNER;
+	/* Disable this flag since we no longer own the data buffer */
+	abd->abd_flags &= ~ABD_FLAG_META;
+
+	ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
+	ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
+}
+
+struct abd_iter {
+	abd_t		*iter_abd;	/* ABD being iterated through */
+	size_t		iter_pos;	/* position (relative to abd_offset) */
+	void		*iter_mapaddr;	/* addr corresponding to iter_pos */
+	size_t		iter_mapsize;	/* length of data valid at mapaddr */
+};
+
+/* Byte offset of the iterator's position within its current chunk. */
+static inline size_t
+abd_iter_scatter_chunk_offset(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+	    aiter->iter_pos) % zfs_abd_chunk_size);
+}
+
+/* Index into abd_chunks of the chunk holding the iterator's position. */
+static inline size_t
+abd_iter_scatter_chunk_index(struct abd_iter *aiter)
+{
+	ASSERT(!abd_is_linear(aiter->iter_abd));
+	return ((aiter->iter_abd->abd_u.abd_scatter.abd_offset +
+	    aiter->iter_pos) / zfs_abd_chunk_size);
+}
+
+/*
+ * Initialize the abd_iter.
+ */
+static void
+abd_iter_init(struct abd_iter *aiter, abd_t *abd)
+{
+	abd_verify(abd);
+	aiter->iter_abd = abd;
+	aiter->iter_pos = 0;
+	aiter->iter_mapaddr = NULL;
+	aiter->iter_mapsize = 0;
+}
+
+/*
+ * Advance the iterator by a certain amount. Cannot be called when a chunk is
+ * in use. This can be safely called when the aiter has already been
+ * exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_advance(struct abd_iter *aiter, size_t amount)
+{
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* There's nothing left to advance to, so do nothing */
+	if (aiter->iter_pos == aiter->iter_abd->abd_size)
+		return;
+
+	aiter->iter_pos += amount;
+}
+
+/*
+ * Map the current chunk into aiter. This can be safely called when the aiter
+ * has already been exhausted, in which case this does nothing.
+ */
+static void
+abd_iter_map(struct abd_iter *aiter)
+{
+	abd_t *abd = aiter->iter_abd;
+	void *base;
+	size_t skip = 0;
+
+	ASSERT3P(aiter->iter_mapaddr, ==, NULL);
+	ASSERT0(aiter->iter_mapsize);
+
+	/* Panic if someone has changed zfs_abd_chunk_size */
+	IMPLY(!abd_is_linear(abd), zfs_abd_chunk_size ==
+	    abd->abd_u.abd_scatter.abd_chunk_size);
+
+	/* An exhausted iterator has nothing to map. */
+	if (aiter->iter_pos == abd->abd_size)
+		return;
+
+	if (!abd_is_linear(abd)) {
+		/* Scatter: expose what is left of the current chunk. */
+		size_t chunk = abd_iter_scatter_chunk_index(aiter);
+
+		skip = abd_iter_scatter_chunk_offset(aiter);
+		aiter->iter_mapsize = MIN(zfs_abd_chunk_size - skip,
+		    abd->abd_size - aiter->iter_pos);
+		base = abd->abd_u.abd_scatter.abd_chunks[chunk];
+	} else {
+		/* Linear: everything from the current position onwards. */
+		skip = aiter->iter_pos;
+		aiter->iter_mapsize = abd->abd_size - skip;
+		base = abd->abd_u.abd_linear.abd_buf;
+	}
+	aiter->iter_mapaddr = (char *)base + skip;
+}
+
+/*
+ * Unmap the current chunk from aiter. This can be safely called when the aiter
+ * has already exhausted, in which case this does nothing.
+ */ +static void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (aiter->iter_pos == aiter->iter_abd->abd_size) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + while (size > 0) { + abd_iter_map(&aiter); + + size_t len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) 
+ */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. 
+ */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + abd_iter_init(&daiter, dabd); + abd_iter_init(&saiter, sabd); + abd_iter_advance(&daiter, doff); + abd_iter_advance(&saiter, soff); + + while (size > 0) { + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + size_t dlen = MIN(daiter.iter_mapsize, size); + size_t slen = MIN(saiter.iter_mapsize, size); + size_t len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + abd_iter_advance(&daiter, len); + abd_iter_advance(&saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the contents of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. 
Can be NULL (in this case @dsize = 0)
+ * @csize          number of parity bytes still to generate; the loop runs
+ *                 until it reaches zero
+ * @dsize          number of data bytes still available from @dabd; once it
+ *                 reaches zero the callback is invoked with dlen == 0
+ * @parity         number of entries of @cabds to walk (asserted <= 3)
+ * @func_raidz_gen should be implemented so that its behaviour
+ *                 is the same when taking linear and when taking scatter
+ *
+ * Each call passes the mapped parity addresses, the mapped data address,
+ * the length of this step and the data length of this step (len or 0).
+ * The whole loop runs between critical_enter() and critical_exit().
+ */
+void
+abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
+    ssize_t csize, ssize_t dsize, const unsigned parity,
+    void (*func_raidz_gen)(void **, const void *, size_t, size_t))
+{
+	int i;
+	ssize_t len, dlen;
+	struct abd_iter caiters[3];
+	struct abd_iter daiter = {0};	/* stays zeroed when dabd is NULL */
+	void *caddrs[3];
+
+	ASSERT3U(parity, <=, 3);
+
+	for (i = 0; i < parity; i++)
+		abd_iter_init(&caiters[i], cabds[i]);
+
+	if (dabd)
+		abd_iter_init(&daiter, dabd);
+
+	ASSERT3S(dsize, >=, 0);
+
+	critical_enter();
+	while (csize > 0) {
+		len = csize;
+
+		if (dabd && dsize > 0)
+			abd_iter_map(&daiter);
+
+		for (i = 0; i < parity; i++) {
+			abd_iter_map(&caiters[i]);
+			caddrs[i] = caiters[i].iter_mapaddr;
+		}
+
+		switch (parity) {	/* cases fall through on purpose */
+		case 3:
+			len = MIN(caiters[2].iter_mapsize, len);
+		case 2:
+			len = MIN(caiters[1].iter_mapsize, len);
+		case 1:
+			len = MIN(caiters[0].iter_mapsize, len);
+		}
+
+		/* must be progressive */
+		ASSERT3S(len, >, 0);
+
+		if (dabd && dsize > 0) {
+			/* this needs precise iter.length */
+			len = MIN(daiter.iter_mapsize, len);
+			dlen = len;
+		} else
+			dlen = 0;
+
+		/* must be progressive */
+		ASSERT3S(len, >, 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not multiple of 512 (raidz).
+		 */
+		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+		func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
+
+		/* unmap in the reverse of the order used for mapping */
+		for (i = parity-1; i >= 0; i--) {
+			abd_iter_unmap(&caiters[i]);
+			abd_iter_advance(&caiters[i], len);
+		}
+
+		if (dabd && dsize > 0) {
+			abd_iter_unmap(&daiter);
+			abd_iter_advance(&daiter, dlen);
+			dsize -= dlen;
+		}
+
+		csize -= len;
+
+		ASSERT3S(dsize, >=, 0);
+		ASSERT3S(csize, >=, 0);
+	}
+	critical_exit();
+}
+
+/*
+ * Iterate over code ABDs and data reconstruction target ABDs and call
+ * @func_raidz_rec. Function maps at most 6 pages atomically.
+ *
+ * @cabds          parity ABDs, must have equal size
+ * @tabds          rec target ABDs, at most 3
+ * @tsize          size of data target columns
+ * @parity         number of entries of @cabds and @tabds to walk
+ *                 (asserted <= 3)
+ * @func_raidz_rec expects syndrome data in target columns. Function
+ *                 reconstructs data and overwrites target columns.
+ * @mul            passed through unchanged to @func_raidz_rec
+ *
+ * The whole loop runs between critical_enter() and critical_exit().
+ */
+void
+abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
+    ssize_t tsize, const unsigned parity,
+    void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
+    const unsigned *mul),
+    const unsigned *mul)
+{
+	int i;
+	ssize_t len;
+	struct abd_iter citers[3];
+	struct abd_iter xiters[3];
+	void *caddrs[3], *xaddrs[3];
+
+	ASSERT3U(parity, <=, 3);
+
+	for (i = 0; i < parity; i++) {
+		abd_iter_init(&citers[i], cabds[i]);
+		abd_iter_init(&xiters[i], tabds[i]);
+	}
+
+	critical_enter();
+	while (tsize > 0) {
+
+		for (i = 0; i < parity; i++) {
+			abd_iter_map(&citers[i]);
+			abd_iter_map(&xiters[i]);
+			caddrs[i] = citers[i].iter_mapaddr;
+			xaddrs[i] = xiters[i].iter_mapaddr;
+		}
+
+		len = tsize;
+		switch (parity) {	/* cases fall through on purpose */
+		case 3:
+			len = MIN(xiters[2].iter_mapsize, len);
+			len = MIN(citers[2].iter_mapsize, len);
+		case 2:
+			len = MIN(xiters[1].iter_mapsize, len);
+			len = MIN(citers[1].iter_mapsize, len);
+		case 1:
+			len = MIN(xiters[0].iter_mapsize, len);
+			len = MIN(citers[0].iter_mapsize, len);
+		}
+		/* must be progressive */
+		ASSERT3S(len, >, 0);
+		/*
+		 * The iterated function likely will not do well if each
+		 * segment except the last one is not multiple of 512 (raidz).
+		 */
+		ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+		func_raidz_rec(xaddrs, len, caddrs, mul);
+
+		/* unmap in the reverse of the order used for mapping */
+		for (i = parity-1; i >= 0; i--) {
+			abd_iter_unmap(&xiters[i]);
+			abd_iter_unmap(&citers[i]);
+			abd_iter_advance(&xiters[i], len);
+			abd_iter_advance(&citers[i], len);
+		}
+
+		tsize -= len;
+		ASSERT3S(tsize, >=, 0);
+	}
+	critical_exit();
+}
diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c
new file mode 100644
index 000000000000..f0c2724471f9
--- /dev/null
+++ b/module/os/freebsd/zfs/arc_os.c
@@ -0,0 +1,245 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern struct vfsops zfs_vfsops;
+
+/* vmem_size typemask */
+#define	VMEM_ALLOC	0x01
+#define	VMEM_FREE	0x02
+#define	VMEM_MAXFREE	0x10
+typedef size_t		vmem_size_t;
+extern vmem_size_t vmem_size(vmem_t *vm, int typemask);
+
+uint_t zfs_arc_free_target = 0;
+
+int64_t last_free_memory;	/* result of the last arc_available_memory() */
+free_memory_reason_t last_free_reason;	/* which check produced that result */
+
+int64_t
+arc_available_memory(void)
+{
+	int64_t lowest = INT64_MAX;
+	int64_t n __unused;
+	free_memory_reason_t r = FMR_UNKNOWN;
+
+#ifdef _KERNEL
+	/*
+	 * Cooperate with pagedaemon when it's time for it to scan
+	 * and reclaim some pages.
+	 *
+	 * n is the number of bytes by which freemem exceeds
+	 * zfs_arc_free_target; it is negative when freemem is below the
+	 * target. Every check in this function keeps the smallest such
+	 * value in lowest and records which check produced it in r.
+	 */
+	n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_LOTSFREE;
+	}
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+	/*
+	 * If we're on an i386 platform, it's possible that we'll exhaust the
+	 * kernel heap space before we ever run out of available physical
+	 * memory.  Most checks of the size of the heap_area compare against
+	 * tune.t_minarmem, which is the minimum available real memory that we
+	 * can have in the system.  However, this is generally fixed at 25 pages
+	 * which is so low that it's useless.  In this comparison, we seek to
+	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
+	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
+	 * free)
+	 */
+	n = uma_avail() - (long)(uma_limit() / 4);
+	if (n < lowest) {
+		lowest = n;
+		r = FMR_HEAP_ARENA;
+	}
+#endif
+
+	/*
+	 * If zio data pages are being allocated out of a separate heap segment,
+	 * then enforce that the size of available vmem for this arena remains
+	 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
+	 *
+	 * Note that reducing the arc_zio_arena_free_shift keeps more virtual
+	 * memory (in the zio_arena) free, which can avoid memory
+	 * fragmentation issues.
+	 */
+	if (zio_arena != NULL) {
+		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+		    (vmem_size(zio_arena, VMEM_ALLOC) >>
+		    arc_zio_arena_free_shift);
+		if (n < lowest) {
+			lowest = n;
+			r = FMR_ZIO_ARENA;
+		}
+	}
+
+#else	/* _KERNEL */
+	/*
+	 * Outside the kernel, report a small deficit (-1024) whenever
+	 * spa_get_random(100) returns 0, presumably about one call in 100.
+	 */
+	if (spa_get_random(100) == 0)
+		lowest = -1024;
+#endif	/* _KERNEL */
+
+	last_free_memory = lowest;
+	last_free_reason = r;
+	DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+	return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ *
+ * The result is the larger of 5/8 of allmem and a second candidate: allmem
+ * minus 1 GiB when allmem is at least 1 GiB, otherwise min.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+	uint64_t size;
+
+	if (allmem >= 1 << 30)
+		size = allmem - (1 << 30);
+	else
+		size = min;
+	return (MAX(allmem * 5 / 8, size));
+}
+
+/*
+ * Task body dispatched by arc_prune_async(). arg points to a heap-allocated
+ * int64_t: the ARC target size is reduced by ptob() of that value, the
+ * allocation is freed, and vnlru_free() is then called with the same value
+ * for zfs_vfsops.
+ */
+static void
+arc_prune_task(void *arg)
+{
+	int64_t nr_scan = *(int64_t *)arg;
+
+	arc_reduce_target_size(ptob(nr_scan));
+	free(arg, M_TEMP);
+	vnlru_free(nr_scan, &zfs_vfsops);
+}
+
+/*
+ * Notify registered consumers they must drop holds on a portion of the ARC
+ * buffered they reference.  This provides a mechanism to ensure the ARC can
+ * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.  This
+ * is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread().  In this implementation the
+ * adjust value is copied into an M_NOWAIT allocation that is handed to
+ * arc_prune_task() through arc_prune_taskq; the task frees it.  If the
+ * allocation fails the request is dropped without being counted.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+
+	int64_t *adjustptr;
+
+	if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL)
+		return;
+
+	*adjustptr = adjust;
+	taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP);
+	ARCSTAT_BUMP(arcstat_prune);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+	return ((uint64_t)ptob(physmem));	/* physmem pages as bytes */
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	return (0);	/* arguments are ignored; always reports 0 */
+}
+
+uint64_t
+arc_free_memory(void)
+{
+	/* XXX not implemented here: always reports 0 */
+	return (0);
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+	int64_t free_memory, to_free;
+
+	arc_no_grow = B_TRUE;
+	arc_warm = B_TRUE;
+	arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+	free_memory = arc_available_memory();
+	/* arc_c >> arc_shrink_shift, plus any deficit (negative free_memory) */
+	to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+	DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+	arc_reduce_target_size(to_free);
+
+	mutex_enter(&arc_adjust_lock);
+	arc_adjust_needed = B_TRUE;
+	zthr_wakeup(arc_adjust_zthr);
+
+	/*
+	 * It is unsafe to block here in arbitrary threads, because we can come
+	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
+	 * with ARC reclaim thread.  Only a caller running as pageproc waits
+	 * on arc_adjust_waiters_cv.
+	 */
+	if (curproc == pageproc)
+		(void) cv_wait(&arc_adjust_waiters_cv, &arc_adjust_lock);
+	mutex_exit(&arc_adjust_lock);
+}
+
+void
+arc_lowmem_init(void)
+{
+	/* Register arc_lowmem() for vm_lowmem events and keep the tag. */
+	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+	    EVENTHANDLER_PRI_FIRST);
+
+}
+
+void
+arc_lowmem_fini(void)
+{
+	/* Nothing to undo when registration never stored a tag. */
+	if (arc_event_lowmem != NULL)
+		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+}
diff --git a/module/os/freebsd/zfs/crypto_os.c b/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 000000000000..cc86074c2648
--- /dev/null
+++ b/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,613 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek
+ * Copyright (c) 2018 Sean Eric Fagan
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#else +#include +#endif + +#include +#include +#include + +#include + +#define SHA512_HMAC_BLOCK_SIZE 128 + +static int crypt_sessions = 0; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD, + &crypt_sessions, 0, "Number of cryptographic sessions created"); + +void +crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key) +{ + uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE], + k_opad[SHA512_HMAC_BLOCK_SIZE], + key[SHA512_HMAC_BLOCK_SIZE]; + SHA512_CTX lctx; + int i; + size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length); + + /* + * This code is based on the similar code in geom/eli/g_eli_hmac.c + */ + explicit_bzero(key, sizeof (key)); + if (c_key->ck_length == 0) + /* do nothing */; + else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE) + bcopy(c_key->ck_data, key, cl_bytes); + else { + /* + * If key is longer than 128 bytes reset it to + * key = SHA512(key). + */ + SHA512_Init(&lctx); + SHA512_Update(&lctx, c_key->ck_data, cl_bytes); + SHA512_Final(key, &lctx); + } + + /* XOR key with ipad and opad values. */ + for (i = 0; i < sizeof (key); i++) { + k_ipad[i] = key[i] ^ 0x36; + k_opad[i] = key[i] ^ 0x5c; + } + explicit_bzero(key, sizeof (key)); + + /* Start inner SHA512. 
*/ + SHA512_Init(&ctx->innerctx); + SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad)); + explicit_bzero(k_ipad, sizeof (k_ipad)); + /* Start outer SHA512. */ + SHA512_Init(&ctx->outerctx); + SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad)); + explicit_bzero(k_opad, sizeof (k_opad)); +} + +void +crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize) +{ + SHA512_Update(&ctx->innerctx, data, datasize); +} + +void +crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize) +{ + uint8_t digest[SHA512_DIGEST_LENGTH]; + + /* Complete inner hash */ + SHA512_Final(digest, &ctx->innerctx); + + /* Complete outer hash */ + SHA512_Update(&ctx->outerctx, digest, sizeof (digest)); + SHA512_Final(digest, &ctx->outerctx); + + explicit_bzero(ctx, sizeof (*ctx)); + /* mdsize == 0 means "Give me the whole hash!" */ + if (mdsize == 0) + mdsize = SHA512_DIGEST_LENGTH; + bcopy(digest, md, mdsize); + explicit_bzero(digest, sizeof (digest)); +} + +void +crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size, + void *out_data, size_t out_data_size) +{ + struct hmac_ctx ctx; + + crypto_mac_init(&ctx, key); + crypto_mac_update(&ctx, in_data, in_data_size); + crypto_mac_final(&ctx, out_data, out_data_size); +} + +static int +freebsd_zfs_crypt_done(struct cryptop *crp) +{ + freebsd_crypt_session_t *ses; + + ses = crp->crp_opaque; + mtx_lock(&ses->fs_lock); + ses->fs_done = true; + mtx_unlock(&ses->fs_lock); + wakeup(crp); + return (0); +} + +void +freebsd_crypt_freesession(freebsd_crypt_session_t *sess) +{ + mtx_destroy(&sess->fs_lock); + crypto_freesession(sess->fs_sid); + explicit_bzero(sess, sizeof (*sess)); +} + +static int +zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp) +{ + int error; + + crp->crp_opaque = session; + crp->crp_callback = freebsd_zfs_crypt_done; + for (;;) { + error = crypto_dispatch(crp); + if (error) + break; + mtx_lock(&session->fs_lock); + while (session->fs_done == false) + 
msleep(crp, &session->fs_lock, PRIBIO, + "zfs_crypto", hz/5); + mtx_unlock(&session->fs_lock); + + if (crp->crp_etype != EAGAIN) { + error = crp->crp_etype; + break; + } + crp->crp_etype = 0; + crp->crp_flags &= ~CRYPTO_F_DONE; + session->fs_done = false; +#if __FreeBSD_version < 1300087 + /* + * Session ID changed, so we should record that, + * and try again + */ + session->fs_sid = crp->crp_session; +#endif + } + return (error); +} +static void +freebsd_crypt_uio_debug_log(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ +#ifdef FCRYPTO_DEBUG + struct cryptodesc *crd; + uint8_t *p = NULL; + size_t total = 0; + + printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, " + "%p, %u, %u)\n", + __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + data_uio, key->ck_format, key->ck_data, + (unsigned int)key->ck_length, + ivbuf, (unsigned int)datalen, (unsigned int)auth_len); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); + for (int i = 0; i < data_uio->uio_iovcnt; i++) { + printf("\tiovec #%d: <%p, %u>\n", i, + data_uio->uio_iov[i].iov_base, + (unsigned int)data_uio->uio_iov[i].iov_len); + total += data_uio->uio_iov[i].iov_len; + } + data_uio->uio_resid = total; +#endif +} +/* + * Create a new cryptographic session. This should + * happen every time the key changes (including when + * it's first loaded). 
+ */ +#if __FreeBSD_version >= 1300087 +int +freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *c_info, crypto_key_t *key) +{ + struct crypto_session_params csp; + int error = 0; + +#ifdef FCRYPTO_DEBUG + printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n", + __FUNCTION__, sessp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + key->ck_format, key->ck_data, (unsigned int)key->ck_length); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); +#endif + bzero(&csp, sizeof (csp)); + csp.csp_mode = CSP_MODE_AEAD; + csp.csp_cipher_key = key->ck_data; + csp.csp_cipher_klen = key->ck_length / 8; + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16; + csp.csp_ivlen = AES_GCM_IV_LEN; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + case AES_192_GMAC_KEY_LEN: + case AES_256_GMAC_KEY_LEN: + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + csp.csp_cipher_alg = CRYPTO_AES_CCM_16; + csp.csp_ivlen = AES_CCM_IV_LEN; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + case AES_192_CBC_MAC_KEY_LEN: + case AES_256_CBC_MAC_KEY_LEN: + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } + error = crypto_newsession(&sessp->fs_sid, &csp, + CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); + mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", + NULL, MTX_DEF); + crypt_sessions++; +bad: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + return (error); +} + +int +freebsd_crypt_uio(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ + struct 
cryptop *crp; + freebsd_crypt_session_t *session = NULL; + int error = 0; + size_t total = 0; + + freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, + key, ivbuf, datalen, auth_len); + for (int i = 0; i < data_uio->uio_iovcnt; i++) + total += data_uio->uio_iov[i].iov_len; + data_uio->uio_resid = total; + if (input_sessionp == NULL) { + session = kmem_zalloc(sizeof (*session), KM_SLEEP); + error = freebsd_crypt_newsession(session, c_info, key); + if (error) + goto out; + } else + session = input_sessionp; + + crp = crypto_getreq(session->fs_sid, M_WAITOK); + if (encrypt) { + crp->crp_op = CRYPTO_OP_ENCRYPT | + CRYPTO_OP_COMPUTE_DIGEST; + } else { + crp->crp_op = CRYPTO_OP_DECRYPT | + CRYPTO_OP_VERIFY_DIGEST; + } + crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE; + crp->crp_buf_type = CRYPTO_BUF_UIO; + crp->crp_uio = (void*)data_uio; + crp->crp_ilen = data_uio->uio_resid; + + crp->crp_aad_start = 0; + crp->crp_aad_length = auth_len; + crp->crp_payload_start = auth_len; + crp->crp_payload_length = datalen; + crp->crp_digest_start = auth_len + datalen; + + bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN); + error = zfs_crypto_dispatch(session, crp); + crypto_freereq(crp); +out: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + if (input_sessionp == NULL) { + freebsd_crypt_freesession(session); + kmem_free(session, sizeof (*session)); + } + return (error); +} + +#else +int +freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *c_info, crypto_key_t *key) +{ + struct cryptoini cria, crie, *crip; + struct enc_xform *xform; + struct auth_hash *xauth; + int error = 0; + crypto_session_t sid; + +#ifdef FCRYPTO_DEBUG + printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n", + __FUNCTION__, sessp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + key->ck_format, key->ck_data, (unsigned int)key->ck_length); + printf("\tkey = { 
"); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); +#endif + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + xform = &enc_xform_aes_nist_gcm; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_128; + break; + case AES_192_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_192; + break; + case AES_256_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_256; + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + xform = &enc_xform_ccm; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_128; + break; + case AES_192_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_192; + break; + case AES_256_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_256; + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " + "auth %s (key length %d)\n", + __FUNCTION__, __LINE__, + xform->name, (unsigned int)key->ck_length, + (unsigned int)key->ck_length/8, + xauth->name, xauth->keysize); +#endif + + bzero(&crie, sizeof (crie)); + bzero(&cria, sizeof (cria)); + + crie.cri_alg = xform->type; + crie.cri_key = key->ck_data; + crie.cri_klen = key->ck_length; + + cria.cri_alg = xauth->type; + cria.cri_key = key->ck_data; + cria.cri_klen = key->ck_length; + + cria.cri_next = &crie; + crie.cri_next = NULL; + crip = &cria; + // Everything else is bzero'd + + error = crypto_newsession(&sid, crip, + CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); + if (error != 0) { + printf("%s(%d): crypto_newsession failed with %d\n", + __FUNCTION__, __LINE__, error); + goto bad; + } + sessp->fs_sid = sid; + mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", + NULL, MTX_DEF); + crypt_sessions++; +bad: + return (error); +} + +/* + * The meat of 
encryption/decryption. + * If sessp is NULL, then it will create a + * temporary cryptographic session, and release + * it when done. + */ +int +freebsd_crypt_uio(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ + struct cryptop *crp; + struct cryptodesc *enc_desc, *auth_desc; + struct enc_xform *xform; + struct auth_hash *xauth; + freebsd_crypt_session_t *session = NULL; + int error; + + freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, + key, ivbuf, datalen, auth_len); + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + xform = &enc_xform_aes_nist_gcm; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_128; + break; + case AES_192_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_192; + break; + case AES_256_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_256; + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + xform = &enc_xform_ccm; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_128; + break; + case AES_192_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_192; + break; + case AES_256_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_256; + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } + +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " + "auth %s (key length %d)\n", + __FUNCTION__, __LINE__, + xform->name, (unsigned int)key->ck_length, + (unsigned int)key->ck_length/8, + xauth->name, xauth->keysize); +#endif + + if (input_sessionp == NULL) { + session = kmem_zalloc(sizeof (*session), KM_SLEEP); + error = freebsd_crypt_newsession(session, c_info, key); + if (error) + goto out; + } else + session = input_sessionp; + + crp = crypto_getreq(2); + if (crp == NULL) { + error 
= ENOMEM; + goto bad; + } + + auth_desc = crp->crp_desc; + enc_desc = auth_desc->crd_next; + + crp->crp_session = session->fs_sid; + crp->crp_ilen = auth_len + datalen; + crp->crp_buf = (void*)data_uio; + crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC; + + auth_desc->crd_skip = 0; + auth_desc->crd_len = auth_len; + auth_desc->crd_inject = auth_len + datalen; + auth_desc->crd_alg = xauth->type; +#ifdef FCRYPTO_DEBUG + printf("%s: auth: skip = %u, len = %u, inject = %u\n", + __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len, + auth_desc->crd_inject); +#endif + + enc_desc->crd_skip = auth_len; + enc_desc->crd_len = datalen; + enc_desc->crd_inject = auth_len; + enc_desc->crd_alg = xform->type; + enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; + bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN); + enc_desc->crd_next = NULL; + +#ifdef FCRYPTO_DEBUG + printf("%s: enc: skip = %u, len = %u, inject = %u\n", + __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len, + enc_desc->crd_inject); +#endif + + if (encrypt) + enc_desc->crd_flags |= CRD_F_ENCRYPT; + + error = zfs_crypto_dispatch(session, crp); + crypto_freereq(crp); +out: + if (input_sessionp == NULL) { + freebsd_crypt_freesession(session); + kmem_free(session, sizeof (*session)); + } +bad: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + return (error); +} +#endif diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c new file mode 100644 index 000000000000..268c843e50ee --- /dev/null +++ b/module/os/freebsd/zfs/dmu_os.c @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifndef IDX_TO_OFF +#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) +#endif + +#if __FreeBSD_version < 1300051 +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY +#else +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY +#endif + + +#if __FreeBSD_version < 1300072 +#define dmu_page_lock(m) vm_page_lock(m) +#define dmu_page_unlock(m) vm_page_unlock(m) +#else +#define dmu_page_lock(m) +#define dmu_page_unlock(m) +#endif + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + 
return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + vm_page_t *ma, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + struct sf_buf *sf; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(ptoa((*ma)->pindex), ==, + db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*ma, &sf); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(sf); + ma += 1; + bufoff += PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +int +dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size) +{ + struct sf_buf *sf; + vm_object_t vmobj; + vm_page_t m; + dmu_buf_t **dbp; + dmu_buf_t *db; + caddr_t va; + int numbufs, i; + int bufoff, pgoff, tocpy; + int mi, di; + int err; + + ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); + ASSERT(last_size <= PAGE_SIZE); + + err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), + IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); + if (err != 0) + return (err); + +#ifdef DEBUG + 
IMPLY(last_size < PAGE_SIZE, *rahead == 0); + if (dbp[0]->db_offset != 0 || numbufs > 1) { + for (i = 0; i < numbufs; i++) { + ASSERT(ISP2(dbp[i]->db_size)); + ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); + ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); + } + } +#endif + + vmobj = ma[0]->object; + zfs_vmobject_wlock(vmobj); + + db = dbp[0]; + for (i = 0; i < *rbehind; i++) { + m = vm_page_grab(vmobj, ma[0]->pindex - 1 - i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, PAGESIZE); + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rbehind = i; + + bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; + pgoff = 0; + for (mi = 0, di = 0; mi < count && di < numbufs; ) { + if (pgoff == 0) { + m = ma[mi]; + if (m != bogus_page) { + vm_page_assert_xbusied(m); + ASSERT(vm_page_none_valid(m)); + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + va = zfs_map_page(m, &sf); + } + } + if (bufoff == 0) + db = dbp[di]; + + if (m != bogus_page) { + ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, + db->db_offset + bufoff); + } + + /* + * We do not need to clamp the copy size by the file + * size as the last block is zero-filled beyond the + * end of file anyway. 
+ */ + tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); + if (m != bogus_page) + bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); + + pgoff += tocpy; + ASSERT(pgoff <= PAGESIZE); + if (pgoff == PAGESIZE) { + if (m != bogus_page) { + zfs_unmap_page(sf); + vm_page_valid(m); + } + ASSERT(mi < count); + mi++; + pgoff = 0; + } + + bufoff += tocpy; + ASSERT(bufoff <= db->db_size); + if (bufoff == db->db_size) { + ASSERT(di < numbufs); + di++; + bufoff = 0; + } + } + +#ifdef DEBUG + /* + * Three possibilities: + * - last requested page ends at a buffer boundary and , thus, + * all pages and buffers have been iterated; + * - all requested pages are filled, but the last buffer + * has not been exhausted; + * the read-ahead is possible only in this case; + * - all buffers have been read, but the last page has not been + * fully filled; + * this is only possible if the file has only a single buffer + * with a size that is not a multiple of the page size. + */ + if (mi == count) { + ASSERT(di >= numbufs - 1); + IMPLY(*rahead != 0, di == numbufs - 1); + IMPLY(*rahead != 0, bufoff != 0); + ASSERT(pgoff == 0); + } + if (di == numbufs) { + ASSERT(mi >= count - 1); + ASSERT(*rahead == 0); + IMPLY(pgoff == 0, mi == count); + if (pgoff != 0) { + ASSERT(mi == count - 1); + ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); + } + } +#endif + if (pgoff != 0) { + ASSERT(m != bogus_page); + bzero(va + pgoff, PAGESIZE - pgoff); + zfs_unmap_page(sf); + vm_page_valid(m); + } + + for (i = 0; i < *rahead; i++) { + m = vm_page_grab(vmobj, ma[count - 1]->pindex + 1 + i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + tocpy = MIN(db->db_size - bufoff, PAGESIZE); + va = zfs_map_page(m, &sf); + 
bcopy((char *)db->db_data + bufoff, va, tocpy); + if (tocpy < PAGESIZE) { + ASSERT(i == *rahead - 1); + ASSERT((db->db_size & PAGE_MASK) != 0); + bzero(va + tocpy, PAGESIZE - tocpy); + } + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rahead = i; + zfs_vmobject_wunlock(vmobj); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} diff --git a/module/os/freebsd/zfs/hkdf.c b/module/os/freebsd/zfs/hkdf.c new file mode 100644 index 000000000000..8324ff2319b6 --- /dev/null +++ b/module/os/freebsd/zfs/hkdf.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include + +static int +hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, + uint_t km_len, uint8_t *out_buf) +{ + crypto_key_t key; + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(salt_len); + key.ck_data = salt; + + crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH); + + return (0); +} + +static int +hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, + uint8_t *out_buf, uint_t out_len) +{ + struct hmac_ctx ctx; + crypto_key_t key; + uint_t i, T_len = 0, pos = 0; + uint8_t c; + uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH; + uint8_t T[SHA512_DIGEST_LENGTH]; + + if (N > 255) + return (SET_ERROR(EINVAL)); + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); + key.ck_data = extract_key; + + for (i = 1; i <= N; i++) { + c = i; + + crypto_mac_init(&ctx, &key); + crypto_mac_update(&ctx, T, T_len); + crypto_mac_update(&ctx, info, info_len); + crypto_mac_update(&ctx, &c, 1); + crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH); + bcopy(T, out_buf + pos, + (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos)); + pos += SHA512_DIGEST_LENGTH; + } + + return (0); +} + +/* + * HKDF is designed to be a relatively fast function for deriving keys from a + * master key + a salt. We use this function to generate new encryption keys + * so as to avoid hitting the cryptographic limits of the underlying + * encryption modes. Note that, for the sake of deriving encryption keys, the + * info parameter is called the "salt" everywhere else in the code. 
+ */ +int +hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len) +{ + int ret; + uint8_t extract_key[SHA512_DIGEST_LENGTH]; + + ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len, + extract_key); + if (ret != 0) + return (ret); + + ret = hkdf_sha512_expand(extract_key, info, info_len, output_key, + out_len); + if (ret != 0) + return (ret); + + return (0); +} diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c new file mode 100644 index 000000000000..10807afa3d31 --- /dev/null +++ b/module/os/freebsd/zfs/kmod_core.c @@ -0,0 +1,404 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_comutil.h" + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_DECL(_vfs_zfs_vdev); + + +static int zfs_version_ioctl = ZFS_IOCVER_ZOF; +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, + 0, "ZFS_IOCTL_VERSION"); + +static struct cdev *zfsdev; + +extern void zfs_init(void); +extern void zfs_fini(void); +extern void zfs_ioctl_init(void); + + +static struct root_hold_token *zfs_root_token; + +extern uint_t rrw_tsd_key; +extern uint_t zfs_allow_log_key; +extern uint_t zfs_geom_probe_vdev_key; + +static int zfs__init(void); +static int zfs__fini(void); +static void zfs_shutdown(void *, int); + +static eventhandler_tag zfs_shutdown_event_tag; +extern zfsdev_state_t 
*zfsdev_state_list; + +#define ZFS_MIN_KSTACK_PAGES 4 + +static void +zfs_cmd_bsd12_to_zof(zfs_cmd_legacy_t *src, zfs_cmd_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + memcpy(&dst->zc_begin_record, &src->zc_begin_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_begin_record)); + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_zoneid = src->zc_jailid; +} + +static void +zfs_cmd_zof_to_bsd12(zfs_cmd_t *src, zfs_cmd_legacy_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + memcpy(&dst->zc_begin_record, &src->zc_begin_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_begin_record)); + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_jailid = src->zc_zoneid; +} + +static int +zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, + struct thread *td) +{ + uint_t len, vecnum; + zfs_iocparm_t *zp; + zfs_cmd_t *zc; + zfs_cmd_legacy_t *zcl; + int rc, error; + void *uaddr; + + len = IOCPARM_LEN(zcmd); + vecnum = zcmd & 0xff; + zp = (void *)arg; + uaddr = (void *)zp->zfs_cmd; + error = 0; + zcl = NULL; + + if (len != sizeof (zfs_iocparm_t)) { + printf("len %d vecnum: %d sizeof (zfs_cmd_t) %lu\n", + len, vecnum, sizeof (zfs_cmd_t)); + return (EINVAL); + } + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + /* + * Remap ioctl code for legacy user binaries + */ + if (zp->zfs_ioctl_version == ZFS_IOCVER_FREEBSD) { + if (vecnum >= sizeof (zfs_ioctl_bsd12_to_zof)/sizeof (long)) { + kmem_free(zc, sizeof (zfs_cmd_t)); + return (ENOTSUP); + } + zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); + vecnum = zfs_ioctl_bsd12_to_zof[vecnum]; + if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + 
zfs_cmd_bsd12_to_zof(zcl, zc); + } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + error = zfsdev_ioctl_common(vecnum, zc); + if (zcl) { + zfs_cmd_zof_to_bsd12(zc, zcl); + rc = copyout(zcl, uaddr, sizeof (*zcl)); + } else { + rc = copyout(zc, uaddr, sizeof (*zc)); + } + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); +out: + if (zcl) + kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +static void +zfsdev_close(void *data) +{ + zfsdev_state_t *zs, *zsp = data; + + mutex_enter(&zfsdev_state_lock); + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs == zsp) + break; + } + if (zs == NULL || zs->zs_minor <= 0) { + mutex_exit(&zfsdev_state_lock); + return; + } + zs->zs_minor = -1; + zfs_onexit_destroy(zs->zs_onexit); + zfs_zevent_destroy(zs->zs_zevent); + mutex_exit(&zfsdev_state_lock); +} + +static int +zfs_ctldev_init(struct cdev *devp) +{ + boolean_t newzs = B_FALSE; + minor_t minor; + zfsdev_state_t *zs, *zsprev = NULL; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == -1) + break; + zsprev = zs; + } + + if (!zs) { + zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); + newzs = B_TRUE; + } + + devfs_set_cdevpriv(zs, zfsdev_close); + + zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); + zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); + + if (newzs) { + zs->zs_minor = minor; + wmb(); + zsprev->zs_next = zs; + } else { + wmb(); + zs->zs_minor = minor; + } + return (0); +} + +static int +zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfs_ctldev_init(devp); + mutex_exit(&zfsdev_state_lock); + + return (error); +} + +static struct cdevsw zfs_cdevsw = { + .d_version = D_VERSION, + .d_open = 
zfsdev_open, + .d_ioctl = zfsdev_ioctl, + .d_name = ZFS_DRIVER +}; + +int +zfsdev_attach(void) +{ + zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, + ZFS_DRIVER); + return (0); +} + +void +zfsdev_detach(void) +{ + if (zfsdev != NULL) + destroy_dev(zfsdev); +} + +int +zfs__init(void) +{ + int error; + +#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES + printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " + "overflow panic!\nPlease consider adding " + "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, + ZFS_MIN_KSTACK_PAGES); +#endif + zfs_root_token = root_mount_hold("ZFS"); + if ((error = zfs_kmod_init()) != 0) { + printf("ZFS: Failed to Load ZFS Filesystem" + ", rc = %d\n", error); + root_mount_rel(zfs_root_token); + return (error); + } + + + tsd_create(&zfs_geom_probe_vdev_key, NULL); + + printf("ZFS storage pool version: features support (" + SPA_VERSION_STRING ")\n"); + root_mount_rel(zfs_root_token); + ddi_sysevent_init(); + return (0); +} + +int +zfs__fini(void) +{ + if (zfs_busy() || zvol_busy() || + zio_injection_enabled) { + return (EBUSY); + } + zfs_kmod_fini(); + tsd_destroy(&zfs_geom_probe_vdev_key); + return (0); +} + +static void +zfs_shutdown(void *arg __unused, int howto __unused) +{ + + /* + * ZFS fini routines can not properly work in a panic-ed system. 
+ */ + if (panicstr == NULL) + zfs__fini(); +} + + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = zfs__init(); + if (err == 0) + zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( + shutdown_post_sync, zfs_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + return (err); + case MOD_UNLOAD: + err = zfs__fini(); + if (err == 0 && zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + return (err); + case MOD_SHUTDOWN: + return (0); + default: + break; + } + return (EOPNOTSUPP); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; + +#ifdef _KERNEL +EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); +#endif + +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); +MODULE_VERSION(zfsctrl, 1); +MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); +MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); +MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); +MODULE_DEPEND(zfsctrl, cryptodev, 1, 1, 1); diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c new file mode 100644 index 000000000000..ed124a5faf8b --- /dev/null +++ b/module/os/freebsd/zfs/spa_os.c @@ -0,0 +1,280 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t **configs, **tops; + nvlist_t *config; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; + uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; + + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) + return (NULL); + + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, + &txg) == 0); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + + nchildren = 1; + nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + 
VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + nvlist_dup(nvtop, &tops[i], KM_SLEEP); + } + for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, + holes[i]) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, + 0) == 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, + i) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, + 0) == 0); + } + + /* + * Create pool config based on the best vdev config. + */ + nvlist_dup(best_cfg, &config, KM_SLEEP); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + tops, nchildren) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + + /* + * Drop vdev config elements that should not be present at pool level. 
+ */ + nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); + nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); + + for (i = 0; i < count; i++) + nvlist_free(configs[i]); + kmem_free(configs, count * sizeof (void *)); + for (i = 0; i < nchildren; i++) + nvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof (void *)); + nvlist_free(nvroot); + return (config); +} + +int +spa_import_rootpool(const char *name) +{ + spa_t *spa; + vdev_t *rvd; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); + + mutex_enter(&spa_namespace_lock); + if (config != NULL) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) + == 0); + + if ((spa = spa_lookup(pname)) != NULL) { + /* + * The pool could already be imported, + * e.g., after reboot -r. + */ + if (spa->spa_state == POOL_STATE_ACTIVE) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + return (0); + } + + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + + /* + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). 
+ */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); + } + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (0); +} + +const char * +spa_history_zone(void) +{ + return ("freebsd"); +} diff --git a/module/os/freebsd/zfs/spa_stats.c b/module/os/freebsd/zfs/spa_stats.c new file mode 100644 index 000000000000..45c880ada24d --- /dev/null +++ b/module/os/freebsd/zfs/spa_stats.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +void +spa_stats_init(spa_t *spa) +{ + +} + +void +spa_stats_destroy(spa_t *spa) +{ + +} + +void +spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed) +{ +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) +{ +} + +void +spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) +{ + +} +/* + * Set txg state completion time and increment current state. 
+ */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + return (0); +} + +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + return (NULL); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + +} + +void +spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, + int error) +{ + +} + +int +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, + hrtime_t duration) +{ + return (0); +} + +int +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) +{ + return (0); +} diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c new file mode 100644 index 000000000000..ea9c1b3f1f16 --- /dev/null +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -0,0 +1,699 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include <../zfs_config.h> + +/* BEGIN CSTYLED */ +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS events"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "space allocation"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "reconstruct"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "multihost protection"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "metaslab group"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, 
metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "lua"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "l2arc"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS Adaptive Replacement Cache"); + +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV Mirror"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "livelist state"); +SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, "condense knobs"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "receive knobs"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "send knobs"); + +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, + (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); + +extern arc_state_t ARC_anon; +extern arc_state_t ARC_mru; +extern arc_state_t ARC_mru_ghost; +extern arc_state_t ARC_mfu; +extern arc_state_t ARC_mfu_ghost; +extern arc_state_t ARC_l2c_only; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ + +/* arc.c */ + +/* legacy compat */ +extern unsigned long l2arc_write_max; /* def max write size */ +extern unsigned long l2arc_write_boost; /* extra warmup write */ +extern unsigned long l2arc_headroom; /* # of dev writes */ +extern unsigned long l2arc_headroom_boost; +extern unsigned long l2arc_feed_secs; /* interval seconds */ +extern unsigned long l2arc_feed_min_ms; /* min interval msecs */ +extern int l2arc_noprefetch; /* don't cache prefetch bufs */ +extern int l2arc_feed_again; /* turbo 
warmup */
+extern int l2arc_norw;			/* no reads during writes */
+
+/*
+ * Legacy L2ARC knobs, exported read-write directly under vfs.zfs.
+ */
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW,
+    &l2arc_write_max, 0, "max write size (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW,
+    &l2arc_write_boost, 0, "extra write during warmup (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW,
+    &l2arc_headroom, 0, "number of dev writes (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW,
+    &l2arc_feed_secs, 0, "interval seconds (LEGACY)");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW,
+    &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW,
+    &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW,
+    &l2arc_feed_again, 0, "turbo warmup (LEGACY)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW,
+    &l2arc_norw, 0, "no reads during writes (LEGACY)");
+#if 0
+extern int zfs_compressed_arc_enabled;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW,
+    &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)");
+#endif
+
+/*
+ * Read-only views of the per-state ARC counters.  Each state exports the
+ * rc_count of its arcs_size plus the rc_count of its metadata and data
+ * arcs_esize[] entries; the description of every OID names the state and
+ * the buffer class it reports.
+ */
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
+    &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+    &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in anonymous state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
+    &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+    &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mru ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
+    &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+    &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+    "size of metadata in mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+    &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+    "size of data in mfu ghost state");
+
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
+    &ARC_l2c_only.arcs_size.rc_count, 0, "size of l2c_only state");
+
+extern int arc_no_grow_shift;
+extern int arc_shrink_shift;
+
+extern arc_stats_t arc_stats;
+#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
+#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
+#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
+#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
+#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
+#define	arc_no_grow
ARCSTAT(arcstat_no_grow) /* do not grow cache size */ +#define arc_tempreserve ARCSTAT(arcstat_tempreserve) +#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +#define arc_dnode_limit ARCSTAT(arcstat_dnode_limit) /* max size for dnodes */ +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ +#define arc_need_free ARCSTAT(arcstat_need_free) /* bytes to be freed */ +#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ + +static int +sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) +{ + uint32_t val; + int err; + + val = arc_no_grow_shift; + err = sysctl_handle_32(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val >= arc_shrink_shift) + return (EINVAL); + + arc_no_grow_shift = val; + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, + 0, sizeof (uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", + "log2(fraction of ARC which must be free to allow growing)"); +/* dbuf.c */ + + +/* dmu.c */ + +/* dmu_zfetch.c */ +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH"); + +/* max bytes to prefetch per stream (default 8MB) */ +extern uint32_t zfetch_max_distance; +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); + +/* max bytes to prefetch indirects for per stream (default 64MB) */ +extern uint32_t zfetch_max_idistance; +SYSCTL_UINT(_vfs_zfs_prefetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, + &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); + +/* dsl_pool.c */ + +/* dnode.c */ +extern int zfs_default_bs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, + &zfs_default_bs, 0, "Default dnode block shift"); + +extern int zfs_default_ibs; 
+SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, + &zfs_default_ibs, 0, "Default dnode indirect block shift"); + + +/* dsl_scan.c */ + +/* metaslab.c */ + +/* + * Enable/disable lba weighting (i.e. outer tracks are given preference). + */ +extern boolean_t metaslab_lba_weighting_enabled; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting, CTLFLAG_RWTUN, + &metaslab_lba_weighting_enabled, 0, + "Enable LBA weighting (i.e. outer tracks are given preference)"); + + +/* + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. So a sane default for the space map block size + * is 8~16K. + */ +extern int zfs_metaslab_sm_blksz_no_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_no_log, 0, + "Block size for space map in pools with log space map disabled. " + "Power of 2 and greater than 4096."); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +extern int zfs_metaslab_sm_blksz_with_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_with_log, 0, + "Block size for space map in pools with log space map enabled. " + "Power of 2 and greater than 4096."); + +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. 
+ */ +extern int zfs_condense_pct; +SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, + &zfs_condense_pct, 0, + "Condense on-disk spacemap when it is more than this many percents" + " of in-memory counterpart"); + +extern int zfs_remove_max_segment; +SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, + &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to" + " allocate when removing a device"); + +extern int zfs_removal_suspend_progress; +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, + &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while" + " in the middle of a removal"); + + +/* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). + */ +extern uint64_t metaslab_df_alloc_threshold; +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, + &metaslab_df_alloc_threshold, 0, + "Minimum size which forces the dynamic allocator to change it's allocation strategy"); + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +extern int metaslab_df_free_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, + &metaslab_df_free_pct, 0, + "The minimum free space, in percent, which must be available in a " + "space map to continue allocations in a first-fit fashion"); + +/* + * Percentage of all cpus that can be used by the metaslab taskq. 
+ */ +extern int metaslab_load_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, + &metaslab_load_pct, 0, + "Percentage of cpus that can be used by the metaslab taskq"); + +/* + * Max number of metaslabs per group to preload. + */ +extern int metaslab_preload_limit; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, + &metaslab_preload_limit, 0, + "Max number of metaslabs per group to preload"); + +/* refcount.c */ +extern int reference_tracking_enable; +SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, + &reference_tracking_enable, 0, + "Track reference holders to refcount_t objects, used mostly by ZFS"); + +/* spa.c */ +extern int zfs_ccw_retry_interval; +SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, + &zfs_ccw_retry_interval, 0, + "Configuration cache file write, retry after failure, interval (seconds)"); + +extern uint64_t zfs_max_missing_tvds_cachefile; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_cachefile, 0, + "allow importing pools with missing top-level vdevs in cache file"); + +extern uint64_t zfs_max_missing_tvds_scan; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_scan, 0, + "allow importing pools with missing top-level vdevs during scan"); + +/* spa_misc.c */ +extern int zfs_flags; +static int +sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = zfs_flags; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + /* + * ZFS_DEBUG_MODIFY must be enabled prior to boot so all + * arc buffers in the system have the necessary additional + * checksum data. However, it is safe to disable at any + * time. 
+	 */
+	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+		val &= ~ZFS_DEBUG_MODIFY;
+	zfs_flags = val;
+
+	return (0);
+}
+
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+    CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0,
+    sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+
+/*
+ * Sysctl handler for zfs_deadman_synctime_ms.  A read reports the current
+ * value; a successful write stores the new value and pushes it, converted
+ * to nanoseconds, to spa_set_deadman_synctime().
+ */
+int
+param_set_deadman_synctime(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long val;
+	int err;
+
+	val = zfs_deadman_synctime_ms;
+	err = sysctl_handle_long(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	zfs_deadman_synctime_ms = val;
+
+	spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms));
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for zfs_deadman_ziotime_ms.  Mirrors
+ * param_set_deadman_synctime(): a read reports the current value; a
+ * successful write stores the new value and pushes it, converted to
+ * nanoseconds, to spa_set_deadman_ziotime().
+ */
+int
+param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS)
+{
+	unsigned long val;
+	int err;
+
+	val = zfs_deadman_ziotime_ms;
+	err = sysctl_handle_long(oidp, &val, 0, req);
+	if (err != 0 || req->newptr == NULL)
+		return (err);
+	zfs_deadman_ziotime_ms = val;
+
+	/* Propagate the zio timeout just stored, not the sync timeout. */
+	spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms));
+
+	return (0);
+}
+
+/*
+ * Sysctl handler for zfs_deadman_failmode.  A write that differs from the
+ * current mode repoints zfs_deadman_failmode at the matching string
+ * constant ("wait", "continue" or "panic") and then returns the negated
+ * result of param_set_deadman_failmode_common().
+ *
+ * NOTE(review): buf is only initialized when req->newptr is NULL; on a
+ * write it is handed to sysctl_handle_string() as-is -- confirm the old
+ * value is never copied out to the caller in that case.
+ */
+int
+param_set_deadman_failmode(SYSCTL_HANDLER_ARGS)
+{
+	char buf[16];
+	int rc;
+
+	if (req->newptr == NULL)
+		strlcpy(buf, zfs_deadman_failmode, sizeof (buf));
+
+	rc = sysctl_handle_string(oidp, buf, sizeof (buf), req);
+	if (rc || req->newptr == NULL)
+		return (rc);
+	if (strcmp(buf, zfs_deadman_failmode) == 0)
+		return (0);
+	if (!strcmp(buf, "wait"))
+		zfs_deadman_failmode = "wait";
+	if (!strcmp(buf, "continue"))
+		zfs_deadman_failmode = "continue";
+	if (!strcmp(buf, "panic"))
+		zfs_deadman_failmode = "panic";
+
+	return (-param_set_deadman_failmode_common(buf));
+}
+
+
+/* spacemap.c */
+extern int space_map_ibs;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+    &space_map_ibs, 0, "Space map indirect block shift");
+
+
+/* vdev.c */
+#ifdef notyet
+extern uint64_t zfs_max_auto_ashift;
+extern uint64_t zfs_min_auto_ashift;
+
+static int
+sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t val;
+	int err;
+
+	val = zfs_max_auto_ashift;
+	err = sysctl_handle_64(oidp, &val,
0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > ASHIFT_MAX || val < zfs_min_auto_ashift) + return (EINVAL); + + zfs_max_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), + sysctl_vfs_zfs_max_auto_ashift, "QU", + "Max ashift used when optimising for logical -> physical sectors size on " + "new top-level vdevs."); +static int +sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_min_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < ASHIFT_MIN || val > zfs_max_auto_ashift) + return (EINVAL); + + zfs_min_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint64_t), + sysctl_vfs_zfs_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdevs."); +#endif + +/* + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. + */ +extern int zfs_vdev_dtl_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_dtl_sm_blksz, 0, + "Block size for DTL space map. Power of 2 and greater than 4096."); + +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. + */ +extern int zfs_vdev_standard_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_standard_sm_blksz, 0, + "Block size for standard space map. 
Power of 2 and greater than 4096."); + +extern int vdev_validate_skip; +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, + &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + + +/* vdev_cache.c */ + +/* vdev_mirror.c */ +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + + +/* vdev_queue.c */ +#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ +extern uint32_t zfs_vdev_ ## name ## _min_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ + &zfs_vdev_ ## name ## _min_active, 0, \ + "Initial number of I/O requests of type " #name \ + " active for each device"); + +#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ +extern uint32_t zfs_vdev_ ## name ## _max_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \ + &zfs_vdev_ ## name ## _max_active, 0, \ + "Maximum number of I/O requests of type " #name \ + " active for each device"); + + +#undef ZFS_VDEV_QUEUE_KNOB + +extern uint32_t zfs_vdev_max_active; +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, + &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device. 
(LEGACY)"); + +extern int zfs_vdev_def_queue_depth; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, + &zfs_vdev_def_queue_depth, 0, + "Default queue depth for each allocator"); + +/*extern uint64_t zfs_multihost_history; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN, + &zfs_multihost_history, 0, + "Historical staticists for the last N multihost updates");*/ + +#ifdef notyet +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); +#endif + + +/* zio.c */ +#if defined(__LP64__) +int zio_use_uma = 1; +#else +int zio_use_uma = 0; +#endif + +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, + "Use uma(9) for ZIO allocations"); +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, + "Exclude metadata buffers from dumps as well"); + + +int +param_set_arc_long(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_arc_int(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_int(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_slop_shift(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = *(int *)arg1; + + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 1 || val > 31) + return (EINVAL); + + *(int *)arg1 = val; + + return (0); +} diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c new file mode 100644 index 000000000000..01851378e717 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_file.c @@ -0,0 +1,326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution 
License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), + minclsyspri, max_ncpus, INT_MAX, 0); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int error; + + /* + * Rotational optimizations only make sense on block devices. 
+ */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + + error = zfs_file_open(vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. 
+	 */
+	if (zfs_file_getattr(fp, &zfa)) {
+		return (SET_ERROR(ENODEV));
+	}
+	if (!S_ISREG(zfa.zfa_mode)) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (SET_ERROR(ENODEV));
+	}
+#endif
+
+skip_open:
+
+	error = zfs_file_getattr(vf->vf_file, &zfa);
+	if (error) {
+		vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
+		return (error);
+	}
+
+	*max_psize = *psize = zfa.zfa_size;
+	*ashift = SPA_MINBLOCKSHIFT;
+
+	return (0);
+}
+
+/*
+ * Release the per-vdev file state: close the backing file if one is open,
+ * free the vdev_file_t and clear vdev_tsd.  Nothing is done while a reopen
+ * is in progress or when no state was ever allocated.
+ */
+static void
+vdev_file_close(vdev_t *vd)
+{
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if (vd->vdev_reopening || vf == NULL)
+		return;
+
+	if (vf->vf_file != NULL) {
+		zfs_file_close(vf->vf_file);
+	}
+
+	vd->vdev_delayed_close = B_FALSE;
+	kmem_free(vf, sizeof (vdev_file_t));
+	vd->vdev_tsd = NULL;
+}
+
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
+{
+	zio_delay_interrupt(zio);
+}
+
+/*
+ * Taskq callback that performs the read or write described by a zio.
+ * The data is moved through a linear buffer borrowed from the zio's abd,
+ * the outcome is recorded in zio->io_error, and the zio is then handed to
+ * the interrupt side.
+ */
+static void
+vdev_file_io_strategy(void *arg)
+{
+	zio_t *zio = arg;
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf;
+	void *buf;
+	ssize_t resid;
+	loff_t off;
+	ssize_t size;
+	int err;
+
+	off = zio->io_offset;
+	size = zio->io_size;
+	resid = 0;
+
+	vf = vd->vdev_tsd;
+
+	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+	if (zio->io_type == ZIO_TYPE_READ) {
+		buf = abd_borrow_buf(zio->io_abd, zio->io_size);
+		err = zfs_file_pread(vf->vf_file, buf, size, off, &resid);
+		abd_return_buf_copy(zio->io_abd, buf, size);
+	} else {
+		buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size);
+		err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid);
+		abd_return_buf(zio->io_abd, buf, size);
+	}
+	/*
+	 * Report a failed read/write to the zio pipeline; without this a
+	 * failing file I/O would complete as if it had succeeded.  A short
+	 * transfer with no explicit error is reported as ENOSPC.
+	 */
+	zio->io_error = err;
+	if (resid != 0 && zio->io_error == 0)
+		zio->io_error = SET_ERROR(ENOSPC);
+
+	vdev_file_io_intr(zio);
+}
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+	vdev_t *vd = zio->io_vd;
+	vdev_file_t *vf = vd->vdev_tsd;
+
+	if
(zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC|O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { +#ifdef notyet + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); + + /* XXX FreeBSD has no fallocate routine in file ops */ + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); +#endif + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. 
+ */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c new file mode 100644 index 000000000000..d87bbbc18157 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -0,0 +1,1195 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Portions Copyright (c) 2012 Martin Matuska + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for GEOM. 
+ */ + +static g_attrchanged_t vdev_geom_attrchanged; +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, +}; + +struct consumer_vdev_elem { + SLIST_ENTRY(consumer_vdev_elem) elems; + vdev_t *vd; +}; + +SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); +/* BEGIN CSTYLED */ +_Static_assert(sizeof (((struct g_consumer *)NULL)->private) + == sizeof (struct consumer_priv_t*), + "consumer_priv_t* can't be stored in g_consumer.private"); + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ +static int vdev_geom_bio_flush_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); +/* END CSTYLED */ + +/* Declare local functions */ +static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); + +/* + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. 
+ */ +uint_t zfs_geom_probe_vdev_key; +/* + * Refresh vd->vdev_physpath from the consumer's "GEOM::physpath" attribute. + * A config update is requested if the path changed, or, when do_null_update + * is set, if no path was recorded before. If the attribute cannot be read + * the vdev is left untouched. + */ +static void +vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, + boolean_t do_null_update) +{ + boolean_t needs_update = B_FALSE; + char *physpath; + int error, physpath_len; + + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + if (error == 0) { + char *old_physpath; + + /* g_topology lock ensures that vdev has not been closed */ + g_topology_assert(); + old_physpath = vd->vdev_physpath; + vd->vdev_physpath = spa_strdup(physpath); + + if (old_physpath != NULL) { + needs_update = (strcmp(old_physpath, + vd->vdev_physpath) != 0); + spa_strfree(old_physpath); + } else + needs_update = do_null_update; + } + g_free(physpath); + + /* + * If the physical path changed, update the config. + * Only request an update for previously unset physpaths if + * requested by the caller. + */ + if (needs_update) + spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); + +} +/* + * GEOM attrchanged callback. Only "GEOM::physpath" is acted upon, and the + * function returns after updating the first vdev on the consumer's list. + */ +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vdev_t *vd = elem->vd; + if (strcmp(attr, "GEOM::physpath") == 0) { + vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); + return; + } + } +} +/* + * GEOM resize callback: request an expanding online of every healthy vdev + * on this consumer whose pool has autoexpand set. + */ +static void +vdev_geom_resize(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + spa_t *spa; + vdev_t *vd; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vd = elem->vd; + if (vd->vdev_state != VDEV_STATE_HEALTHY) + continue; + spa = vd->vdev_spa; + if (!spa->spa_autoexpand) + continue; + vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); + } +} +/* + * GEOM orphan callback: mark every vdev on this consumer as wanting removal + * and ask the SPA to remove it asynchronously. + */ +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + struct
consumer_priv_t *priv; + // cppcheck-suppress uninitvar + struct consumer_vdev_elem *elem; + + g_topology_assert(); + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + /* Vdev close in progress. Ignore the event. */ + return; + + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + // cppcheck-suppress All + SLIST_FOREACH(elem, priv, elems) { + // cppcheck-suppress uninitvar + vdev_t *vd = elem->vd; + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + } +} +/* + * Attach to provider pp through the "zfs::vdev" geom, creating the geom and + * the consumer when they do not exist yet, and open it with + * g_access(cp, 1, 0, 1). With sanity set, providers whose sector size or + * media size is unusable are rejected. If vd is not NULL the consumer is + * recorded in vd->vdev_tsd. Returns the consumer, or NULL on failure. + * The GEOM topology lock must be held. + */ +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + + if (sanity) { + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible sectorsize %d\n", + pp->name, pp->sectorsize); + return (NULL); + } else if (pp->mediasize < SPA_MINDEVSIZE) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible mediasize %ju\n", + pp->name, pp->mediasize); + return (NULL); + } + } + + /* Do we have geom already? No? Create one.
*/ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + if (strcmp(gp->name, "zfs::vdev") != 0) + continue; + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + gp->attrchanged = vdev_geom_attrchanged; + gp->resize = vdev_geom_resize; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + return (NULL); + } + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + + if (vd != NULL) + vd->vdev_tsd = cp; + + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + return (cp); +} +/* + * Undo vdev_geom_attach(). When open_for_read is set the (1, 0, 1) access + * is dropped first; the consumer is destroyed once its read and exclusive + * counts are both zero, and the geom is withered when no consumer is left. + * The GEOM topology lock must be held. + */ +static void +vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) +{ + struct g_geom *gp; + + g_topology_assert(); + + ZFS_LOG(1, "Detaching from %s.", + cp->provider &&
cp->provider->name ? cp->provider->name : "NULL"); + + gp = cp->geom; + if (open_for_read) + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + if (cp->provider != NULL) { + ZFS_LOG(1, "Destroying consumer for %s.", + cp->provider->name ? cp->provider->name : "NULL"); + g_detach(cp); + } + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} +/* + * Close vd's consumer with the GEOM topology lock held: clear + * vd->vdev_tsd, unlink vd from the consumer's vdev list and detach from + * the consumer. Does nothing beyond clearing vdev_delayed_close when the + * vdev has no consumer. + */ +static void +vdev_geom_close_locked(vdev_t *vd) +{ + struct g_consumer *cp; + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem, *elem_temp; + + g_topology_assert(); + + cp = vd->vdev_tsd; + vd->vdev_delayed_close = B_FALSE; + if (cp == NULL) + return; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); + priv = (struct consumer_priv_t *)&cp->private; + vd->vdev_tsd = NULL; + SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { + if (elem->vd == vd) { + SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); + g_free(elem); + } + } + + vdev_geom_detach(cp, B_TRUE); +} + +/* + * Issue one or more bios to the vdev in parallel and wait for all of them. + * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO + * operation is described by parallel entries from each array. There may be + * more bios actually issued than entries in the array, because an operation + * larger than maxio is split into several bios. errors[i] ends up non-zero + * if any bio of operation i failed; it is a 0/1 flag, not an errno. + */ +static void +vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, + off_t *sizes, int *errors, int ncmds) +{ + struct bio **bios; + uint8_t *p; + off_t off, maxio, s, end; + int i, n_bios, j; + size_t bios_size; + + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); + n_bios = 0; + + /* How many bios are required for all commands ?
*/ + for (i = 0; i < ncmds; i++) + n_bios += (sizes[i] + maxio - 1) / maxio; + + /* Allocate memory for the bios */ + bios_size = n_bios * sizeof (struct bio *); + bios = kmem_zalloc(bios_size, KM_SLEEP); + + /* Prepare and issue all of the bios */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + p = datas[i]; + s = sizes[i]; + end = off + s; + ASSERT((off % cp->provider->sectorsize) == 0); + ASSERT((s % cp->provider->sectorsize) == 0); + + for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { + bios[j] = g_alloc_bio(); + bios[j]->bio_cmd = cmds[i]; + bios[j]->bio_done = NULL; + bios[j]->bio_offset = off; + bios[j]->bio_length = MIN(s, maxio); + bios[j]->bio_data = (caddr_t)p; + g_io_request(bios[j], cp); + } + } + ASSERT(j == n_bios); + + /* Wait for all of the bios to complete, and clean them up */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + s = sizes[i]; + end = off + s; + + for (; off < end; off += maxio, s -= maxio, j++) { + errors[i] = biowait(bios[j], "vdev_geom_io") || + errors[i]; + g_destroy_bio(bios[j]); + } + } + kmem_free(bios, bios_size); +} + +/* + * Read the vdev config from a device. Return the number of valid labels that + * were found. The vdev config will be returned in *configp if and only if at + * least one valid label was found.
+ */ +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) +{ + struct g_provider *pp; + nvlist_t *config; + vdev_phys_t *vdev_lists[VDEV_LABELS]; + char *buf; + size_t buflen; + uint64_t psize, state, txg; + off_t offsets[VDEV_LABELS]; + off_t size; + off_t sizes[VDEV_LABELS]; + int cmds[VDEV_LABELS]; + int errors[VDEV_LABELS]; + int l, nlabels; + + g_topology_assert_not(); + + pp = cp->provider; + ZFS_LOG(1, "Reading config from %s...", pp->name); + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + + size = sizeof (*vdev_lists[0]) + pp->sectorsize - + ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; + + buflen = sizeof (vdev_lists[0]->vp_nvlist); + + /* Create all of the IO requests */ + for (l = 0; l < VDEV_LABELS; l++) { + cmds[l] = BIO_READ; + vdev_lists[l] = kmem_alloc(size, KM_SLEEP); + offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + sizes[l] = size; + errors[l] = 0; + ASSERT(offsets[l] % pp->sectorsize == 0); + } + + /* Issue the IO requests */ + vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, + VDEV_LABELS); + + /* Parse the labels */ + config = *configp = NULL; + nlabels = 0; + for (l = 0; l < VDEV_LABELS; l++) { + if (errors[l] != 0) + continue; + + buf = vdev_lists[l]->vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(config); + continue; + } + + if (state != POOL_STATE_SPARE && + state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(config); + continue; + } + + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + nlabels++; + } + + /* Free the label storage */ + for (l = 0; l < VDEV_LABELS; l++) + kmem_free(vdev_lists[l], size); + + return (nlabels); +} +/* + * Grow the *configs array so that index id is valid. Existing entries are + * carried over, new slots are NULL, and *count becomes id + 1. Nothing is + * done when id is already below *count. + */ +static void +resize_configs(nvlist_t ***configs, uint64_t
*count, uint64_t id) +{ + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), + KM_SLEEP); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof (void *)); + *configs = new_configs; + *count = id + 1; +} +/* + * File cfg in (*configs)[id], where id is the ZPOOL_CONFIG_ID of its vdev + * tree, provided it names pool 'name' and carries the first pool guid seen + * (*known_pool_guid, set here when still 0). A config already stored for + * that id is replaced only by one with a larger txg. cfg is consumed: it is + * either stored or freed. + */ +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t *known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid; + uint64_t id, txg, known_txg; + char *pname; + + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; + + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + VERIFY(nvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); +} +/* + * Taste every provider outside the ZFS vdev class and collect the label + * configs belonging to pool 'name' into *configs (see + * process_vdev_config()). The topology lock is dropped around each label + * read. Returns 0 if *count ends up non-zero, ENOENT otherwise. + */ +int +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t pool_guid; + int nlabels; + + DROP_GIANT(); + g_topology_lock(); + + *configs = NULL; + *count = 0; + pool_guid = 0; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) +
continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + zcp = vdev_geom_attach(pp, NULL, B_TRUE); + if (zcp == NULL) + continue; + g_topology_unlock(); + nlabels = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_detach(zcp, B_TRUE); + if (nlabels == 0) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); + } + } + } + g_topology_unlock(); + PICKUP_GIANT(); + + return (*count > 0 ? 0 : ENOENT); +} +/* + * Quality of a provider's label match, ordered so that a larger value is a + * better match. vdev_attach_ok() returns ZERO_MATCH + nlabels for a vdev + * guid match, so the *_MATCH values must stay consecutive. + */ +enum match { + NO_MATCH = 0, /* No matching labels found */ + TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ + ZERO_MATCH = 1, /* Base for ZERO_MATCH + nlabels; never returned */ + ONE_MATCH = 2, /* 1 label matching the vdev_guid */ + TWO_MATCH = 3, /* 2 labels matching the vdev_guid */ + THREE_MATCH = 4, /* 3 labels matching the vdev_guid */ + FULL_MATCH = 5 /* all labels match the vdev_guid */ +}; +/* + * Taste provider pp and grade how well its label matches vd. Called with + * the topology lock held; the lock is dropped while the labels are read. + * Returns NO_MATCH when the provider cannot be attached or read, or when + * the pool or vdev guid does not match. + */ +static enum match +vdev_attach_ok(vdev_t *vd, struct g_provider *pp) +{ + nvlist_t *config; + uint64_t pool_guid, top_guid, vdev_guid; + struct g_consumer *cp; + int nlabels; + + cp = vdev_geom_attach(pp, NULL, B_TRUE); + if (cp == NULL) { + ZFS_LOG(1, "Unable to attach tasting instance to %s.", + pp->name); + return (NO_MATCH); + } + g_topology_unlock(); + nlabels = vdev_geom_read_config(cp, &config); + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + if (nlabels == 0) { + ZFS_LOG(1, "Unable to read config from %s.", pp->name); + return (NO_MATCH); + } + + pool_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); + top_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); + vdev_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + nvlist_free(config); + + /* + * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label. + */ + if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { + ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", + pp->name, + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); + return (NO_MATCH); + } + + /* + * Check that the label's vdev guid matches the desired guid. + * The second condition handles possible race on vdev detach, when + * remaining vdev receives GUID of destroyed top level mirror vdev. + */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (ZERO_MATCH + nlabels); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOPGUID_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); +} +/* + * Grade every provider outside the ZFS vdev class with vdev_attach_ok() and + * attach to the best one. Among equally good candidates the provider whose + * name equals vd->vdev_path (without "/dev/") wins; the scan stops early on + * a FULL_MATCH. Returns the attached consumer, or NULL. + */ +static struct g_consumer * +vdev_geom_attach_by_guids(vdev_t *vd) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp, *best_pp; + struct g_consumer *cp; + const char *vdpath; + enum match match, best_match; + + g_topology_assert(); + + vdpath = vd->vdev_path + sizeof ("/dev/") - 1; + cp = NULL; + best_pp = NULL; + best_match = NO_MATCH; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + match = vdev_attach_ok(vd, pp); + if (match > best_match) { + best_match = match; + best_pp = pp; + } else if (match == best_match) { + if (strcmp(pp->name, vdpath) == 0) { + best_pp = pp; + } + } + if (match == FULL_MATCH) + goto out; + } + } + } + +out: + if (best_pp) { + cp = vdev_geom_attach(best_pp, vd, B_TRUE); + if (cp == NULL) { + printf("ZFS WARNING: Unable to attach to %s.\n", + best_pp->name); + } + } + return (cp); +} + +static struct g_consumer *
+vdev_geom_open_by_guids(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + g_topology_assert(); + + ZFS_LOG(1, "Searching by guids [%ju:%ju].", + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); + cp = vdev_geom_attach_by_guids(vd); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid, cp->provider->name); + } else { + ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid); + } + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_path(vdev_t *vd, int check_guid) +{ + struct g_provider *pp; + struct g_consumer *cp; + + g_topology_assert(); + + cp = NULL; + pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); + if (pp != NULL) { + ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) + cp = vdev_geom_attach(pp, vd, B_FALSE); + } + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift) +{ + struct g_provider *pp; + struct g_consumer *cp; + int error, has_trim; + uint16_t rate; + + /* + * Set the TLS to indicate downstack that we + * should not access zvols + */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. 
+ */ + if ((cp = vd->vdev_tsd) != NULL) { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + + DROP_GIANT(); + g_topology_lock(); + error = 0; + + if (vd->vdev_spa->spa_is_splitting || + ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && + (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { + /* + * We are dealing with a vdev that hasn't been previously + * opened (since boot), and we are not loading an + * existing pool configuration. This looks like a + * vdev add operation to a new or existing pool. + * Assume the user knows what he/she is doing and find + * GEOM provider by its name, ignoring GUID mismatches. + * + * XXPOLICY: It would be safer to only allow a device + * that is unlabeled or labeled but missing + * GUID information to be opened in this fashion, + * unless we are doing a split, in which case we + * should allow any guid. + */ + cp = vdev_geom_open_by_path(vd, 0); + } else { + /* + * Try using the recorded path for this device, but only + * accept it if its label data contains the expected GUIDs. + */ + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected GUIDs. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right GUIDs. 
+ */ + cp = vdev_geom_open_by_guids(vd); + } + } + + /* Clear the TLS now that tasting is done */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); + + if (cp == NULL) { + ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); + error = ENOENT; + } else { + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + int spamode; + + priv = (struct consumer_priv_t *)&cp->private; + if (cp->private == NULL) + SLIST_INIT(priv); + elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); + elem->vd = vd; + SLIST_INSERT_HEAD(priv, elem, elems); + + spamode = spa_mode(vd->vdev_spa); + if (cp->provider->sectorsize > VDEV_PAD_SIZE || + !ISP2(cp->provider->sectorsize)) { + ZFS_LOG(1, "Provider %s has unsupported sectorsize.", + cp->provider->name); + + vdev_geom_close_locked(vd); + error = EINVAL; + cp = NULL; + } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { + int i; + + for (i = 0; i < 5; i++) { + error = g_access(cp, 0, 1, 0); + if (error == 0) + break; + g_topology_unlock(); + tsleep(vd, 0, "vdev", hz / 2); + g_topology_lock(); + } + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for " + "writing (error=%d).\n", + cp->provider->name, error); + vdev_geom_close_locked(vd); + cp = NULL; + } + } + } + + /* Fetch initial physical path information for this device. */ + if (cp != NULL) { + vdev_geom_attrchanged(cp, "GEOM::physpath"); + + /* Set other GEOM characteristics */ + vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); + } + + g_topology_unlock(); + PICKUP_GIANT(); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", + error); + return (error); + } +skip_open: + pp = cp->provider; + + /* + * Determine the actual size of the device. + */ + *max_psize = *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size and preferred + * transfer size. 
+ */ + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; +#ifdef notyet + if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) && + pp->stripesize <= (1 << ASHIFT_MAX) && pp->stripeoffset == 0) + *physical_ashift = highbit(pp->stripesize) - 1; +#endif + /* + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Inform the ZIO pipeline that we are non-rotational. */ + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0 && rate == DISK_RR_NON_ROTATING) + vd->vdev_nonrot = B_TRUE; + else + vd->vdev_nonrot = B_FALSE; + + /* Set when device reports it supports TRIM. */ + error = g_getattr("GEOM::candelete", cp, &has_trim); + vd->vdev_has_trim = (error == 0 && has_trim); + + /* Set when device reports it supports secure TRIM. */ + /* unavailable on FreeBSD */ + vd->vdev_has_securetrim = B_FALSE; + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + struct g_consumer *cp; + + cp = vd->vdev_tsd; + + DROP_GIANT(); + g_topology_lock(); + + if (!vd->vdev_reopening || + (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || + (cp->provider != NULL && cp->provider->error != 0)))) + vdev_geom_close_locked(vd); + + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_t *vd; + zio_t *zio; + + zio = bp->bio_caller1; + vd = zio->io_vd; + zio->io_error = bp->bio_error; + if (zio->io_error == 0 && bp->bio_resid != 0) + zio->io_error = SET_ERROR(EIO); + + switch (zio->io_error) { + case ENOTSUP: + /* + * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know + * that future attempts will never succeed. In this case + * we set a persistent flag so that we don't bother with + * requests in the future. 
+ */ + switch (bp->bio_cmd) { + case BIO_FLUSH: + vd->vdev_nowritecache = B_TRUE; + break; + case BIO_DELETE: + break; + } + break; + case ENXIO: + if (!vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, + SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } + break; + } + + /* + * We have to split bio freeing into two parts, because the ABD code + * cannot be called in this context and vdev_op_io_done is not called + * for ZIO_TYPE_IOCTL zio-s. + */ + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + g_destroy_bio(bp); + zio->io_bio = NULL; + } + zio_delay_interrupt(zio); +} +/* + * Start a zio on the vdev's GEOM consumer. Cache-flush ioctls and TRIMs + * that are disabled by tunable, and ioctls that are unsupported, are + * completed through zio_execute() without issuing a bio. Everything else is + * translated into a bio whose completion callback is vdev_geom_io_intr(). + * A vdev without a consumer fails the zio with ENXIO. + */ +static void +vdev_geom_io_start(zio_t *zio) +{ + vdev_t *vd; + struct g_consumer *cp; + struct bio *bp; + + vd = zio->io_vd; + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } else { + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + if (zfs_nocacheflush || + vdev_geom_bio_flush_disable) + break; + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + goto sendreq; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + } + + zio_execute(zio); + return; + case ZIO_TYPE_TRIM: + if (!vdev_geom_bio_delete_disable) { + goto sendreq; + } + zio_execute(zio); + return; + default: + ; + /* PASSTHROUGH --- placate compiler */ + } +sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_TRIM || + zio->io_type == ZIO_TYPE_IOCTL); + + cp = vd->vdev_tsd; + if (cp == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: +
zio->io_target_timestamp = zio_handle_io_delay(zio); + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + if (zio->io_type == ZIO_TYPE_READ) { + bp->bio_cmd = BIO_READ; + bp->bio_data = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->bio_cmd = BIO_WRITE; + bp->bio_data = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + break; + case ZIO_TYPE_TRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + default: + panic("invalid zio->io_type: %d\n", zio->io_type); + } + bp->bio_done = vdev_geom_io_intr; + zio->io_bio = bp; + + g_io_request(bp, cp); +} +/* + * Second half of bio teardown for reads and writes: give the buffer + * borrowed in vdev_geom_io_start() back to the ABD (abd_return_buf_copy() + * for reads, abd_return_buf() for writes) and destroy the bio. For other + * zio types the bio was already destroyed in vdev_geom_io_intr(). + */ +static void +vdev_geom_io_done(zio_t *zio) +{ + struct bio *bp = zio->io_bio; + + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + ASSERT(bp == NULL); + return; + } + + if (bp == NULL) { + ASSERT3S(zio->io_error, ==, ENXIO); + return; + } + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); + else + abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); + + g_destroy_bio(bp); + zio->io_bio = NULL; +} +/* Hold callback: intentionally empty for GEOM vdevs. */ +static void +vdev_geom_hold(vdev_t *vd) +{ +} +/* Release callback: intentionally empty for GEOM vdevs. */ +static void +vdev_geom_rele(vdev_t *vd) +{ +} +/* + * Positional initializer: the entries must stay in vdev_ops_t member + * order. In the kernel the disk vdev type is backed by GEOM. + */ +vdev_ops_t vdev_disk_ops = { + vdev_geom_open, + vdev_geom_close, + vdev_default_asize, + vdev_geom_io_start, + vdev_geom_io_done, + NULL, + NULL, + vdev_geom_hold, + vdev_geom_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c new file mode 100644 index 000000000000..e734a2af8370 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this
file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Write size bytes from buf, zero-padded to VDEV_PAD_SIZE, into the vl_pad2 + * region of the label that vdev_label_write() addresses with index 0. + * + * Returns EINVAL if size exceeds VDEV_PAD_SIZE, ENODEV if vd is not a leaf + * vdev, ENXIO if vd is dead, otherwise the result of the write. A failed + * write is retried exactly once with ZIO_FLAG_TRYHARD added. + */ +int +vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) +{ + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + spa_t *spa = vd->vdev_spa; + abd_t *padbuf; + int err; + + /* Reject oversized payloads and vdevs that cannot be written. */ + if (size > VDEV_PAD_SIZE) + return (EINVAL); + if (!vd->vdev_ops->vdev_op_leaf) + return (ENODEV); + if (vdev_is_dead(vd)) + return (ENXIO); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* Stage the payload in a zero-filled, pad-sized buffer. */ + padbuf = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(padbuf, VDEV_PAD_SIZE); + abd_copy_from_buf(padbuf, buf, size); + + /* One normal attempt, then at most one more with TRYHARD set. */ + for (;;) { + zio_t *root = zio_root(spa, NULL, NULL, flags); + + vdev_label_write(root, vd, 0, padbuf, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); + err = zio_wait(root); + if (err == 0 || (flags & ZIO_FLAG_TRYHARD) != 0) + break; + flags |= ZIO_FLAG_TRYHARD; + } + + abd_free(padbuf); + return (err); +} diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c new file mode 100644 index 000000000000..c11e16437501 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -0,0
+1,2738 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS 
(V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t 
*)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + 
case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL versions in order to determine if the file used to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. + */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_is_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * After upgrade the SA_ZPL_ZNODE_ACL attribute should have + * been removed, so ENOENT is the only acceptable failure. + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs.
+ */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_is_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed.
+ */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + 
entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if (aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = 
aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof 
(ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * Convert old ACL format to new. + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * every time.
+ */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + 
if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
+ */ +int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize; + int acl_count; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int 
+zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + + if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); +} + +/* + * Common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * It recomputes the mode from aclp, frees any cached ACL, and queues the ACL + * for sa_bulk_update() in either the SA or the legacy znode ACL layout. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format.
+ */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? 
+ */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = 
(void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or everyone@ + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later OR-ed into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode.
+ * Applies when the "aclmode" property is set to + * "groupmask". + */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? 
+ */ +static int +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = (vtype == VDIR); + boolean_t isreg = (vtype == VREG); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(vtype, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + 
aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + if ((flag & IS_ROOT_NODE) == 0) { + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + } else + ASSERT(dzp->z_vnode == NULL); + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. + */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + if (dzp->z_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = 
zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } else { + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); +#ifdef __FreeBSD_kernel__ + gid = acl_ids->z_fgid = dzp->z_gid; +#else + gid = crgetgid(cr); +#endif + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set in on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY0(zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if 
(ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + 
count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error 
= zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. 
+ */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. 
+ */
+static int
+zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode)
+{
+	if ((v4_mode & WRITE_MASK) &&
+	    (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) &&
+	    (!IS_DEVVP(ZTOV(zp)) ||
+	    (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) {
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common().
+	 */
+	if ((v4_mode & WRITE_MASK_DATA) &&
+	    (zp->z_pflags & ZFS_IMMUTABLE)) {
+		return (SET_ERROR(EPERM));
+	}
+
+	/*
+	 * In FreeBSD we allow modifying a directory's contents if ZFS_NOUNLINK
+	 * (sunlnk) is set. We just don't allow directory removal, which is
+	 * handled in zfs_zaccess_delete().
+	 */
+	if ((v4_mode & ACE_DELETE) &&
+	    (zp->z_pflags & ZFS_NOUNLINK)) {
+		return (SET_ERROR(EPERM));
+	}
+	/* ZFS_AV_QUARANTINED files can be neither read nor executed. */
+	if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) &&
+	    (zp->z_pflags & ZFS_AV_QUARANTINED))) {
+		return (SET_ERROR(EACCES));
+	}
+
+	return (0);
+}
+
+/*
+ * The primary usage of this function is to loop through all of the
+ * ACEs in the znode, determining what accesses of interest (AoI) to
+ * the caller are allowed or denied. The AoI are expressed as bits in
+ * the working_mode parameter. As each ACE is processed, bits covered
+ * by that ACE are removed from the working_mode. This removal
+ * facilitates two things. The first is that when the working mode is
+ * empty (= 0), we know we've looked at all the AoI. The second is
+ * that the ACE interpretation rules don't allow a later ACE to undo
+ * something granted or denied by an earlier ACE. Removing the
+ * discovered access or denial enforces this rule. At the end of
+ * processing the ACEs, all AoI that were found to be denied are
+ * placed into the working_mode, giving the caller a mask of denied
+ * accesses. Returns:
+ *	0		if all AoI granted
+ *	EACCES		if the denied mask is non-zero
+ *	other error	if abnormal failure (e.g., IO error)
+ *
+ * A secondary usage of the function is to determine if any of the
+ * AoI are granted.
If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != UID_NOBODY && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return 
(SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. 
+ * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + (ZTOV(zp)->v_type != VDIR) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +/* + * Check if VEXEC is allowed. + * + * This routine is based on zfs_fastaccesschk_execute which has slowpath + * calling zfs_zaccess. This would be incorrect on FreeBSD (see + * zfs_freebsd_access for the difference). Thus this variant let's the + * caller handle the slowpath (if necessary). + * + * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED. + * + * Safe access to znode_t is provided by the vnode lock. 
+ */
+int
+zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr)
+{
+	boolean_t owner = B_FALSE;
+	boolean_t groupmbr = B_FALSE;
+	boolean_t is_attr;
+	uid_t uid = crgetuid(cr);
+
+	if (zdp->z_pflags & ZFS_AV_QUARANTINED)
+		return (1);
+
+	is_attr = ((zdp->z_pflags & ZFS_XATTR) &&
+	    (ZTOV(zdp)->v_type == VDIR));
+	if (is_attr)
+		return (1);
+
+	if (zdp->z_pflags & ZFS_NO_EXECS_DENIED)
+		return (0);
+
+	mutex_enter(&zdp->z_acl_lock);
+	if (FUID_INDEX(zdp->z_uid) != 0 || FUID_INDEX(zdp->z_gid) != 0) {
+		goto out_slow;
+	}
+
+	if (uid == zdp->z_uid) {
+		owner = B_TRUE;
+		if (zdp->z_mode & S_IXUSR) {
+			goto out;
+		} else {
+			goto out_slow;
+		}
+	}
+	if (groupmember(zdp->z_gid, cr)) {
+		groupmbr = B_TRUE;
+		if (zdp->z_mode & S_IXGRP) {
+			goto out;
+		} else {
+			goto out_slow;
+		}
+	}
+	if (!owner && !groupmbr) {
+		if (zdp->z_mode & S_IXOTH) {
+			goto out;
+		}
+	}
+	/* S_IXOTH is clear as well: fall through to the slow path. */
+out_slow:
+	mutex_exit(&zdp->z_acl_lock);
+	return (1);		/* caller must take the slow path */
+out:
+	mutex_exit(&zdp->z_acl_lock);
+	return (0);		/* VEXEC granted by the mode bits */
+}
+
+/*
+ * Determine whether Access should be granted/denied.
+ *
+ * The least priv subsystem is always consulted as a basic privilege
+ * can define any form of access.
+ */
+int
+zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr)
+{
+	uint32_t working_mode;
+	int error;
+	int is_attr;
+	boolean_t check_privs;
+	znode_t *xzp = NULL;
+	znode_t *check_zp = zp;
+	mode_t needed_bits;
+	uid_t owner;
+
+	is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
+
+#ifdef __FreeBSD_kernel__
+	/*
+	 * In FreeBSD, we don't care about permissions of individual ADS.
+	 * Note that not checking them is not just an optimization - without
+	 * this shortcut, EA operations may bogusly fail with EACCES.
+ */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + if ((error = zfs_zget(zp->z_zfsvfs, + parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. 
+ */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + vnode_t *check_vp = ZTOV(check_zp); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, check_vp, owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(check_vp, cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(check_vp, cr, owner); + + if (error == 0 
&& (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(check_vp, cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(check_vp, cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t available_perms, cred_t *cr) +{ + int error; + uid_t downer; + + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access2(cr, ZTOV(dzp), + downer, available_perms, VWRITE|VEXEC); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/deny, without + * consulting least priv subsystem. + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. 
+ *
+ *	-------------------------------------------------------
+ *	|   Parent Dir  |      Target Object Permissions      |
+ *	|  permissions  |                                     |
+ *	-------------------------------------------------------
+ *	|               | ACL Allows | ACL Denies| Delete     |
+ *	|               |  Delete    |  Delete   | unspecified|
+ *	-------------------------------------------------------
+ *	|  ACL Allows   | Permit     | Permit    | Permit     |
+ *	|  DELETE_CHILD |                                     |
+ *	-------------------------------------------------------
+ *	|  ACL Denies   | Permit     | Deny      | Deny       |
+ *	|  DELETE_CHILD |            |           |            |
+ *	-------------------------------------------------------
+ *	| ACL specifies |            |           |            |
+ *	| only allow    | Permit     | Permit    | Permit     |
+ *	| write and     |            |           |            |
+ *	| execute       |            |           |            |
+ *	-------------------------------------------------------
+ *	| ACL denies    |            |           |            |
+ *	| write and     | Permit     | Deny      | Deny       |
+ *	| execute       |            |           |            |
+ *	-------------------------------------------------------
+ *	   ^
+ *	   |
+ *	   No search privilege, can't even look up file?
+ *
+ */
+int
+zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr)
+{
+	uint32_t dzp_working_mode = 0;
+	uint32_t zp_working_mode = 0;
+	int dzp_error, zp_error;
+	mode_t available_perms;
+	boolean_t dzpcheck_privs = B_TRUE;
+	boolean_t zpcheck_privs = B_TRUE;
+
+	/*
+	 * We want specific DELETE permissions to
+	 * take precedence over WRITE/EXECUTE.  We don't
+	 * want an ACL such as this to mess us up.
+	 * user:joe:write_data:deny,user:joe:delete:allow
+	 *
+	 * However, deny permissions may ultimately be overridden
+	 * by secpolicy_vnode_access().
+	 *
+	 * We will ask for all of the necessary permissions and then
+	 * look at the working modes from the directory and target object
+	 * to determine what was found.
+	 *
+	 * Before any ACL is consulted, a target flagged ZFS_IMMUTABLE or
+	 * ZFS_NOUNLINK is rejected with EPERM.
+	 */
+
+	if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK))
+		return (SET_ERROR(EPERM));
+
+	/*
+	 * First row
+	 * If the directory permissions allow the delete, we are done.
+	 */
+	if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD,
+	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0)
+		return (0);
+
+	/*
+	 * If the target object has delete permission then we are done.
+	 * Otherwise both checks have failed (asserted below), and an error
+	 * is returned right away when the corresponding check_privs flag
+	 * was cleared by zfs_zaccess_common().
+	 */
+	if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode,
+	    &zpcheck_privs, B_FALSE, cr)) == 0)
+		return (0);
+
+	ASSERT(dzp_error && zp_error);
+
+	if (!dzpcheck_privs)
+		return (dzp_error);
+	if (!zpcheck_privs)
+		return (zp_error);
+
+	/*
+	 * Second row
+	 *
+	 * If directory returns EACCES then delete_child was denied
+	 * due to deny delete_child.  In this case send the request through
+	 * secpolicy_vnode_remove().  We don't use zfs_delete_final_check()
+	 * since that *could* allow the delete based on write/execute permission
+	 * and we want delete permissions to override write/execute.
+	 */
+
+	if (dzp_error == EACCES) {
+		/* XXXPJD: s/dzp/zp/ ? */
+		return (secpolicy_vnode_remove(ZTOV(dzp), cr));
+	}
+	/*
+	 * Third Row
+	 * only need to see if we have write/execute on directory.
+	 */
+
+	dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA,
+	    &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr);
+
+	if (dzp_error != 0 && !dzpcheck_privs)
+		return (dzp_error);
+
+	/*
+	 * Fourth row
+	 *
+	 * available_perms gets VWRITE unless ACE_WRITE_DATA is still set in
+	 * dzp_working_mode, and VEXEC unless ACE_EXECUTE is still set; the
+	 * final decision is left to zfs_delete_final_check().
+	 */
+
+	available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE;
+	available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC;
+
+	return (zfs_delete_final_check(zp, dzp, available_perms, cr));
+
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	if (szp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
+
+	add_perm = (ZTOV(szp)->v_type == VDIR) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are a combination of delete permission +
+	 * add file/subdir permission (add_perm above is chosen by whether
+	 * the source is a directory).
+	 *
+	 * BSD operating systems also require write permission
+	 * on the directory being moved from one parent directory
+	 * to another.
+	 */
+	if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) {
+		if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr)))
+			return (error);
+	}
+
+	/*
+	 * First make sure we are allowed to do the delete portion, i.e.
+	 * remove szp from sdzp.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions.
+	 */
+
+	if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+		return (error);
+
+	/*
+	 * If the rename has an existing target (tzp != NULL), the caller
+	 * must also be allowed to delete it from tdzp.
+	 */
+	if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr)))
+		return (error);
+
+	/*
+	 * Now check for add permissions on the target directory.
+	 */
+	error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+	return (error);
+}
diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..ed6652c3bab1
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_ctldir.c
@@ -0,0 +1,1345 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' directory, but this may expand in the
+ * future.  The elements are built using the GFS primitives, as the hierarchy
+ * does not actually exist on disk.  (In this file the nodes are implemented
+ * with the sfs_* "synthetic filesystem" helpers defined below.)
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab.  We have three
+ * types of objects:
+ *
+ *	ctldir ------> snapshotdir -------> snapshot
+ *                                             |
+ *                                             |
+ *                                             V
+ *                                         mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding vnode.
+ *
+ * All mounts are handled automatically by the kernel, but unmounts are
+ * (currently) handled from user land.  The main reason is that there is no
+ * reliable way to auto-unmount the filesystem when it's "no longer in use".
+ * When the user unmounts a filesystem, zfsctl_umount_snapshots() is called,
+ * which unmounts any snapshots within the snapshot directory.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and
+ * share the same vfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>'
+ * (ie: snapshots) are ZFS nodes and have their own unique vfs_t.
+ * However, vnodes within these mounted on file systems have their v_vfsp
+ * fields set to the head filesystem to make NFS happy (see
+ * zfsctl_snapdir_lookup()).  We VFS_HOLD the head filesystem's vfs_t
+ * so that it cannot be freed until all snapshots have been unmounted.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" + +#include + +/* Common access mode for all virtual directories under the ctldir */ +const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + +/* + * "Synthetic" filesystem implementation. + */ + +/* + * Assert that A implies B. + */ +#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); + +static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); + +typedef struct sfs_node { + char sn_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t sn_parent_id; + uint64_t sn_id; +} sfs_node_t; + +/* + * Check the parent's ID as well as the node's to account for a chance + * that IDs originating from different domains (snapshot IDs, artifical + * IDs, znode IDs) may clash. + */ +static int +sfs_compare_ids(struct vnode *vp, void *arg) +{ + sfs_node_t *n1 = vp->v_data; + sfs_node_t *n2 = arg; + bool equal; + + equal = n1->sn_id == n2->sn_id && + n1->sn_parent_id == n2->sn_parent_id; + + /* Zero means equality. 
*/ + return (!equal); +} + +static int +sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + sfs_node_t search; + int err; + + search.sn_id = id; + search.sn_parent_id = parent_id; + err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, &search); + return (err); +} + +static int +sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + int err; + + KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); + err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, vp->v_data); + return (err); +} + +static void +sfs_vnode_remove(struct vnode *vp) +{ + vfs_hash_remove(vp); +} + +typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); + +static int +sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, + const char *tag, struct vop_vector *vops, + sfs_vnode_setup_fn setup, void *arg, + struct vnode **vpp) +{ + struct vnode *vp; + int error; + + error = sfs_vnode_get(mp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + /* Allocate a new vnode/inode. */ + error = getnewvnode(tag, mp, vops, &vp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + /* + * Exclusively lock the vnode vnode while it's being constructed. 
+ */ + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + error = insmntque(vp, mp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + setup(vp, arg); + + error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + *vpp = vp; + return (0); +} + +static void +sfs_print_node(sfs_node_t *node) +{ + printf("\tname = %s\n", node->sn_name); + printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); + printf("\tid = %ju\n", (uintmax_t)node->sn_id); +} + +static sfs_node_t * +sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) +{ + struct sfs_node *node; + + KASSERT(strlen(name) < sizeof (node->sn_name), + ("sfs node name is too long")); + KASSERT(size >= sizeof (*node), ("sfs node size is too small")); + node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); + strlcpy(node->sn_name, name, sizeof (node->sn_name)); + node->sn_parent_id = parent_id; + node->sn_id = id; + + return (node); +} + +static void +sfs_destroy_node(sfs_node_t *node) +{ + free(node, M_SFSNODES); +} + +static void * +sfs_reclaim_vnode(vnode_t *vp) +{ + void *data; + + sfs_vnode_remove(vp); + data = vp->v_data; + vp->v_data = NULL; + return (data); +} + +static int +sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, + uio_t *uio, off_t *offp) +{ + struct dirent entry; + int error; + + /* Reset ncookies for subsequent use of vfs_read_dirent. 
*/ + if (ap->a_ncookies != NULL) + *ap->a_ncookies = 0; + + if (uio->uio_resid < sizeof (entry)) + return (SET_ERROR(EINVAL)); + + if (uio->uio_offset < 0) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == 0) { + entry.d_fileno = id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '\0'; + entry.d_namlen = 1; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (uio->uio_offset < sizeof (entry)) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == sizeof (entry)) { + entry.d_fileno = parent_id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '.'; + entry.d_name[2] = '\0'; + entry.d_namlen = 2; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (offp != NULL) + *offp = 2 * sizeof (entry); + return (0); +} + + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/ objectid(snap) + */ +#define ZFSCTL_INO_SNAP(id) (id) + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares_dir; + +void +zfsctl_init(void) +{ +} + +void +zfsctl_fini(void) +{ +} + +boolean_t +zfsctl_is_node(vnode_t *vp) +{ + return (vn_matchops(vp, zfsctl_ops_root) || + vn_matchops(vp, zfsctl_ops_snapdir) || + vn_matchops(vp, zfsctl_ops_snapshot) || + vn_matchops(vp, zfsctl_ops_shares_dir)); + +} + +typedef struct zfsctl_root { + sfs_node_t node; + sfs_node_t *snapdir; + timestruc_t cmtime; +} zfsctl_root_t; + + +/* + * Create the '.zfs' directory. 
+ */
+void
+zfsctl_create(zfsvfs_t *zfsvfs)
+{
+	zfsctl_root_t *dot_zfs;
+	sfs_node_t *snapdir;
+	vnode_t *rvp;
+	uint64_t crtime[2];
+
+	ASSERT(zfsvfs->z_ctldir == NULL);
+
+	snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+	    ZFSCTL_INO_SNAPDIR);
+	dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0,
+	    ZFSCTL_INO_ROOT);
+	dot_zfs->snapdir = snapdir;
+
+	VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
+	VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
+	    &crtime, sizeof (crtime)));
+	ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+	vput(rvp);
+
+	zfsvfs->z_ctldir = dot_zfs;
+}
+
+/*
+ * Destroy the '.zfs' directory.  Only called when the filesystem is unmounted.
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.  Frees both the 'snapshot' node and the '.zfs' node and clears
+ * zfsvfs->z_ctldir.
+ */
+void
+zfsctl_destroy(zfsvfs_t *zfsvfs)
+{
+	sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+	sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
+	zfsvfs->z_ctldir = NULL;
+}
+
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	/*
+	 * We support shared locking.  The new vnode is a directory whose
+	 * private data is the node passed in as arg.
+	 */
+	VN_LOCK_ASHARE(vp);
+	vp->v_type = VDIR;
+	vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir;
+	err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+	    zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+    struct vnode **vpp)
+{
+	void *node;
+	int err;
+
+	node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir;
+	err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+	    &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+	return (err);
+}
+
+/*
+ * Given a zfsvfs, retrieve the vnode of the associated .zfs directory.
+ * The vnode is returned through vpp and is obtained with the lock flags
+ * supplied by the caller.
+ */
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
+{
+	int error;
+
+	error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+	return (error);
+}
+
+/*
+ * Common open routine.  Disallow any write access: EACCES is returned
+ * when FWRITE is set in the open mode.
+ */
+static int
+zfsctl_common_open(struct vop_open_args *ap)
+{
+	int flags = ap->a_mode;
+
+	if (flags & FWRITE)
+		return (SET_ERROR(EACCES));
+
+	return (0);
+}
+
+/*
+ * Common close routine.  Nothing to do here.
+ */
+/* ARGSUSED */
+static int
+zfsctl_common_close(struct vop_close_args *ap)
+{
+	return (0);
+}
+
+/*
+ * Common access routine.  Disallow writes: EACCES is returned when
+ * VWRITE is requested, everything else is allowed.
+ */
+static int
+zfsctl_common_access(struct vop_access_args *ap)
+{
+	accmode_t accmode = ap->a_accmode;
+
+	if (accmode & VWRITE)
+		return (SET_ERROR(EACCES));
+	return (0);
+}
+
+/*
+ * Common getattr function.  Fill in basic information: uid/gid 0, mode
+ * zfsctl_ctldir_mode, type VDIR, the node id taken from the sfs node and
+ * a link count of 2.  The root and snapdir getattr routines call this
+ * first and then set the times, additional links and the size.
+ */
+static void
+zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
+{
+	timestruc_t	now;
+	sfs_node_t *node;
+
+	node = vp->v_data;
+
+	vap->va_uid = 0;
+	vap->va_gid = 0;
+	vap->va_rdev = 0;
+	/*
+	 * We are a purely virtual object, so we have no
+	 * blocksize or allocated blocks.
+	 */
+	vap->va_blksize = 0;
+	vap->va_nblocks = 0;
+	vap->va_seq = 0;
+	vn_fsid(vp, vap);
+	vap->va_mode = zfsctl_ctldir_mode;
+	vap->va_type = VDIR;
+	/*
+	 * We live in the now (for atime).
+	 */
+	gethrestime(&now);
+	vap->va_atime = now;
+	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_flags = 0;
+
+	vap->va_nodeid = node->sn_id;
+
+	/* At least '.' and '..'. */
+	vap->va_nlink = 2;
+}
+
+#ifndef _OPENSOLARIS_SYS_VNODE_H_
+struct vop_fid_args {
+	struct vnode *a_vp;
+	struct fid *a_fid;
+};
+#endif
+
+static int
+zfsctl_common_fid(struct vop_fid_args *ap)
+{
+	vnode_t		*vp = ap->a_vp;
+	fid_t		*fidp = (void *)ap->a_fid;
+	sfs_node_t	*node = vp->v_data;
+	uint64_t	object = node->sn_id;
+	zfid_short_t	*zfid;
+	int		i;
+
+	zfid = (zfid_short_t *)fidp;
+	zfid->zf_len = SHORT_FID_LEN;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/*
+	 * The node id was stored above one byte at a time, least
+	 * significant byte first.
+	 * .zfs nodes always have a generation number of 0
+	 */
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = 0;
+
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_reclaim_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfsctl_common_reclaim(struct vop_reclaim_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+
+	(void) sfs_reclaim_vnode(vp);
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_print_args {
+	struct vnode *a_vp;
+};
+#endif
+
+static int
+zfsctl_common_print(struct vop_print_args *ap)
+{
+	sfs_print_node(ap->a_vp->v_data);
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getattr_args {
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Get root directory attributes.
+ */ +static int +zfsctl_root_getattr(struct vop_getattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsctl_root_t *node = vp->v_data; + + zfsctl_common_getattr(vp, vap); + vap->va_ctime = node->cmtime; + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + vap->va_nlink += 1; /* snapdir */ + vap->va_size = vap->va_nlink; + return (0); +} + +/* + * When we lookup "." we still can be asked to lock it + * differently, can't we? + */ +int +zfsctl_relock_dot(vnode_t *dvp, int ltype) +{ + vref(dvp); + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* Relock for the "." case may left us with reclaimed vnode. */ + if (VN_IS_DOOMED(dvp)) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); +} + +/* + * Special case the handling of "..". + */ +int +zfsctl_root_lookup(struct vop_lookup_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + } else if ((flags & ISDOTDOT) != 0) { + err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, + lkflags, vpp); + } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { + err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); + } else { + err = SET_ERROR(ENOENT); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_root_readdir(struct vop_readdir_args *ap) +{ + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + 
zfsctl_root_t *node = vp->v_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + if (uio->uio_offset != dots_offset) + return (SET_ERROR(EINVAL)); + + CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name)); + entry.d_fileno = node->snapdir->sn_id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, node->snapdir->sn_name); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + if (eofp != NULL) + *eofp = 1; + return (0); +} + +static int +zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) +{ + static const char dotzfs_name[4] = ".zfs"; + vnode_t *dvp; + int error; + + if (*ap->a_buflen < sizeof (dotzfs_name)) + return (SET_ERROR(ENOMEM)); + + error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, + LK_SHARED, &dvp); + if (error != 0) + return (SET_ERROR(error)); + + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= sizeof (dotzfs_name); + bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); + return (0); +} + +static int +zfsctl_common_pathconf(struct vop_pathconf_args *ap) +{ + /* + * We care about ACL variables so that user land utilities like ls + * can display them correctly. Since the ctldir's st_dev is set to be + * the same as the parent dataset, we must support all variables that + * it supports. 
+ */ + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + return (0); + + case _PC_MIN_HOLE_SIZE: + *ap->a_retval = (int)SPA_MINBLOCKSIZE; + return (0); + + case _PC_ACL_EXTENDED: + *ap->a_retval = 0; + return (0); + + case _PC_ACL_NFS4: + *ap->a_retval = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *ap->a_retval = ACL_MAX_ENTRIES; + return (0); + + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * Returns a trivial ACL + */ +int +zfsctl_common_getacl(struct vop_getacl_args *ap) +{ + int i; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); + /* + * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify + * attributes. That is not the case for the ctldir, so we must clear + * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs + * aren't supported by the ctldir. 
+ */ + for (i = 0; i < ap->a_aclp->acl_cnt; i++) { + struct acl_entry *entry; + entry = &(ap->a_aclp->acl_entry[i]); + entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | + ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | + ACL_READ_NAMED_ATTRS); + } + + return (0); +} + +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_root_readdir, + .vop_lookup = zfsctl_root_lookup, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_vptocnp = zfsctl_root_vptocnp, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + int err; + + err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); + return (err); +} + +/* + * Given a vnode get a root vnode of a filesystem mounted on top of + * the vnode, if any. The root vnode is referenced and locked. + * If no filesystem is mounted then the orinal vnode remains referenced + * and locked. If any error happens the orinal vnode is unlocked and + * released. 
+ */
+static int
+zfsctl_mounted_here(vnode_t **vpp, int flags)
+{
+	struct mount *mp;
+	int err;
+
+	ASSERT_VOP_LOCKED(*vpp, __func__);
+	ASSERT3S((*vpp)->v_type, ==, VDIR);
+
+	if ((mp = (*vpp)->v_mountedhere) != NULL) {
+		err = vfs_busy(mp, 0);
+		KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+		KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+		vput(*vpp);
+		err = VFS_ROOT(mp, flags, vpp);
+		vfs_unbusy(mp);
+		return (err);
+	}
+	return (EJUSTRETURN);
+}
+
+typedef struct {
+	const char *snap_name;
+	uint64_t    snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
+{
+	snapshot_setup_arg_t *ssa = arg;
+	sfs_node_t *node;
+
+	ASSERT_VOP_ELOCKED(vp, __func__);
+
+	node = sfs_alloc_node(sizeof (sfs_node_t),
+	    ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+	zfsctl_common_vnode_setup(vp, node);
+
+	/*
+	 * We have to support recursive locking: zfsctl_umount_snapshots()
+	 * keeps the mount-point vnode locked across dounmount() and notes
+	 * that the vnode lock must have recursive locking enabled.
+	 */
+	VN_LOCK_AREC(vp);
+}
+
+/*
+ * Lookup entry point for the 'snapshot' directory.  Try to open the
+ * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
+ * Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
+ */ +int +zfsctl_snapdir_lookup(struct vop_lookup_args *ap) +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char name[NAME_MAX + 1]; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + char *mountpoint; + size_t mountpoint_len; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + uint64_t snap_id; + int nameiop = cnp->cn_nameiop; + int lkflags = cnp->cn_lkflags; + int flags = cnp->cn_flags; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + return (err); + } + if (flags & ISDOTDOT) { + err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, + vpp); + return (err); + } + + if (cnp->cn_namelen >= sizeof (name)) + return (SET_ERROR(ENAMETOOLONG)); + + strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + err = zfsctl_snapshot_lookup(dvp, name, &snap_id); + if (err != 0) + return (SET_ERROR(ENOENT)); + + for (;;) { + snapshot_setup_arg_t ssa; + + ssa.snap_name = name; + ssa.snap_id = snap_id; + err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, + snap_id, "zfs", &zfsctl_ops_snapshot, + zfsctl_snapshot_vnode_setup, &ssa, vpp); + if (err != 0) + return (err); + + /* Check if a new vnode has just been created. */ + if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) + break; + + /* + * Check if a snapshot is already mounted on top of the vnode. + */ + err = zfsctl_mounted_here(vpp, lkflags); + if (err != EJUSTRETURN) + return (err); + + /* + * If the vnode is not covered, then either the mount operation + * is in progress or the snapshot has already been unmounted + * but the vnode hasn't been inactivated and reclaimed yet. + * We can try to re-use the vnode in the latter case. 
+ */ + VI_LOCK(*vpp); + if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + /* + * Upgrade to exclusive lock in order to: + * - avoid race conditions + * - satisfy the contract of mount_snapshot() + */ + err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + if (err == 0) + break; + } else { + VI_UNLOCK(*vpp); + } + + /* + * In this state we can loop on uncontested locks and starve + * the thread doing the lengthy, non-trivial mount operation. + * So, yield to prevent that from happening. + */ + vput(*vpp); + kern_yield(PRI_USER); + } + + VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname)); + + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, name); + + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); + kmem_free(mountpoint, mountpoint_len); + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/ accessible over NFS + * without requiring manual mounts of . + */ + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + + /* Clear the root flag (set via VFS_ROOT) as well. 
*/ + (*vpp)->v_vflag &= ~VV_ROOT; + } + + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_snapdir_readdir(struct vop_readdir_args *ap) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + + ZFS_ENTER(zfsvfs); + for (;;) { + uint64_t cookie; + uint64_t id; + + cookie = uio->uio_offset - dots_offset; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + entry.d_fileno = id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, snapname); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(error)); + } + uio->uio_offset = cookie + dots_offset; + } + /* NOTREACHED */ +} + +static int +zfsctl_snapdir_getattr(struct vop_getattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os); + uint64_t snap_count; + int err; + + ZFS_ENTER(zfsvfs); + zfsctl_common_getattr(vp, vap); + vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + err 
= zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (err != 0) { + ZFS_EXIT(zfsvfs); + return (err); + } + vap->va_nlink += snap_count; + } + vap->va_size = vap->va_nlink; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_snapdir_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); + + +static int +zfsctl_snapshot_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + VERIFY(vrecycle(vp) == 1); + return (0); +} + +static int +zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + void *data = vp->v_data; + + sfs_reclaim_vnode(vp); + sfs_destroy_node(data); + return (0); +} + +static int +zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) +{ + struct mount *mp; + vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + size_t len; + int locked; + int error; + + vp = ap->a_vp; + node = vp->v_data; + len = strlen(node->sn_name); + if (*ap->a_buflen < len) + return (SET_ERROR(ENOMEM)); + + /* + * Prevent unmounting of the snapshot while the vnode lock + * is not held. That is not strictly required, but allows + * us to assert that an uncovered snapshot vnode is never + * "leaked". + */ + mp = vp->v_mountedhere; + if (mp == NULL) + return (SET_ERROR(ENOENT)); + error = vfs_busy(mp, 0); + KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); + + /* + * We can vput the vnode as we can now depend on the reference owned + * by the busied mp. 
But we also need to hold the vnode, because
+	 * the reference may go after vfs_unbusy() which has to be called
+	 * before we can lock the vnode again.
+	 */
+	locked = VOP_ISLOCKED(vp);
+#if __FreeBSD_version >= 1300045
+	enum vgetstate vs = vget_prep(vp);
+#else
+	vhold(vp);
+#endif
+	vput(vp);
+
+	/* Look up .zfs/snapshot, our parent. */
+	error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+	if (error == 0) {
+		VOP_UNLOCK1(dvp);
+		*ap->a_vpp = dvp;
+		*ap->a_buflen -= len;
+		bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
+	}
+	vfs_unbusy(mp);
+#if __FreeBSD_version >= 1300045
+	vget_finish(vp, locked | LK_RETRY, vs);
+#else
+	vget(vp, locked | LK_VNHELD | LK_RETRY, curthread);
+#endif
+	return (error);
+}
+
+/*
+ * These VP's should never see the light of day.  They should always
+ * be covered.
+ *
+ * Only the operations listed below are provided; vop_default is NULL, so
+ * no other operation falls back to a default vector.
+ */
+static struct vop_vector zfsctl_ops_snapshot = {
+	.vop_default =		NULL, /* ensure very restricted access */
+	.vop_inactive =		zfsctl_snapshot_inactive,
+#if __FreeBSD_version >= 1300045
+	.vop_need_inactive = vop_stdneed_inactive,
+#endif
+	.vop_reclaim =		zfsctl_snapshot_reclaim,
+	.vop_vptocnp =		zfsctl_snapshot_vptocnp,
+	.vop_lock1 =		vop_stdlock,
+	.vop_unlock =		vop_stdunlock,
+	.vop_islocked =		vop_stdislocked,
+	.vop_advlockpurge =	vop_stdadvlockpurge, /* called by vgone */
+	.vop_print =		zfsctl_common_print,
+};
+VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot);
+
+int
+zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
+{
+	zfsvfs_t *zfsvfs __unused = vfsp->vfs_data;
+	vnode_t *vp;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+	*zfsvfsp = NULL;	/* stays NULL unless a mount is found below */
+	error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+	    ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+	if (error == 0 && vp != NULL) {
+		/*
+		 * XXX Probably need to at least reference, if not busy, the mp.
+		 *
+		 * As written, mnt_data is read while vp is locked and then
+		 * handed back to the caller after vput() with no reference
+		 * taken on the mount.
+		 */
+		if (vp->v_mountedhere != NULL)
+			*zfsvfsp = vp->v_mountedhere->mnt_data;
+		vput(vp);
+	}
+	if (*zfsvfsp == NULL)	/* lookup error, no vnode, or nothing mounted */
+		return (SET_ERROR(EINVAL));
+	return (0);
+}
+
+/*
+ * Unmount any snapshots for the given filesystem.  This is called from
+ * zfs_umount() - if we have a ctldir, then go through and unmount all the
+ * snapshots.
+ *
+ * The snapshots of vfsp's objset are enumerated with
+ * dmu_snapshot_list_next(); for each one that has a filesystem mounted on
+ * its snapdir vnode, dounmount() is called with 'fflags'.  Returns 0 once
+ * the enumeration ends with ENOENT, otherwise the first error seen.
+ * 'cr' is not referenced by this implementation.
+ */
+int
+zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
+{
+	char snapname[ZFS_MAX_DATASET_NAME_LEN];
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	struct mount *mp;
+	vnode_t *vp;
+	uint64_t cookie;
+	int error;
+
+	ASSERT(zfsvfs->z_ctldir != NULL);
+
+	cookie = 0;
+	for (;;) {
+		uint64_t id;
+
+		dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname),
+		    snapname, &id, &cookie, NULL);
+		dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+		if (error != 0) {
+			if (error == ENOENT)	/* end of list is not an error */
+				error = 0;
+			break;
+		}
+
+		for (;;) {
+			error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+			    ZFSCTL_INO_SNAPDIR, id, &vp);
+			if (error != 0 || vp == NULL)
+				break;
+
+			mp = vp->v_mountedhere;
+
+			/*
+			 * v_mountedhere being NULL means that the
+			 * (uncovered) vnode is in a transient state
+			 * (mounting or unmounting), so loop until it
+			 * settles down.
+			 */
+			if (mp != NULL)
+				break;
+			vput(vp);
+		}
+		if (error != 0)
+			break;
+		if (vp == NULL)
+			continue;	/* no mountpoint, nothing to do */
+
+		/*
+		 * The mount-point vnode is kept locked to avoid spurious EBUSY
+		 * from a concurrent umount.
+		 * The vnode lock must have recursive locking enabled.
+		 */
+		vfs_ref(mp);
+		error = dounmount(mp, fflags, curthread);
+		KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+		    ("extra references after unmount"));
+		vput(vp);
+		if (error != 0)
+			break;
+	}
+	KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+	    ("force unmounting failed"));
+	return (error);
+}
+
+int
+zfsctl_snapshot_unmount(char *snapname, int flags __unused)
+{
+	vfs_t *vfsp = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+
+	if (strchr(snapname, '@') == NULL)	/* name has no '@': ignore */
+		return (0);
+
+	int err = getzfsvfs(snapname, &zfsvfs);
+	if (err != 0) {
+		ASSERT3P(zfsvfs, ==, NULL);
+		return (0);	/* lookup failure is reported as success */
+	}
+	vfsp = zfsvfs->z_vfs;
+
+	ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+
+	vfs_ref(vfsp);
+	vfs_unbusy(vfsp);	/* NOTE(review): presumably busied by getzfsvfs() */
+	return (dounmount(vfsp, MS_FORCE, curthread));	/* always forced */
+}
diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c
new file mode 100644
index 000000000000..2f5962b25a86
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_debug.c
@@ -0,0 +1,254 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */ + +#include +#include + +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size = 0; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +kstat_t *zfs_dbgmsg_kstat; + +/* + * Internal ZFS debug messages are enabled by default. + * + * # Print debug messages + * cat /proc/spl/kstat/zfs/dbgmsg + * + * # Disable the kernel debug message log. + * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable + * + * # Clear the kernel debug message log. + * echo 0 >/proc/spl/kstat/zfs/dbgmsg + */ +int zfs_dbgmsg_enable = 1; + +static int +zfs_dbgmsg_headers(char *buf, size_t size) +{ + (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message"); + + return (0); +} + +static int +zfs_dbgmsg_data(char *buf, size_t size, void *data) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data; + + (void) snprintf(buf, size, "%-12llu %-s\n", + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); + + return (0); +} + +static void * +zfs_dbgmsg_addr(kstat_t *ksp, loff_t n) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + if (n == 0) + ksp->ks_private = list_head(&zfs_dbgmsgs); + else if (zdm) + ksp->ks_private = list_next(&zfs_dbgmsgs, zdm); + + return (ksp->ks_private); +} + +static void +zfs_dbgmsg_purge(int max_size) +{ + zfs_dbgmsg_t *zdm; + int size; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + while (zfs_dbgmsg_size > max_size) { + zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + +static int +zfs_dbgmsg_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + zfs_dbgmsg_purge(0); + + return (0); +} + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, 
MUTEX_DEFAULT, NULL); + + zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (zfs_dbgmsg_kstat) { + zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock; + zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX; + zfs_dbgmsg_kstat->ks_private = NULL; + zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update; + kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers, + zfs_dbgmsg_data, zfs_dbgmsg_addr); + kstat_install(zfs_dbgmsg_kstat); + } +} + +void +zfs_dbgmsg_fini(void) +{ + if (zfs_dbgmsg_kstat) + kstat_delete(zfs_dbgmsg_kstat); + /* + * TODO - decide how to make this permanent + */ +#ifdef _KERNEL + mutex_enter(&zfs_dbgmsgs_lock); + zfs_dbgmsg_purge(0); + mutex_exit(&zfs_dbgmsgs_lock); + mutex_destroy(&zfs_dbgmsgs_lock); +#endif +} + +void +__zfs_dbgmsg(char *buf) +{ + zfs_dbgmsg_t *zdm; + int size; + + DTRACE_PROBE1(zfs__dbgmsg, char *, buf); + + size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strcpy(zdm->zdm_msg, buf); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +#ifdef _KERNEL +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + size_t size; + char *buf; + char *nl; + int i; + + size = 1024; + buf = kmem_alloc(size, KM_SLEEP); + + /* + * Get rid of annoying prefix to filename. 
+ */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); + + if (i < size) { + va_start(adx, fmt); + (void) vsnprintf(buf + i, size - i, fmt, adx); + va_end(adx); + } + + /* + * Get rid of trailing newline. + */ + nl = strrchr(buf, '\n'); + if (nl != NULL) + *nl = '\0'; + + __zfs_dbgmsg(buf); + + kmem_free(buf, size); +} + +#else + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + (void) printf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + (void) printf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} +#endif /* _KERNEL */ + +#ifdef _KERNEL +module_param(zfs_dbgmsg_enable, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); + +module_param(zfs_dbgmsg_maxsize, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); +#endif diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c new file mode 100644 index 000000000000..e93b3e2cf2ff --- /dev/null +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -0,0 +1,961 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * zfs_match_find() is used by zfs_dirent_lookup() to peform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + matchtype_t mt, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, NULL, 0, NULL); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. 
+ *		  ZXATTR: we want dzp's xattr directory
+ *
+ * Output arguments:
+ *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
+ *
+ * Return value: 0 on success or errno on failure.
+ *
+ * NOTE: Always checks for, and rejects, '.' and '..' (and the ctldir name
+ * when dzp has a ctldir); these fail with EEXIST.  A missing entry is not
+ * an error unless ZEXISTS was requested: 0 is returned with *zpp == NULL.
+ */
+int
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	matchtype_t mt = 0;
+	uint64_t zoid;
+	int error = 0;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
+	*zpp = NULL;
+
+	/*
+	 * Verify that we are not trying to lock '.', '..', or '.zfs'
+	 */
+	if (name[0] == '.' &&
+	    (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) ||
+	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)))
+		return (SET_ERROR(EEXIST));
+
+	/*
+	 * Case sensitivity and normalization preferences are set when
+	 * the file system is created.  These are stored in the
+	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
+	 * affect how we perform zap lookups.
+	 *
+	 * When matching we may need to normalize & change case according to
+	 * FS settings.
+	 *
+	 * Note that a normalized match is necessary for a case insensitive
+	 * filesystem when the lookup request is not exact because normalization
+	 * can fold case independent of normalizing code point sequences.
+	 *
+	 * See the table above zfs_dropname().
+	 */
+	if (zfsvfs->z_norm != 0) {
+		mt = MT_NORMALIZE;
+
+		/*
+		 * Determine if the match needs to honor the case specified in
+		 * lookup, and if so keep track of that so that during
+		 * normalization we don't fold case.
+		 */
+		if (zfsvfs->z_case == ZFS_CASE_MIXED) {
+			mt |= MT_MATCH_CASE;
+		}
+	}
+
+	/*
+	 * Only look in or update the DNLC if we are looking for the
+	 * name on a file system that does not require normalization
+	 * or case folding.  We can also look there if we happen to be
+	 * on a non-normalizing, mixed sensitivity file system IF we
+	 * are looking for the exact name.
+	 *
+	 * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+	 * because in that case MT_EXACT and MT_FIRST should produce exactly
+	 * the same result.
+	 *
+	 * NOTE(review): no DNLC access is visible in this function, so the
+	 * paragraph above looks inherited from another implementation -
+	 * confirm before relying on it.
+	 */
+
+	if (dzp->z_unlinked && !(flag & ZXATTR))
+		return (ENOENT);
+	if (flag & ZXATTR) {
+		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
+		    sizeof (zoid));
+		if (error == 0)	/* a stored object number of 0 means "none" */
+			error = (zoid == 0 ? ENOENT : 0);
+	} else {
+		error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid);
+	}
+	if (error) {
+		/* ENOENT without ZEXISTS falls through to "return (0)". */
+		if (error != ENOENT || (flag & ZEXISTS)) {
+			return (error);
+		}
+	} else {
+		if (flag & ZNEW) {
+			return (SET_ERROR(EEXIST));
+		}
+		error = zfs_zget(zfsvfs, zoid, &zp);
+		if (error)
+			return (error);
+		ASSERT(!zp->z_unlinked);
+		*zpp = zp;
+	}
+
+	return (0);
+}
+
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	znode_t *zp;
+	uint64_t parent;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+	if (dzp->z_unlinked)
+		return (ENOENT);
+
+	/* The parent's object number is stored in dzp's SA_ZPL_PARENT. */
+	if ((error = sa_lookup(dzp->z_sa_hdl,
+	    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+		return (error);
+
+	error = zfs_zget(zfsvfs, parent, &zp);
+	if (error == 0)	/* *zpp is left untouched on failure */
+		*zpp = zp;
+	return (error);
+}
+
+int
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
+{
+	zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs;
+	znode_t *zp = NULL;
+	int error = 0;
+
+#ifdef ZFS_DEBUG
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+	ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+#endif
+	if (dzp->z_unlinked)
+		return (SET_ERROR(ENOENT));
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		*zpp = dzp;	/* "" or ".": dzp itself, no extra hold taken here */
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		error = zfs_dd_lookup(dzp, &zp);	/* "..": the parent */
+		if (error == 0)
+			*zpp = zp;
+	} else {
+		/* ZEXISTS: a missing entry is reported as ENOENT. */
+		error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
+		if (error == 0) {
+			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+			*zpp = zp;
+		}
+	}
+	return (error);
+}
+
+/*
+ * unlinked Set (formerly known as the "delete queue") Error Handling
+ *
+ * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating.  We
+ * also fib and say that we won't be adding any new entries to the
+ * unlinked set, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem).  So on the small
+ * chance that the nlink list is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the unlinked set below to fail due to i/o error.  On a
+ * nondebug system, this will result in the space being leaked.
+ */
+void
+zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT(zp->z_unlinked);
+	ASSERT(zp->z_links == 0);
+
+	/* Record zp's object number in the unlinked-set ZAP object. */
+	VERIFY3U(0, ==,
+	    zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+}
+
+/*
+ * Clean up any znodes that had no links when we either crashed or
+ * (force) umounted the file system.
+ */
+void
+zfs_unlinked_drain(zfsvfs_t *zfsvfs)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t zap;
+	dmu_object_info_t doi;
+	znode_t		*zp;
+	dmu_tx_t	*tx;
+	int		error;
+
+	/*
+	 * Iterate over the contents of the unlinked set.
+	 */
+	for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
+	    zap_cursor_retrieve(&zc, &zap) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		/*
+		 * See what kind of object we have in list
+		 */
+
+		error = dmu_object_info(zfsvfs->z_os,
+		    zap.za_first_integer, &doi);
+		if (error != 0)
+			continue;
+
+		ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
+		    (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS));
+		/*
+		 * We need to re-mark these list entries for deletion,
+		 * so we pull them back into core and set zp->z_unlinked.
+		 */
+		error = zfs_zget(zfsvfs, zap.za_first_integer, &zp);
+
+		/*
+		 * We may pick up znodes that are already marked for deletion.
+		 * This could happen during the purge of an extended attribute
+		 * directory.  All we need to do is skip over them, since they
+		 * are already in the system marked z_unlinked.
+		 */
+		if (error != 0)
+			continue;
+
+		vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
+
+		/*
+		 * Due to changes in zfs_rmnode we need to make sure the
+		 * link count is set to zero here.
+		 */
+		if (zp->z_links != 0) {
+			tx = dmu_tx_create(zfsvfs->z_os);
+			dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+			error = dmu_tx_assign(tx, TXG_WAIT);
+			if (error != 0) {
+				/* Skip this entry; it is not marked unlinked. */
+				dmu_tx_abort(tx);
+				vput(ZTOV(zp));
+				continue;
+			}
+			zp->z_links = 0;
+			VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
+			    &zp->z_links, sizeof (zp->z_links), tx));
+			dmu_tx_commit(tx);
+		}
+
+		zp->z_unlinked = B_TRUE;
+		vput(ZTOV(zp));	/* unlock and release the vnode */
+	}
+	zap_cursor_fini(&zc);
+}
+
+/*
+ * Delete the entire contents of a directory.  Return a count
+ * of the number of entries that could not be deleted. If we encounter
+ * an error, return a count of at least one so that the directory stays
+ * in the unlinked set.
+ *
+ * NOTE: this function assumes that the directory is inactive,
+ *	so there is no need to lock its entries before deletion.
+ *	Also, it assumes the directory contents is *only* regular
+ *	files (the implementation asserts VREG or VLNK).
+ */
+static int
+zfs_purgedir(znode_t *dzp)
+{
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	znode_t		*xzp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	int skipped = 0;
+	int error;
+
+	for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id);
+	    (error = zap_cursor_retrieve(&zc, &zap)) == 0;
+	    zap_cursor_advance(&zc)) {
+		error = zfs_zget(zfsvfs,
+		    ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp);
+		if (error) {
+			skipped += 1;
+			continue;
+		}
+
+		vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
+		ASSERT((ZTOV(xzp)->v_type == VREG) ||
+		    (ZTOV(xzp)->v_type == VLNK));
+
+		/* One transaction per entry. */
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
+		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+		/* Is this really needed ? */
+		zfs_sa_upgrade_txholds(tx, xzp);
+		dmu_tx_mark_netfree(tx);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			vput(ZTOV(xzp));
+			skipped += 1;
+			continue;
+		}
+
+		error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
+		if (error)
+			skipped += 1;
+		dmu_tx_commit(tx);	/* committed even if the unlink failed */
+
+		vput(ZTOV(xzp));
+	}
+	zap_cursor_fini(&zc);
+	if (error != ENOENT)	/* the walk ended on something other than end-of-dir */
+		skipped += 1;
+	return (skipped);
+}
+
+extern taskq_t *zfsvfs_taskq;
+
+void
+zfs_rmnode(znode_t *zp)
+{
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os = zfsvfs->z_os;
+	dmu_tx_t	*tx;
+	uint64_t	acl_obj;
+	uint64_t	xattr_obj;
+	uint64_t	count;
+	int		error;
+
+	ASSERT(zp->z_links == 0);
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+
+	/*
+	 * If this is an attribute directory, purge its contents.
+	 */
+	if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR &&
+	    (zp->z_pflags & ZFS_XATTR)) {
+		if (zfs_purgedir(zp) != 0) {
+			/*
+			 * Not enough space to delete some xattrs.
+			 * Leave it in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	} else {
+		/*
+		 * Free up all the data in the file.  We don't do this for
+		 * XATTR directories because we need truncate and remove to be
+		 * in the same tx, like in zfs_znode_delete(). Otherwise, if
+		 * we crash here we'll end up with an inconsistent truncated
+		 * zap object in the delete queue.  Note a truncated file is
+		 * harmless since it only contains user data.
+		 */
+		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
+		if (error) {
+			/*
+			 * Not enough space or we were interrupted by unmount.
+			 * Leave the file in the unlinked set.
+			 */
+			zfs_znode_dmu_fini(zp);
+			zfs_znode_free(zp);
+			return;
+		}
+	}
+
+	/*
+	 * If the file has extended attributes, we're going to unlink
+	 * the xattr dir.
+	 */
+	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+	    &xattr_obj, sizeof (xattr_obj));
+	if (error)	/* any lookup failure is treated as "no xattr dir" */
+		xattr_obj = 0;
+
+	acl_obj = zfs_external_acl(zp);
+
+	/*
+	 * Set up the final transaction.
+	 */
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	if (xattr_obj)
+		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
+	if (acl_obj)
+		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
+
+	zfs_sa_upgrade_txholds(tx, zp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		/*
+		 * Not enough space to delete the file.  Leave it in the
+		 * unlinked set, leaking it until the fs is remounted (at
+		 * which point we'll call zfs_unlinked_drain() to process it).
+		 */
+		dmu_tx_abort(tx);
+		zfs_znode_dmu_fini(zp);
+		zfs_znode_free(zp);
+		return;
+	}
+
+	/*
+	 * FreeBSD's implementation of zfs_zget requires a vnode to back it.
+	 * This means that we could end up calling into getnewvnode while
+	 * calling zfs_rmnode as a result of a prior call to getnewvnode
+	 * trying to clear vnodes out of the cache. If this repeats we can
+	 * recurse enough that we overflow our stack. To avoid this, we
+	 * avoid calling zfs_zget on the xattr znode and instead simply add
+	 * it to the unlinked set and schedule a call to zfs_unlinked_drain.
+	 */
+	if (xattr_obj) {
+		/* Add extended attribute directory to the unlinked set. */
+		VERIFY3U(0, ==,
+		    zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx));
+	}
+
+	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	/* Remove this znode from the unlinked set */
+	VERIFY3U(0, ==,
+	    zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx));
+
+	/* Wake dd_activity_cv waiters once the unlinked set is empty. */
+	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
+		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
+	}
+
+	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
+
+	zfs_znode_delete(zp, tx);
+
+	dmu_tx_commit(tx);
+
+	if (xattr_obj) {
+		/*
+		 * We're using the FreeBSD taskqueue API here instead of
+		 * the Solaris taskq API since the FreeBSD API allows for a
+		 * task to be enqueued multiple times but executed once.
+		 */
+		taskqueue_enqueue(zfsvfs_taskq->tq_queue,
+		    &zfsvfs->z_unlinked_drain_task);
+	}
+}
+
+static uint64_t
+zfs_dirent(znode_t *zp, uint64_t mode)
+{
+	uint64_t de = zp->z_id;	/* directory entry value starts as the object id */
+
+	/* Newer ZPL versions also carry the file type in the top bits. */
+	if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE)
+		de |= IFTODT(mode) << 60;
+	return (de);
+}
+
+/*
+ * Link zp into dzp.  Fails with ENOENT if zp has been unlinked, with
+ * EMLINK if a link count limit would be exceeded, or with the error
+ * returned by zap_add().
+ */
+int
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	uint64_t value;
+	int zp_is_dir = (vp->v_type == VDIR);	/* 1 for a directory, else 0 */
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE) {
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	}
+	if (zp_is_dir) {
+		/* A new subdirectory adds a link to the parent. */
+		if (dzp->z_links >= ZFS_LINK_MAX)
+			return (SET_ERROR(EMLINK));
+	}
+	if (!(flag & ZRENAMING)) {
+		if (zp->z_unlinked) {	/* no new links to unlinked zp */
+			ASSERT(!(flag & (ZNEW | ZEXISTS)));
+			return (SET_ERROR(ENOENT));
+		}
+		if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) {
+			return (SET_ERROR(EMLINK));
+		}
+		zp->z_links++;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+		    &zp->z_links, sizeof (zp->z_links));
+
+	} else {
+		/* A rename keeps zp's link count unchanged. */
+		ASSERT(zp->z_unlinked == 0);
+	}
+	value = zfs_dirent(zp, zp->z_mode);
+	error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
+	    8, 1, &value, tx);
+
+	/*
+	 * zap_add could fail to add the entry if it exceeds the capacity of the
+	 * leaf-block and zap_leaf_split() failed to help.
+	 * The caller of this routine is responsible for failing the transaction
+	 * which will rollback the SA updates done above.
+	 *
+	 * Note that the in-core z_links increment is undone below only when
+	 * neither ZRENAMING nor ZNEW is set.
+	 */
+	if (error != 0) {
+		if (!(flag & ZRENAMING) && !(flag & ZNEW))
+			zp->z_links--;
+		return (error);
+	}
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
+	    &dzp->z_id, sizeof (dzp->z_id));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, sizeof (zp->z_pflags));
+
+	if (!(flag & ZNEW)) {
+		/* Existing object: its ctime changes with the new link. */
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+		    ctime, sizeof (ctime));
+		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+		    ctime);
+	}
+	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	/* Now update the directory: one more entry, maybe one more link. */
+	dzp->z_size++;
+	dzp->z_links += zp_is_dir;
+	count = 0;	/* reuse bulk[] for the directory's attributes */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
+	    &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+	    mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+	    ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+	return (0);
+}
+
+/*
+ * The match type in the code for this function should conform to:
+ *
+ * ------------------------------------------------------------------------
+ * fs type  | z_norm      | lookup type | match type
+ * ---------|-------------|-------------|----------------------------------
+ * CS !norm | 0           |           0 | 0 (exact)
+ * CS  norm | formX       |           0 | MT_NORMALIZE
+ * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
+ * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
+ * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
+ * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
+ * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE |
MT_MATCH_CASE
+ * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
+ *
+ * Abbreviations:
+ *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
+ *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
+ *    formX = unicode normalization form set on fs creation
+ *
+ * NOTE(review): the implementation below never inspects 'flag', so the
+ * ZCIEXACT/ZCILOOK rows are not distinguished here; the match type is
+ * chosen from z_norm and z_case alone.
+ */
+static int
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag)
+{
+	int error;
+
+	if (zp->z_zfsvfs->z_norm) {
+		matchtype_t mt = MT_NORMALIZE;
+
+		if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) {
+			mt |= MT_MATCH_CASE;
+		}
+
+		error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id,
+		    name, mt, tx);
+	} else {
+		error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx);
+	}
+
+	return (error);
+}
+
+/*
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
+ * Can fail if zp is a non-empty directory (ENOTEMPTY) or if the name
+ * cannot be removed from the directory.
+ * NOTE(review): earlier text also listed "zp is a mount point (EBUSY)";
+ * no such check is visible in this function - confirm where it is done.
+ * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
+ * If it's non-NULL, we use it to indicate whether the znode needs deletion,
+ * and it's the caller's job to do it.
+ */
+int
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+    int flag, boolean_t *unlinkedp)
+{
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	vnode_t *vp = ZTOV(zp);
+	int zp_is_dir = (vp->v_type == VDIR);	/* 1 for a directory, else 0 */
+	boolean_t unlinked = B_FALSE;
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	int count = 0;
+	int error;
+
+	if (zfsvfs->z_replay == B_FALSE) {
+		ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+		ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+	}
+	if (!(flag & ZRENAMING)) {
+
+		if (zp_is_dir && !zfs_dirempty(zp))
+			return (SET_ERROR(ENOTEMPTY));
+
+		/*
+		 * If we get here, we are going to try to remove the object.
+		 * First try removing the name from the directory; if that
+		 * fails, return the error.
+		 */
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0) {
+			return (error);
+		}
+
+		/* Repair an impossibly low link count before decrementing. */
+		if (zp->z_links <= zp_is_dir) {
+			zfs_panic_recover("zfs: link count on vnode %p is %u, "
+			    "should be at least %u", zp->z_vnode,
+			    (int)zp->z_links,
+			    zp_is_dir + 1);
+			zp->z_links = zp_is_dir + 1;
+		}
+		if (--zp->z_links == zp_is_dir) {
+			/* Last link gone: mark for deletion. */
+			zp->z_unlinked = B_TRUE;
+			zp->z_links = 0;
+			unlinked = B_TRUE;
+		} else {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+			    NULL, &ctime, sizeof (ctime));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+			    ctime);
+		}
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+		    NULL, &zp->z_links, sizeof (zp->z_links));
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		count = 0;	/* reuse bulk[] for the directory below */
+		ASSERT0(error);
+	} else {
+		/* Rename: only the name is dropped; zp's links are untouched. */
+		ASSERT(zp->z_unlinked == 0);
+		error = zfs_dropname(dzp, name, zp, tx, flag);
+		if (error != 0)
+			return (error);
+	}
+
+	dzp->z_size--;		/* one dirent removed */
+	dzp->z_links -= zp_is_dir;	/* ".." link from zp */
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+	    NULL, &dzp->z_links, sizeof (dzp->z_links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+	    NULL, ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+	    NULL, mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT0(error);
+
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty.
+ */
+boolean_t
+zfs_dirempty(znode_t *dzp)
+{
+	/* NOTE(review): presumably 2 == just "." and ".." - confirm. */
+	return (dzp->z_size == 2);
+}
+
+int
+zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *xzp;
+	dmu_tx_t *tx;
+	int error;
+	zfs_acl_ids_t acl_ids;
+	boolean_t fuid_dirtied;
+	uint64_t parent __unused;
+
+	*xvpp = NULL;	/* stays NULL on every error return */
+
+	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
+	    &acl_ids)) != 0)
+		return (error);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) {
+		zfs_acl_ids_free(&acl_ids);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	getnewvnode_reserve_();	/* paired with getnewvnode_drop_reserve() */
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		return (error);
+	}
+	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+#ifdef DEBUG
+	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent));
+	ASSERT(error == 0 && parent == zp->z_id);
+#endif
+
+	/* Record the new directory's object id in zp's SA_ZPL_XATTR. */
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
+	    sizeof (xzp->z_id), tx));
+
+	(void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp,
+	    xzp, "", NULL, acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	*xvpp = xzp;
+
+	return (0);
+}
+
+/*
+ * Return a znode for the extended attribute directory for zp.
+ *	** If the directory does not already exist, it is created **
+ *	(creation is attempted only when CREATE_XATTR_DIR is set in flags)
+ *
+ *	IN:	zp	- znode to obtain attribute directory from
+ *		cr	- credentials of caller
+ *		flags	- flags from the VOP_LOOKUP call
+ *
+ *	OUT:	xzpp	- pointer to extended attribute znode
+ *
+ *	RETURN:	0 on success
+ *		ENOATTR if the directory is missing and creation was not
+ *		requested, EROFS on a read-only file system, or another
+ *		error number on failure
+ */
+int
+zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	znode_t *found;
+	vattr_t attrs;
+	int rc;
+
+	/* Look up, and create if allowed; start over on ERESTART. */
+	do {
+		rc = zfs_dirent_lookup(zp, "", &found, ZXATTR);
+		if (rc != 0)
+			return (rc);
+
+		if (found != NULL) {
+			*xzpp = found;
+			return (0);
+		}
+
+		if ((flags & CREATE_XATTR_DIR) == 0)
+			return (SET_ERROR(ENOATTR));
+
+		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) != 0)
+			return (SET_ERROR(EROFS));
+
+		/*
+		 * The ability to 'create' files in an attribute
+		 * directory comes from the write_xattr permission on the
+		 * base file.
+		 *
+		 * The ability to 'search' an attribute directory requires
+		 * read_xattr permission on the base file.
+		 *
+		 * Once in a directory the ability to read/write attributes
+		 * is controlled by the permissions on the attribute file.
+		 */
+		attrs.va_mask = AT_MODE | AT_UID | AT_GID;
+		attrs.va_type = VDIR;
+		attrs.va_mode = S_IFDIR | S_ISVTX | 0777;
+		zfs_fuid_map_ids(zp, cr, &attrs.va_uid, &attrs.va_gid);
+
+		rc = zfs_make_xattrdir(zp, &attrs, xzpp, cr);
+
+		/* NB: we already did dmu_tx_wait() if necessary */
+	} while (rc == ERESTART);
+
+	/* The freshly created directory is handed back unlocked. */
+	if (rc == 0)
+		VOP_UNLOCK1(ZTOV(*xzpp));
+
+	return (rc);
+}
+
+/*
+ * Decide whether it is okay to remove within a sticky directory.
+ *
+ * In sticky directories, write access is not sufficient;
+ * you can remove entries from a directory only if:
+ *
+ *	you own the directory,
+ *	you own the entry,
+ *	the entry is a plain file and you have write access,
+ *	or you are privileged (checked in secpolicy...).
+ *
+ * The function returns 0 if remove access is granted.
+ */
+int
+zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
+{
+	uid_t		uid;
+	uid_t		downer;	/* mapped owner of the directory */
+	uid_t		fowner;	/* mapped owner of the entry */
+	zfsvfs_t	*zfsvfs = zdp->z_zfsvfs;
+
+	if (zdp->z_zfsvfs->z_replay)	/* always granted when z_replay is set */
+		return (0);
+
+	if ((zdp->z_mode & S_ISVTX) == 0)	/* directory is not sticky */
+		return (0);
+
+	downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER);
+	fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER);
+
+	if ((uid = crgetuid(cr)) == downer || uid == fowner ||
+	    (ZTOV(zp)->v_type == VREG &&
+	    zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0))
+		return (0);
+	else
+		return (secpolicy_vnode_remove(ZTOV(zp), cr));
+}
diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c
new file mode 100644
index 000000000000..ec7c04717c84
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_file_os.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + struct thread *td; + int rc, fd; + + td = curthread; + pwd_ensure_dirs(); + /* 12.x doesn't take a const char * */ + rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path), + UIO_SYSSPACE, flags, mode); + if (rc) + return (SET_ERROR(rc)); + fd = td->td_retval[0]; + td->td_retval[0] = 0; + if (fget(curthread, fd, &cap_no_rights, fpp)) + kern_close(td, fd); + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + fo_close(fp, curthread); +} + +static int +zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp, + ssize_t *resid) +{ + ssize_t rc; + struct uio auio; + struct thread *td; + struct iovec aiov; + + td = curthread; + aiov.iov_base = (void *)(uintptr_t)buf; + aiov.iov_len = count; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_resid = count; + auio.uio_rw = UIO_WRITE; + auio.uio_td = td; + auio.uio_offset = *offp; + + if ((fp->f_flag & FWRITE) == 0) + return (SET_ERROR(EBADF)); + + if (fp->f_type == DTYPE_VNODE) + bwillwrite(); + + rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td); + if (rc) + return (SET_ERROR(rc)); + if (resid) + 
*resid = auio.uio_resid;
+	else if (auio.uio_resid)
+		return (SET_ERROR(EIO));
+	*offp += count - auio.uio_resid;
+	return (rc);
+}
+
+/*
+ * Write at the file's current offset and advance it on success.
+ */
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+
+	/* rc was already passed through SET_ERROR() by the helper. */
+	return (rc);
+}
+
+/*
+ * Write at the explicit offset off; the file's own offset is not touched.
+ */
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+/*
+ * Read up to count bytes from fp at *offp into the kernel buffer buf.
+ *
+ * On success *offp is advanced by the number of bytes read.  If resid is
+ * non-NULL it receives the number of bytes not read; if it is NULL a short
+ * read is reported as EIO, mirroring zfs_file_write_impl().
+ */
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_READ;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FREAD) == 0)
+		return (SET_ERROR(EBADF));
+
+	rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	/* resid is optional, exactly as on the write side. */
+	if (resid)
+		*resid = auio.uio_resid;
+	else if (auio.uio_resid)
+		return (SET_ERROR(EIO));
+	*offp += count - auio.uio_resid;
+	return (0);
+}
+
+/*
+ * Read at the file's current offset and advance it on success.
+ */
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+	return (rc);
+}
+
+/*
+ * Read at the explicit offset off; the file's own offset is not touched.
+ */
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+/*
+ * Reposition fp according to *offp and whence; on success *offp is
+ * replaced with the resulting offset.  Returns ESPIPE when the file's
+ * ops are not flagged DFLAG_SEEKABLE.
+ */
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+	int rc;
+	struct thread *td;
+
+	td = curthread;
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+		return (SET_ERROR(ESPIPE));
+	rc = fo_seek(fp, *offp, whence, td);
+	if (rc == 0)
+		*offp = td->td_uretoff.tdu_off;
+	return (SET_ERROR(rc));
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
+{
+	struct thread *td;
+	struct stat sb;
+	int rc;
+
+	td = curthread;
+
+	rc = fo_stat(fp, &sb, td->td_ucred, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	zfattr->zfa_size = sb.st_size;	/* only size and mode are copied out */
+	zfattr->zfa_mode = sb.st_mode;
+
+	return (0);
+}
+/*
+ * Synchronously flush vnode vp.
+ *
+ * Brackets VOP_FSYNC(MNT_WAIT) with vn_start_write()/vn_finished_write()
+ * and an exclusive vnode lock.  If vn_start_write() fails nothing else is
+ * done and its error is returned.
+ */
+static __inline int
+zfs_vop_fsync(vnode_t *vp)
+{
+	struct mount *mp;
+	int error;
+
+	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+		goto drop;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+	error = VOP_FSYNC(vp, MNT_WAIT, curthread);
+	VOP_UNLOCK1(vp);
+	vn_finished_write(mp);
+drop:
+	return (SET_ERROR(error));
+}
+/*
+ * Flush fp to stable storage.  Only vnode-backed files are supported;
+ * anything else yields EINVAL.  The flags argument is not used here.
+ */
+int
+zfs_file_fsync(zfs_file_t *fp, int flags)
+{
+	struct vnode *v;
+
+	if (fp->f_type != DTYPE_VNODE)
+		return (EINVAL);
+
+	v = fp->f_data;
+	return (zfs_vop_fsync(v));
+}
+/*
+ * Look up descriptor fd in the current thread and return the file it
+ * refers to in *fpp.  Any fget() failure is reported as EBADF.
+ */
+int
+zfs_file_get(int fd, zfs_file_t **fpp)
+{
+	struct file *fp;
+
+	if (fget(curthread, fd, &cap_no_rights, &fp))
+		return (SET_ERROR(EBADF));
+
+	*fpp = fp;
+	return (0);
+}
+/*
+ * Release a file by descriptor number: the descriptor is looked up again
+ * and two references are dropped.
+ * NOTE(review): presumably one for the fget() done here and one for the
+ * earlier zfs_file_get() -- confirm against callers.
+ */
+void
+zfs_file_put(int fd)
+{
+	struct file *fp;
+
+	/* No CAP_ rights required, as we're only releasing.
*/
+	if (fget(curthread, fd, &cap_no_rights, &fp) == 0) {
+		fdrop(fp, curthread);
+		fdrop(fp, curthread);
+	}
+}
+/*
+ * Return the current offset recorded in the file.
+ */
+loff_t
+zfs_file_off(zfs_file_t *fp)
+{
+	return (fp->f_offset);
+}
+/*
+ * Return the cdevpriv pointer obtained through devfs_get_cdevpriv() while
+ * curthread->td_fpop is temporarily set to fp, or NULL if that call fails.
+ * The previous td_fpop value is restored before returning.
+ */
+void *
+zfs_file_private(zfs_file_t *fp)
+{
+	file_t *tmpfp;
+	void *data;
+	int error;
+
+	tmpfp = curthread->td_fpop;
+	curthread->td_fpop = fp;
+	error = devfs_get_cdevpriv(&data);
+	curthread->td_fpop = tmpfp;
+	if (error != 0)
+		return (NULL);
+	return (data);
+}
+/*
+ * Unlink the kernel-space path fnamep relative to AT_FDCWD.  The kernel
+ * entry point and its argument list are selected at compile time from
+ * __FreeBSD_version and the presence of AT_BENEATH.
+ */
+int
+zfs_file_unlink(const char *fnamep)
+{
+	enum uio_seg seg = UIO_SYSSPACE;
+	int rc;
+
+#if __FreeBSD_version >= 1300018
+	rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0);
+#else
+#ifdef AT_BENEATH
+	rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0);
+#else
+	rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep),
+	    seg, 0);
+#endif
+#endif
+	return (SET_ERROR(rc));
+}
diff --git a/module/os/freebsd/zfs/zfs_fuid_os.c b/module/os/freebsd/zfs/zfs_fuid_os.c
new file mode 100644
index 000000000000..ebd09abd65e0
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_fuid_os.c
@@ -0,0 +1,52 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2007, 2010, Oracle and/or its affiliates.
All rights reserved.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _KERNEL
+#include
+#include
+#include
+#endif
+#include
+/*
+ * Return the id to record for the caller's credential.
+ *
+ * type selects the uid (ZFS_OWNER) or gid (ZFS_GROUP) of cr; any other
+ * value trips the VERIFY.  An id for which IS_EPHEMERAL() is true is
+ * replaced by UID_NOBODY or GID_NOBODY respectively.  The zfsvfs and
+ * fuidp arguments are not used by this implementation.
+ */
+uint64_t
+zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type,
+    cred_t *cr, zfs_fuid_info_t **fuidp)
+{
+	uid_t		id;
+
+	VERIFY(type == ZFS_OWNER || type == ZFS_GROUP);
+
+	id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr);
+
+	if (IS_EPHEMERAL(id))
+		return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY);
+
+	return ((uint64_t)id);
+}
diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c
new file mode 100644
index 000000000000..4b7e8646709f
--- /dev/null
+++ b/module/os/freebsd/zfs/zfs_ioctl_os.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + int error = 0; + + if (*zfvp == NULL) + return (SET_ERROR(ESRCH)); + + error = vfs_busy((*zfvp)->z_vfs, 0); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + return (error); +} + +int +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_vfs != NULL); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + vfs_unbusy(zfsvfs->z_vfs); +} + +static const zfs_ioc_key_t zfs_keys_nextboot[] = { + {"command", DATA_TYPE_STRING, 0}, + { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0}, + { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0} +}; + +static int +zfs_ioc_jail(zfs_cmd_t *zc) +{ + + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ + + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int 
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) +{ + char name[MAXNAMELEN]; + spa_t *spa; + vdev_t *vd; + char *command; + uint64_t pool_guid; + uint64_t vdev_guid; + int error; + + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + return (EINVAL); + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (EINVAL); + if (nvlist_lookup_string(innvl, + "command", &command) != 0) + return (EINVAL); + + mutex_enter(&spa_namespace_lock); + spa = spa_by_guid(pool_guid, vdev_guid); + if (spa != NULL) + strcpy(name, spa_name(spa)); + mutex_exit(&spa_namespace_lock); + if (spa == NULL) + return (ENOENT); + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENXIO); + spa_close(spa, FTAG); + return (ENODEV); + } + error = vdev_label_write_pad2(vd, command, strlen(command)); + (void) spa_vdev_state_exit(spa, NULL, 0); + txg_wait_synced(spa->spa_dsl_pool, 0); + spa_close(spa, FTAG); + return (error); +} + + +void +zfs_ioctl_init_os(void) +{ + zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, + zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3); + +} diff --git a/module/os/freebsd/zfs/zfs_onexit_os.c b/module/os/freebsd/zfs/zfs_onexit_os.c new file mode 100644 index 000000000000..8b22f2fdc3b3 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_onexit_os.c @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_state(minor, ZST_ONEXIT); + if (*zo == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp, *tmpfp; + zfs_onexit_t *zo; + void *data; + int error; + + if ((error = zfs_file_get(fd, &fp))) + return (error); + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + if (error == 0) + *minorp = (minor_t)(uintptr_t)data; + curthread->td_fpop = tmpfp; + if (error != 0) + return (SET_ERROR(EBADF)); + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + zfs_file_put(fd); +} diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c new file mode 100644 index 000000000000..d6f7fc11e9bd --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -0,0 +1,2448 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_comutil.h" + +#ifndef MNTK_VMSETSIZE_BUG +#define MNTK_VMSETSIZE_BUG 0 +#endif +#ifndef MNTK_NOMSYNC +#define MNTK_NOMSYNC 8 +#endif + +/* BEGIN CSTYLED */ +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + +int zfs_debug_level; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, + "Debug level"); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); +/* END CSTYLED */ + +static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); +static int zfs_mount(vfs_t *vfsp); +static int zfs_umount(vfs_t *vfsp, int fflag); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor); +static int zfs_checkexp(vfs_t 
*vfsp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int **secflavors);
+static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
+static void zfs_freevfs(vfs_t *vfsp);
+
+struct vfsops zfs_vfsops = {
+	.vfs_mount =		zfs_mount,
+	.vfs_unmount =		zfs_umount,
+#if __FreeBSD_version >= 1300049
+	.vfs_root =		vfs_cache_root,
+	.vfs_cachedroot =	zfs_root,
+#else
+	.vfs_root =		zfs_root,
+#endif
+	.vfs_statfs =		zfs_statfs,
+	.vfs_vget =		zfs_vget,
+	.vfs_sync =		zfs_sync,
+	.vfs_checkexp =		zfs_checkexp,
+	.vfs_fhtovp =		zfs_fhtovp,
+	.vfs_quotactl =		zfs_quotactl,
+};
+
+VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+
+/*
+ * We need to keep a count of active fs's.
+ * This is necessary to prevent our module
+ * from being unloaded after a umount -f
+ */
+static uint32_t	zfs_active_fs_count = 0;
+/*
+ * Report the value of zfs_prop as overridden by the mount options of the
+ * mounted file system backing ds.
+ *
+ * *val is the stored value on entry.  If a mount option (or, for xattr,
+ * the ZSB_XATTR state) yields a different value, *val is updated and
+ * "temporary" is copied into setpoint.  Returns ENOENT when no zfsvfs is
+ * found or the property is not one handled here.
+ */
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+    char *setpoint)
+{
+	int error;
+	zfsvfs_t *zfvp;
+	vfs_t *vfsp;
+	objset_t *os;
+	uint64_t tmp = *val;
+
+	error = dmu_objset_from_ds(ds, &os);
+	if (error != 0)
+		return (error);
+
+	error = getzfsvfs_impl(os, &zfvp);
+	if (error != 0)
+		return (error);
+	if (zfvp == NULL)
+		return (ENOENT);
+	vfsp = zfvp->z_vfs;
+	/* In each case the positive option is tested last and so wins. */
+	switch (zfs_prop) {
+	case ZFS_PROP_ATIME:
+		if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_DEVICES:
+		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_EXEC:
+		if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_SETUID:
+		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+			tmp = 0;
+		if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+			tmp = 1;
+		break;
+	case ZFS_PROP_READONLY:
+		if (vfs_optionisset(vfsp, MNTOPT_RW, NULL))
+			tmp = 0;
+		if
(vfs_optionisset(vfsp, MNTOPT_RO, NULL)) + tmp = 1; + break; + case ZFS_PROP_XATTR: + if (zfvp->z_flags & ZSB_XATTR) + tmp = zfvp->z_xattr; + break; + case ZFS_PROP_NBMAND: + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) + tmp = 1; + break; + default: + vfs_unbusy(vfsp); + return (ENOENT); + } + + vfs_unbusy(vfsp); + if (tmp != *val) { + (void) strcpy(setpoint, "temporary"); + *val = tmp; + } + return (0); +} + +static int +zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) +{ + int error = 0; + char buf[32]; + uint64_t usedobj, quotaobj; + uint64_t quota, used = 0; + timespec_t now; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) { + error = ENOENT; + goto done; + } + (void) sprintf(buf, "%llx", (longlong_t)id); + if ((error = zap_lookup(zfsvfs->z_os, quotaobj, + buf, sizeof (quota), 1, "a)) != 0) { + dprintf("%s(%d): quotaobj lookup failed\n", + __FUNCTION__, __LINE__); + goto done; + } + /* + * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". + * So we set them to be the same. + */ + dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); + error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); + if (error && error != ENOENT) { + dprintf("%s(%d): usedobj failed; %d\n", + __FUNCTION__, __LINE__, error); + goto done; + } + dqp->dqb_curblocks = btodb(used); + dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; + vfs_timestamp(&now); + /* + * Setting this to 0 causes FreeBSD quota(8) to print + * the number of days since the epoch, which isn't + * particularly useful. 
+	 */
+	dqp->dqb_btime = dqp->dqb_itime = now.tv_sec;
+done:
+	return (error);
+}
+/*
+ * VFS quotactl entry point.
+ *
+ * cmds packs the quota command (upper bits, extracted with SUBCMDSHIFT)
+ * and the quota type (lower bits, SUBCMDMASK).  An id of -1 means "the
+ * caller": it is replaced by the real uid or gid of the current thread's
+ * credential depending on the type.
+ */
+static int
+zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	struct thread *td;
+	int cmd, type, error = 0;
+	int bitsize;
+	zfs_userquota_prop_t quota_type;
+	struct dqblk64 dqblk = { 0 };
+
+	td = curthread;
+	cmd = cmds >> SUBCMDSHIFT;
+	type = cmds & SUBCMDMASK;
+
+	ZFS_ENTER(zfsvfs);
+	if (id == -1) {
+		switch (type) {
+		case USRQUOTA:
+			id = td->td_ucred->cr_ruid;
+			break;
+		case GRPQUOTA:
+			id = td->td_ucred->cr_rgid;
+			break;
+		default:
+			error = EINVAL;
+			if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF)
+				vfs_unbusy(vfsp);
+			goto done;
+		}
+	}
+	/*
+	 * Map BSD type to:
+	 * ZFS_PROP_USERUSED,
+	 * ZFS_PROP_USERQUOTA,
+	 * ZFS_PROP_GROUPUSED,
+	 * ZFS_PROP_GROUPQUOTA
+	 *
+	 * quota_type is left unset for every other command.
+	 */
+	switch (cmd) {
+	case Q_SETQUOTA:
+	case Q_SETQUOTA32:
+		if (type == USRQUOTA)
+			quota_type = ZFS_PROP_USERQUOTA;
+		else if (type == GRPQUOTA)
+			quota_type = ZFS_PROP_GROUPQUOTA;
+		else
+			error = EINVAL;
+		break;
+	case Q_GETQUOTA:
+	case Q_GETQUOTA32:
+		if (type == USRQUOTA)
+			quota_type = ZFS_PROP_USERUSED;
+		else if (type == GRPQUOTA)
+			quota_type = ZFS_PROP_GROUPUSED;
+		else
+			error = EINVAL;
+		break;
+	}
+
+	/*
+	 * Depending on the cmd, we may need to get
+	 * the ruid and domain (see fuidstr_to_sid?),
+	 * the fuid (how?), or other information.
+	 * Create fuid using zfs_fuid_create(zfsvfs, id,
+	 * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)?
+	 * I think I can use just the id?
+	 *
+	 * Look at zfs_id_overquota() to look up a quota.
+	 * zap_lookup(something, quotaobj, fuidstring,
+	 *     sizeof (long long), 1, &quota)
+	 *
+	 * See zfs_set_userquota() to set a quota.
+ */ + if ((uint32_t)type >= MAXQUOTAS) { + error = EINVAL; + goto done; + } + + switch (cmd) { + case Q_GETQUOTASIZE: + bitsize = 64; + error = copyout(&bitsize, arg, sizeof (int)); + break; + case Q_QUOTAON: + // As far as I can tell, you can't turn quotas on or off on zfs + error = 0; + vfs_unbusy(vfsp); + break; + case Q_QUOTAOFF: + error = ENOTSUP; + vfs_unbusy(vfsp); + break; + case Q_SETQUOTA: + error = copyin(&dqblk, arg, sizeof (dqblk)); + if (error == 0) + error = zfs_set_userquota(zfsvfs, quota_type, + "", id, dbtob(dqblk.dqb_bhardlimit)); + break; + case Q_GETQUOTA: + error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); + if (error == 0) + error = copyout(&dqblk, arg, sizeof (dqblk)); + break; + default: + error = EINVAL; + break; + } +done: + ZFS_EXIT(zfsvfs); + return (error); +} + + +boolean_t +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); +} + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor) +{ + + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * Ignore the system syncher. ZFS already commits async data + * at zfs_txg_timeout intervals. + */ + if (waitfor == MNT_LAZY) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; + int error; + + error = vfs_stdsync(vfsp, waitfor); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (rebooting && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). 
Unlike other filesystems, ZFS honors the
+		 * request by waiting for all pools to commit all dirty data.
+		 */
+		spa_sync_allpools();
+	}
+
+	return (0);
+}
+/*
+ * "atime" property callback: update z_atime, the MNT_NOATIME bit in
+ * vfs_flag and the atime/noatime mount option strings to match newval.
+ */
+static void
+atime_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == TRUE) {
+		zfsvfs->z_atime = TRUE;
+		zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
+	} else {
+		zfsvfs->z_atime = FALSE;
+		zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
+	}
+}
+/*
+ * "xattr" property callback: ZFS_XATTR_OFF clears ZSB_XATTR; any other
+ * value sets it and records in z_xattr_sa whether newval is ZFS_XATTR_SA.
+ */
+static void
+xattr_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == ZFS_XATTR_OFF) {
+		zfsvfs->z_flags &= ~ZSB_XATTR;
+	} else {
+		zfsvfs->z_flags |= ZSB_XATTR;
+
+		if (newval == ZFS_XATTR_SA)
+			zfsvfs->z_xattr_sa = B_TRUE;
+		else
+			zfsvfs->z_xattr_sa = B_FALSE;
+	}
+}
+/*
+ * "recordsize" property callback: newval is asserted to be a power of two
+ * between SPA_MINBLOCKSIZE and the pool's maximum block size, then stored
+ * as z_max_blksz and as the mount's f_iosize.
+ */
+static void
+blksz_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+	ASSERT(ISP2(newval));
+
+	zfsvfs->z_max_blksz = newval;
+	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
+}
+/*
+ * "readonly" property callback: set or clear VFS_RDONLY and swap the
+ * ro/rw mount option strings to match newval.
+ */
+static void
+readonly_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval) {
+		/* XXX locking on vfs_flag? */
+		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
+	} else {
+		/* XXX locking on vfs_flag?
*/
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
+	}
+}
+/*
+ * "setuid" property callback: set or clear VFS_NOSETUID and swap the
+ * setuid/nosetuid mount option strings to match newval.
+ */
+static void
+setuid_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
+	}
+}
+/*
+ * "exec" property callback: set or clear VFS_NOEXEC and swap the
+ * exec/noexec mount option strings to match newval.
+ */
+static void
+exec_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	if (newval == FALSE) {
+		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
+	} else {
+		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
+	}
+}
+
+/*
+ * The nbmand mount option can be changed at mount time.
+ * We can't allow it to be toggled on live file systems or incorrect
+ * behavior may be seen from cifs clients
+ *
+ * This property isn't registered via dsl_prop_register(), but this callback
+ * will be called when a file system is first mounted
+ */
+static void
+nbmand_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+	if (newval == FALSE) {
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
+	} else {
+		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
+		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
+	}
+}
+/*
+ * "snapdir" property callback: record newval in z_show_ctldir.
+ */
+static void
+snapdir_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_show_ctldir = newval;
+}
+/*
+ * "vscan" property callback: record newval in z_vscan.
+ */
+static void
+vscan_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_vscan = newval;
+}
+/*
+ * "aclmode" property callback: record newval in z_acl_mode.
+ */
+static void
+acl_mode_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_mode = newval;
+}
+/*
+ * "aclinherit" property callback: record newval in z_acl_inherit.
+ */
+static void
+acl_inherit_changed_cb(void *arg, uint64_t newval)
+{
+	zfsvfs_t *zfsvfs = arg;
+
+	zfsvfs->z_acl_inherit = newval;
+}
+/*
+ * Register the dataset property callbacks for the file system mounted at
+ * vfsp, then re-apply any mount options given explicitly so that they
+ * override the stored property values.  Returns EOPNOTSUPP for snapshots.
+ */
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+	struct dsl_dataset *ds = NULL;
+	objset_t *os = NULL;
+	zfsvfs_t *zfsvfs = NULL;
+	uint64_t nbmand;
+	/* Each value has a do_* flag recording that a mount option set it. */
+	boolean_t readonly = B_FALSE;
+	boolean_t do_readonly = B_FALSE;
+	boolean_t setuid = B_FALSE;
+	boolean_t do_setuid = B_FALSE;
+	boolean_t exec = B_FALSE;
+	boolean_t do_exec = B_FALSE;
+	boolean_t xattr = B_FALSE;
+	boolean_t atime = B_FALSE;
+	boolean_t do_atime = B_FALSE;
+	boolean_t do_xattr = B_FALSE;
+	int error = 0;
+
+	ASSERT(vfsp);
+	zfsvfs = vfsp->vfs_data;
+	ASSERT(zfsvfs);
+	os = zfsvfs->z_os;
+
+	/*
+	 * This function can be called for a snapshot when we update snapshot's
+	 * mount point, which isn't really supported.
+	 */
+	if (dmu_objset_is_snapshot(os))
+		return (EOPNOTSUPP);
+
+	/*
+	 * The act of registering our callbacks will destroy any mount
+	 * options we may have.
In order to enable temporary overrides
+	 * of mount options, we stash away the current values and
+	 * restore them after we register the callbacks.
+	 */
+	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
+	    !spa_writeable(dmu_objset_spa(os))) {
+		/* A non-writeable pool forces read-only, whatever was asked. */
+		readonly = B_TRUE;
+		do_readonly = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+		readonly = B_FALSE;
+		do_readonly = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+		setuid = B_FALSE;
+		do_setuid = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+		setuid = B_TRUE;
+		do_setuid = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+		exec = B_FALSE;
+		do_exec = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+		exec = B_TRUE;
+		do_exec = B_TRUE;
+	}
+	/*
+	 * NOTE(review): xattr is declared boolean_t but is assigned
+	 * ZFS_XATTR_* values here -- confirm the type is wide enough.
+	 */
+	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF;
+		do_xattr = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+		do_xattr = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR;
+		do_xattr = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) {
+		zfsvfs->z_xattr = xattr = ZFS_XATTR_SA;
+		do_xattr = B_TRUE;
+	}
+	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
+		atime = B_FALSE;
+		do_atime = B_TRUE;
+	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
+		atime = B_TRUE;
+		do_atime = B_TRUE;
+	}
+
+	/*
+	 * We need to enter pool configuration here, so that we can use
+	 * dsl_prop_get_int_ds() to handle the special nbmand property below.
+	 * dsl_prop_get_integer() can not be used, because it has to acquire
+	 * spa_namespace_lock and we can not do that because we already hold
+	 * z_teardown_lock.  The problem is that spa_write_cachefile() is called
+	 * with spa_namespace_lock held and the function calls ZFS vnode
+	 * operations to write the cache file and thus z_teardown_lock is
+	 * acquired after spa_namespace_lock.
+ */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + return (error); + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool\n. Pool must be upgraded to mount " + "this file system.", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
+ */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], + 8, 1, &zfsvfs->z_projectquota_obj); + if (error == ENOENT) + zfsvfs->z_projectquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], + 8, 1, &zfsvfs->z_userobjquota_obj); + if (error == ENOENT) + zfsvfs->z_userobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + 
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], + 8, 1, &zfsvfs->z_groupobjquota_obj); + if (error == ENOENT) + zfsvfs->z_groupobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], + 8, 1, &zfsvfs->z_projectobjquota_obj); + if (error == ENOENT) + zfsvfs->z_projectobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). + */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + + return (0); +} + +taskq_t *zfsvfs_taskq; + +static void +zfsvfs_task_unlinked_drain(void *context, int pending __unused) +{ + + zfs_unlinked_drain((zfsvfs_t *)context); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); + + /* + * XXX: Fix struct statfs so this isn't necessary! + * + * The 'osname' is used as the filesystem's special node, which means + * it must fit in statfs.f_mntfromname, or else it can't be + * enumerated, so libzfs_mnttab_find() returns NULL, which causes + * 'zfs unmount' to think it's not mounted when it is. 
+ */ + if (strlen(osname) >= MNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, + &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + + return (error); +} + + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, + zfsvfs_task_unlinked_drain, zfsvfs); +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. 
+ */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) + return (SET_ERROR(EROFS)); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) { + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + } else { + dsl_dir_t *dd; + + zfs_unlinked_drain(zfsvfs); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; + } + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. 
+ */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + boolean_t use_nc = zfsvfs->z_use_namecache; + zfsvfs->z_use_namecache = B_FALSE; + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + zfsvfs->z_use_namecache = use_nc; + } + } + + /* restore readonly bit */ + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i; + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. 
+ */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + ASSERT(zfsvfs->z_nr_znodes == 0); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) +{ + uint64_t recordsize, fsid_guid; + int error = 0; + zfsvfs_t *zfsvfs; + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + if ((error = dsl_prop_get_integer(osname, + "recordsize", &recordsize, NULL))) + goto out; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + 
vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + /* + * This can cause a loss of coherence between ARC and page cache + * on ZoF - unclear if the problem is in FreeBSD or ZoF + */ + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ + vfsp->mnt_kern_flag |= MNTK_NOMSYNC; + vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; + + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + (vfsp->mnt_vfc->vfc_typenum & 0xFF); + + /* + * Set features for file system. 
+ */ + zfs_set_fuid_feature(zfsvfs); + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, + "xattr", &pval, NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + vfs_mountedfrom(vfsp, osname); + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); +} + +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (SET_ERROR(EINVAL)); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number'. Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". 
+ */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (SET_ERROR(EINVAL)); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (SET_ERROR(EACCES)); + return (rdonly ? 0 : EACCES); + } + return (SET_ERROR(EACCES)); +} + +/* + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns 0 if access allowed, error otherwise (e.g. EACCES) + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. 
+ */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (SET_ERROR(EACCES)); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (SET_ERROR(EACCES)); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(vfsp->vfs_mntpt, B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (SET_ERROR(EACCES)); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. 
+ */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp) +{ + kthread_t *td = curthread; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (SET_ERROR(EINVAL)); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. + */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) + goto out; + + if (!(vfsp->vfs_flag & MS_REMOUNT)) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. 
+ */ + + vattr.va_mask = AT_UID; + + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK1(mvp); + goto out; + } + + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK1(mvp); + goto out; + } + VOP_UNLOCK1(mvp); + } + + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = SET_ERROR(EPERM); + goto out; + } + +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + + vfsp->vfs_flag |= MNT_NFS4ACLS; + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (vfsp->vfs_flag & MS_REMOUNT) { + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Refresh mount options with z_teardown_lock blocking I/O while + * the filesystem is in an inconsistent state. + * The lock also serializes this code with filesystem + * manipulations between entry to zfs_suspend_fs() and return + * from zfs_resume_fs(). + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + goto out; + } + + /* Initial root mount: try hard to import the requested root pool. 
*/
+	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
+	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
+		char pname[MAXNAMELEN];
+
+		error = getpoolname(osname, pname);
+		if (error == 0)
+			error = spa_import_rootpool(pname);
+		if (error)
+			goto out;
+	}
+	DROP_GIANT();
+	error = zfs_domount(vfsp, osname);
+	PICKUP_GIANT();
+
+out:
+	return (error);
+}
+
+static int
+zfs_statfs(vfs_t *vfsp, struct statfs *statp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	uint64_t refdbytes, availbytes, usedobjs, availobjs;
+
+	statp->f_version = STATFS_VERSION;
+
+	ZFS_ENTER(zfsvfs);
+
+	dmu_objset_space(zfsvfs->z_os,
+	    &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+	/*
+	 * The underlying storage pool actually uses multiple block sizes.
+	 * We report f_bsize as the smallest block size we support, and
+	 * f_iosize as the recordsize that zfs_domount() cached in mnt_stat.
+	 */
+	statp->f_bsize = SPA_MINBLOCKSIZE;
+	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
+
+	/*
+	 * The following report "total" blocks of various kinds in the
+	 * file system, in units of f_bsize (f_blocks shifts by
+	 * SPA_MINBLOCKSHIFT, presumably the same unit).
+	 */
+
+	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
+	statp->f_bfree = availbytes / statp->f_bsize;
+	statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+	/*
+	 * statvfs() should really be called statufs(), because it assumes
+	 * static metadata. ZFS doesn't preallocate files, so the best
+	 * we can do is report the max that could possibly fit in f_files,
+	 * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of objects available
+	 * and the number of blocks (each object will take at least a block).
+	 */
+	statp->f_ffree = MIN(availobjs, statp->f_bfree);
+	statp->f_files = statp->f_ffree + usedobjs;
+
+	/*
+	 * We're a zfs filesystem.
+ */ + strlcpy(statp->f_fstypename, "zfs", + sizeof (statp->f_fstypename)); + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + sizeof (statp->f_mntfromname)); + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + sizeof (statp->f_mntonname)); + + statp->f_namemax = MAXNAMELEN - 1; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + + if (error == 0) { + error = vn_lock(*vpp, flags); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } + } + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + dsl_dir_t *dd; + + /* + * If someone has not already unmounted this file system, + * drain the zrele_taskq to ensure all active references to the + * zfsvfs_t have been handled only then can it be safely destroyed. + */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but zreles run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. + * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. 
+		 */
+		int round = 0;
+		while (zfsvfs->z_nr_znodes > 0) {
+			taskq_wait_outstanding(dsl_pool_zrele_taskq(
+			    dmu_objset_pool(zfsvfs->z_os)), 0);
+			if (++round > 1 && !unmounting)
+				break;
+		}
+	}
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+	if (!unmounting) {
+		/*
+		 * We purge the parent filesystem's vfsp as the parent
+		 * filesystem and all of its snapshots have their vnode's
+		 * v_vfsp set to the parent's filesystem's vfsp. Note,
+		 * 'z_parent' is self referential for non-snapshots.
+		 */
+#ifdef FREEBSD_NAMECACHE
+		cache_purgevfs(zfsvfs->z_parent->z_vfs, true);
+#endif
+	}
+
+	/*
+	 * Close the zil. NB: Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+	/*
+	 * If we are not unmounting (ie: online recv) and someone already
+	 * unmounted this file system while we were doing the switcheroo,
+	 * or a reopen of z_os failed then just bail out now.
+	 */
+	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * At this point there are no vops active, and any new vops will
+	 * fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
+	 *
+	 * Release all holds on dbufs.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp))
+		if (zp->z_sa_hdl) {
+			ASSERT(ZTOV(zp)->v_count >= 0);
+			zfs_znode_dmu_fini(zp);
+		}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	/*
+	 * If we are unmounting, set the unmounted flag and let new vops
+	 * unblock. zfs_inactive will have the unmounted behavior, and all
+	 * other vops will fail with EIO.
+ */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (!zfs_is_readonly(zfsvfs)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + dmu_objset_evict_dbufs(zfsvfs->z_os); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag) +{ + kthread_t *td = curthread; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + cred_t *cr = td->td_ucred; + int ret; + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + if (dsl_deleg_access((char *)vfsp->vfs_resource, + ZFS_DELEG_PERM_MOUNT, cr)) + return (ret); + } + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + } + + if (fflag & MS_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * Flush all the files. + */ + ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? 
FORCECLOSE : 0, td); + if (ret != 0) + return (ret); + while (taskqueue_cancel(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task, NULL) != 0) + taskqueue_drain(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + /* + * We can now safely destroy the '.zfs' directory node. + */ + if (zfsvfs->z_ctldir != NULL) + zfsctl_destroy(zfsvfs); + zfs_freevfs(vfsp); + + return (0); +} + +static int +zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + int err; + + /* + * zfs_zget() can't operate on virtual entries like .zfs/ or + * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. + * This will make NFS to switch to LOOKUP instead of using VGET. 
+ */ + if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || + (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) + return (EOPNOTSUPP); + + ZFS_ENTER(zfsvfs); + err = zfs_zget(zfsvfs, ino, &zp); + if (err == 0 && zp->z_unlinked) { + vrele(ZTOV(zp)); + err = EINVAL; + } + if (err == 0) + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + if (err == 0) { + err = vn_lock(*vpp, flags); + if (err != 0) + vrele(*vpp); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * If this is regular file system vfsp is the same as + * zfsvfs->z_parent->z_vfs, but if it is snapshot, + * zfsvfs->z_parent->z_vfs represents parent file system + * which we have to use here, because only this file system + * has mnt_export configured. + */ + return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, + credanonp, numsecflavors, secflavors)); +} + +CTASSERT(SHORT_FID_LEN <= sizeof (struct fid)); +CTASSERT(LONG_FID_LEN <= sizeof (struct fid)); + +static int +zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) +{ + struct componentname cn; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *zp; + vnode_t *dvp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *vpp = NULL; + + ZFS_ENTER(zfsvfs); + + /* + * On FreeBSD we can get snapshot's mount point or its parent file + * system mount point depending if snapshot is already mounted or not. 
+ */ + if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + ZFS_EXIT(zfsvfs); + + err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); + if (err) + return (SET_ERROR(EINVAL)); + ZFS_ENTER(zfsvfs); + } + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * A zero fid_gen means we are in .zfs or the .zfs/snapshot + * directory tree. If the object == zfsvfs->z_shares_dir, then + * we are in the .zfs/shares directory tree. + */ + if ((fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || + (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { + ZFS_EXIT(zfsvfs); + VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); + if (object == ZFSCTL_INO_SNAPDIR) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | LOCKLEAF; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else if (object == zfsvfs->z_shares_dir) { + /* + * XXX This branch must not be taken, + * if it is, then the lookup below will + * explode. 
+ */ + cn.cn_nameptr = "shares"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else { + *vpp = dvp; + } + return (err); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if ((err = zfs_zget(zfsvfs, object, &zp))) { + ZFS_EXIT(zfsvfs); + return (err); + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + vrele(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + err = vn_lock(*vpp, flags); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. 
+ */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + ds->ds_dir->dd_activity_cancelled = B_FALSE; + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + +bail: + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. 
+ */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { + vfs_ref(zfsvfs->z_vfs); + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + } + return (err); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); +} + +#ifdef __i386__ +static int desiredvnodes_backup; +#endif + +static void +zfs_vnodes_adjust(void) +{ +#ifdef __i386__ + int newdesiredvnodes; + + desiredvnodes_backup = desiredvnodes; + + /* + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. + */ + newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof (struct vm_object) + + sizeof (struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif +} + +static void +zfs_vnodes_adjust_back(void) +{ + +#ifdef __i386__ + desiredvnodes = desiredvnodes_backup; +#endif +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + /* + * Reduce number of vnodes. Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. + */ + zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); + + zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); +} + +void +zfs_fini(void) +{ + taskq_destroy(zfsvfs_taskq); + zfsctl_fini(); + zfs_znode_fini(); + zfs_vnodes_adjust_back(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +/* + * Release VOPs and unmount a suspended filesystem. 
+ */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. + */ + (void) zfs_umount(zfsvfs->z_vfs, 0); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = 
zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %lu to %lu", zfsvfs->z_version, newvers); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. 
+ */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the corresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated.
+ */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void) strlcpy(fromname, newname, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", + newname, fromname + oldlen); + (void) strlcpy(fromname, tmpbuf, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); +} +#endif diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c new file mode 100644 index 000000000000..d7b92035f7c6 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vnops.c @@ -0,0 +1,6533 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#ifndef VN_OPEN_INVFS +#define VN_OPEN_INVFS 0x0 +#endif + +#if __FreeBSD_version >= 1300047 +#define vm_page_wire_lock(pp) +#define vm_page_wire_unlock(pp) +#else +#define vm_page_wire_lock(pp) vm_page_lock(pp) +#define vm_page_wire_unlock(pp) vm_page_unlock(pp) +#endif + +static int +zfs_u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum) +{ + + return (u8_validate(__DECONST(char *, u8str), n, list, flag, errnum)); +} +#define u8_validate zfs_u8_validate + +#ifdef DEBUG_VFS_LOCKS +#define VNCHECKREF(vp) \ + VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ + ("%s: wrong ref counts", __func__)); +#else +#define VNCHECKREF(vp) +#endif + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. 
To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). 
+ * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if ((flag & (FSYNC | FDSYNC)) && (count == 1)) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && 
zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(fs_vscan(vp, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and + * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off) +{ + znode_t *zp = VTOZ(vp); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == _FIO_SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +/* ARGSUSED */ +static int +zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, + int *rvalp) +{ + offset_t off; + int error; + zfsvfs_t *zfsvfs; + znode_t *zp; + + switch (com) { + case _FIOFFS: + { + return (0); + + /* + * The following two ioctls are used by bfu. Faking out, + * necessary to avoid bfu errors. 
+ */ + } + case _FIOGDIO: + case _FIOSDIO: + { + return (0); + } + + case _FIO_SEEK_DATA: + case _FIO_SEEK_HOLE: + { + off = *(offset_t *)data; + zp = VTOZ(vp); + zfsvfs = zp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* offset parameter is in/out */ + error = zfs_holey(vp, com, &off); + ZFS_EXIT(zfsvfs); + if (error) + return (error); + *(offset_t *)data = off; + return (0); + } + } + return (SET_ERROR(ENOTTY)); +} + +static vm_page_t +page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) +{ + vm_object_t obj; + vm_page_t pp; + int64_t end; + + /* + * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE + * aligned boundaries, if the range is not aligned. As a result a + * DEV_BSIZE subrange with partially dirty data may get marked as clean. + * It may happen that all DEV_BSIZE subranges are marked clean and thus + * the whole page would be considered clean despite having some dirty + * data. For this reason we should shrink the range to DEV_BSIZE aligned + * boundaries before calling vm_page_clear_dirty. + * + * NOTE(review): if the range starts unaligned and ends before the next + * DEV_BSIZE boundary (e.g. off = 10, nbytes = 20), end rounds down + * below the rounded-up off and nbytes becomes negative rather than + * zero; the nbytes != 0 tests further down do not filter that case -- + * confirm it cannot occur or is handled. + */ + end = rounddown2(off + nbytes, DEV_BSIZE); + off = roundup2(off, DEV_BSIZE); + nbytes = end - off; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); +#if __FreeBSD_version < 1300050 + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it.
+ */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + vm_page_sbusy(pp); + } else if (pp != NULL) { + ASSERT(!pp->valid); + pp = NULL; + } + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } + break; + } +#else + vm_page_grab_valid(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | + VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } +#endif + return (pp); +} + +static void +page_unbusy(vm_page_t pp) +{ + + vm_page_sunbusy(pp); +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(pp->object); +#else + vm_object_pip_subtract(pp->object, 1); +#endif +} + +#if __FreeBSD_version > 1300051 +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t m; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + vm_page_grab_valid(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | + VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); + return (m); +} +#else +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t pp; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. 
+ */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_wire_lock(pp); + vm_page_hold(pp); + vm_page_wire_unlock(pp); + + } else + pp = NULL; + break; + } + return (pp); +} +#endif + +static void +page_unhold(vm_page_t pp) +{ + + vm_page_wire_lock(pp); +#if __FreeBSD_version >= 1300035 + vm_page_unwire(pp, PQ_ACTIVE); +#else + vm_page_unhold(pp); +#endif + vm_page_wire_unlock(pp); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. + */ +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, + int segflg, dmu_tx_t *tx) +{ + vm_object_t obj; + struct sf_buf *sf; + caddr_t va; + int off; + + ASSERT(segflg != UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + off = start & PAGEOFFSET; + zfs_vmobject_wlock(obj); +#if __FreeBSD_version >= 1300041 + vm_object_pip_add(obj, 1); +#endif + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + int nbytes = imin(PAGESIZE - off, len); + + if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { + zfs_vmobject_wunlock(obj); + + va = zfs_map_page(pp, &sf); + (void) dmu_read(os, oid, start+off, nbytes, + va+off, DMU_READ_PREFETCH); + zfs_unmap_page(sf); + + zfs_vmobject_wlock(obj); + page_unbusy(pp); + } + len -= nbytes; + off = 0; + } +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(obj); +#else + vm_object_pip_wakeupn(obj, 0); +#endif + zfs_vmobject_wunlock(obj); +} + +/* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. 
+ * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain exclusive busy on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. + */ +static int +mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + vm_page_t pp; + int64_t start; + caddr_t va; + int len = nbytes; + int error = 0; + + ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + + zfs_vmobject_wlock(obj); + for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | + VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (vm_page_none_valid(pp)) { + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + vm_page_do_sunbusy(pp); +#if __FreeBSD_version >= 1300047 && __FreeBSD_version < 1300051 +#error "unsupported version window" +#elif __FreeBSD_version >= 1300051 + if (error == 0) { + vm_page_valid(pp); + vm_page_lock(pp); + vm_page_activate(pp); + vm_page_unlock(pp); + } + vm_page_do_sunbusy(pp); + if (error != 0 && !vm_page_wired(pp) == 0 && + pp->valid == 0 && vm_page_tryxbusy(pp)) + vm_page_free(pp); +#else + vm_page_lock(pp); + if (error) { + if (pp->wire_count == 0 && pp->valid == 0 && + !vm_page_busied(pp)) + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); +#endif + } else { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(pp); + } + if (error) + break; + uio->uio_resid -= bytes; + uio->uio_offset += bytes; + len -= bytes; + } + 
zfs_vmobject_wunlock(obj); + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + vm_object_t obj; + int64_t start; + int len = nbytes; + int off; + int error = 0; + + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + zfs_vmobject_wlock(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if ((pp = page_hold(vp, start))) { + struct sf_buf *sf; + caddr_t va; + + zfs_vmobject_wunlock(obj); + va = zfs_map_page(pp, &sf); + error = vn_io_fault_uiomove(va + off, bytes, uio); + zfs_unmap_page(sf); + zfs_vmobject_wlock(obj); + page_unhold(pp); + } else { + zfs_vmobject_wunlock(obj); + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + zfs_vmobject_wlock(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + zfs_vmobject_wunlock(obj); + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. 
+ *
+ * Side Effects:
+ *	vp - atime updated if byte count > 0
+ */
+/* ARGSUSED */
+static int
+zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	zfs_locked_range_t *lr;
+	ssize_t left, chunk;
+	int error = 0;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Quarantined files are not readable. */
+	if ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EACCES));
+	}
+
+	/* Reject a negative starting offset. */
+	if (uio->uio_loffset < (offset_t)0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/* A zero-length request completes immediately. */
+	if (uio->uio_resid == 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	/* FRSYNC or sync=always: commit this znode's log records first. */
+	if (zfsvfs->z_log != NULL &&
+	    ((ioflag & FRSYNC) != 0 ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS))
+		zil_commit(zfsvfs->z_log, zp->z_id);
+
+	/* Hold the requested range as a reader for the whole transfer. */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset,
+	    uio->uio_resid, RL_READER);
+
+	/*
+	 * A read that starts at or beyond end-of-file transfers nothing,
+	 * but still falls through to the access time stamp below.
+	 */
+	if (uio->uio_loffset < zp->z_size) {
+		/* Never read past end-of-file. */
+		for (left = MIN(uio->uio_resid,
+		    zp->z_size - uio->uio_loffset); left > 0; left -= chunk) {
+			/* Stop each piece at a zfs_read_chunk_size boundary. */
+			chunk = MIN(left, zfs_read_chunk_size -
+			    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
+
+			if (uio->uio_segflg == UIO_NOCOPY) {
+				error = mappedread_sf(vp, chunk, uio);
+			} else if (vn_has_cached_data(vp)) {
+				error = mappedread(vp, chunk, uio);
+			} else {
+				error = dmu_read_uio_dbuf(
+				    sa_get_db(zp->z_sa_hdl), uio, chunk);
+			}
+
+			/* convert checksum errors into IO errors */
+			if (error == ECKSUM)
+				error = SET_ERROR(EIO);
+			if (error != 0)
+				break;
+		}
+	}
+
+	zfs_rangelock_exit(lr);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Write the bytes to a file.
+ *
+ *	IN:	vp	- vnode of file to be written to.
+ *		uio	- structure supplying write location, range info,
+ *			  and data buffer.
+ *		ioflag	- FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
+ *			  set if in append mode.
+ *		cr	- credentials of caller.
+ *		ct	- caller context (NFS/CIFS fem monitor only)
+ *
+ *	OUT:	uio	- updated offset and range.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - ctime|mtime updated if byte count > 0
+ */
+
+/* ARGSUSED */
+static int
+zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr)
+{
+	znode_t		*zp = VTOZ(vp);
+	rlim64_t	limit = MAXOFFSET_T;
+	ssize_t		start_resid = uio->uio_resid;
+	ssize_t		tx_bytes;
+	uint64_t	end_size;
+	dmu_buf_impl_t	*db;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zilog_t		*zilog;
+	offset_t	woff;
+	ssize_t		n, nbytes;
+	zfs_locked_range_t		*lr;
+	int		max_blksz = zfsvfs->z_max_blksz;
+	int		error = 0;
+	arc_buf_t	*abuf;
+	iovec_t		*aiov = NULL;
+	xuio_t		*xuio = NULL;
+	int		i_iov = 0;
+	int		iovcnt __unused = uio->uio_iovcnt;
+	iovec_t		*iovp = uio->uio_iov;
+	int		write_eof;
+	int		count = 0;
+	sa_bulk_attr_t	bulk[4];
+	uint64_t	mtime[2], ctime[2];
+	uint64_t	uid, gid, projid;
+
+	/*
+	 * Fasttrack empty write
+	 */
+	n = start_resid;
+	if (n == 0)
+		return (0);
+
+	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
+		limit = MAXOFFSET_T;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+	    &zp->z_size, 8);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+	    &zp->z_pflags, 8);
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	/*
+	 * If immutable or not appending then return EPERM.
+	 * Intentionally allow ZFS_READONLY through here.
+	 * See zfs_zaccess_common()
+	 */
+	if ((zp->z_pflags & ZFS_IMMUTABLE) ||
+	    ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
+	    (uio->uio_loffset < zp->z_size))) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * Validate file offset
+	 */
+	woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
+	if (woff < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * If in append mode, set the io offset pointer to eof.
+	 */
+	if (ioflag & FAPPEND) {
+		/*
+		 * Obtain an appending range lock to guarantee file append
+		 * semantics.  We reset the write offset once we have the lock.
+		 */
+		lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
+		woff = lr->lr_offset;
+		if (lr->lr_length == UINT64_MAX) {
+			/*
+			 * We overlocked the file because this write will cause
+			 * the file block size to increase.
+			 * Note that z_size cannot change with this lock held.
+			 */
+			woff = zp->z_size;
+		}
+		uio->uio_loffset = woff;
+	} else {
+		/*
+		 * Note that if the file block size will change as a result of
+		 * this write, then this range lock will lock the entire file
+		 * so that we can re-write the block safely.
+		 */
+		lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
+	}
+
+	if (vn_rlimit_fsize(vp, uio, uio->uio_td)) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (EFBIG);
+	}
+
+	if (woff >= limit) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EFBIG));
+	}
+
+	if ((woff + n) > limit || woff > (limit - n))
+		n = limit - woff;
+
+	/* Will this write extend the file length? */
+	write_eof = (woff + n > zp->z_size);
+
+	end_size = MAX(zp->z_size, woff + n);
+
+	uid = zp->z_uid;
+	gid = zp->z_gid;
+	projid = zp->z_projid;
+
+	/*
+	 * Write the file in reasonable size chunks.  Each chunk is written
+	 * in a separate transaction; this keeps the intent log records small
+	 * and allows us to do more fine-grained space accounting.
+	 */
+	while (n > 0) {
+		woff = uio->uio_loffset;
+
+		if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) ||
+		    zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) ||
+		    (projid != ZFS_DEFAULT_PROJID &&
+		    zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+		    projid))) {
+			error = SET_ERROR(EDQUOT);
+			break;
+		}
+
+		abuf = NULL;
+		if (xuio) {
+			ASSERT(i_iov < iovcnt);
+			aiov = &iovp[i_iov];
+			abuf = dmu_xuio_arcbuf(xuio, i_iov);
+			dmu_xuio_clear(xuio, i_iov);
+			DTRACE_PROBE3(zfs_cp_write, int, i_iov,
+			    iovec_t *, aiov, arc_buf_t *, abuf);
+			ASSERT((aiov->iov_base == abuf->b_data) ||
+			    ((char *)aiov->iov_base - (char *)abuf->b_data +
+			    aiov->iov_len == arc_buf_size(abuf)));
+			i_iov++;
+		} else if (n >= max_blksz &&
+		    woff >= zp->z_size &&
+		    P2PHASE(woff, max_blksz) == 0 &&
+		    zp->z_blksz == max_blksz) {
+			/*
+			 * This write covers a full block.  "Borrow" a buffer
+			 * from the dmu so that we can fill it before we enter
+			 * a transaction.  This avoids the possibility of
+			 * holding up the transaction if the data copy hangs
+			 * up on a pagefault (e.g., from an NFS server mapping).
+			 */
+			size_t cbytes;
+
+			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
+			    max_blksz);
+			ASSERT(abuf != NULL);
+			ASSERT(arc_buf_size(abuf) == max_blksz);
+			if ((error = uiocopy(abuf->b_data, max_blksz,
+			    UIO_WRITE, uio, &cbytes))) {
+				dmu_return_arcbuf(abuf);
+				break;
+			}
+			ASSERT(cbytes == max_blksz);
+		}
+
+		/*
+		 * Start a transaction.
+		 */
+		tx = dmu_tx_create(zfsvfs->z_os);
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+		DB_DNODE_ENTER(db);
+		dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff,
+		    MIN(n, max_blksz));
+		DB_DNODE_EXIT(db);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+			if (abuf != NULL)
+				dmu_return_arcbuf(abuf);
+			break;
+		}
+
+		/*
+		 * If zfs_rangelock_enter() over-locked we grow the blocksize
+		 * and then reduce the lock range.  This will only happen
+		 * on the first iteration since zfs_rangelock_reduce() will
+		 * shrink down lr_length to the appropriate size.
+		 */
+		if (lr->lr_length == UINT64_MAX) {
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property.  Only let it grow to
+				 * the next power of 2.
+				 */
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_rangelock_reduce(lr, woff, n);
+		}
+
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+		if (woff + nbytes > zp->z_size)
+			vnode_pager_setsize(vp, woff + nbytes);
+
+		if (abuf == NULL) {
+			tx_bytes = uio->uio_resid;
+			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes, tx);
+			tx_bytes -= uio->uio_resid;
+		} else {
+			tx_bytes = nbytes;
+			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+			/*
+			 * If this is not a full block write, but we are
+			 * extending the file past EOF and this data starts
+			 * block-aligned, use assign_arcbuf().  Otherwise,
+			 * write via dmu_write().
+			 */
+			if (tx_bytes < max_blksz && (!write_eof ||
+			    aiov->iov_base != abuf->b_data)) {
+				ASSERT(xuio);
+				dmu_write(zfsvfs->z_os, zp->z_id, woff,
+				    aiov->iov_len, aiov->iov_base, tx);
+				dmu_return_arcbuf(abuf);
+				xuio_stat_wbuf_copied();
+			} else {
+				ASSERT(xuio || tx_bytes == max_blksz);
+				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff,
+				    abuf, tx);
+			}
+			ASSERT(tx_bytes <= uio->uio_resid);
+			uioskip(uio, tx_bytes);
+		}
+		if (tx_bytes && vn_has_cached_data(vp)) {
+			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+			    zp->z_id, uio->uio_segflg, tx);
+		}
+
+		/*
+		 * If we made no progress, we're done.  If we made even
+		 * partial progress, update the znode and ZIL accordingly.
+		 */
+		if (tx_bytes == 0) {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+			    (void *)&zp->z_size, sizeof (uint64_t), tx);
+			dmu_tx_commit(tx);
+			ASSERT(error != 0);
+			break;
+		}
+
+		/*
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
+		 *
+		 * Note: we don't call zfs_fuid_map_id() here because
+		 * user 0 is not an ephemeral uid.
+		 */
+		mutex_enter(&zp->z_acl_lock);
+		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+		    (S_IXUSR >> 6))) != 0 &&
+		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+		    secpolicy_vnode_setid_retain(vp, cr,
+		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+			uint64_t newmode;
+			zp->z_mode &= ~(S_ISUID | S_ISGID);
+			newmode = zp->z_mode;
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+			    (void *)&newmode, sizeof (uint64_t), tx);
+		}
+		mutex_exit(&zp->z_acl_lock);
+
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+		/*
+		 * Update the file size (z_size) if it has changed;
+		 * account for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_size) < uio->uio_loffset) {
+			(void) atomic_cas_64(&zp->z_size, end_size,
+			    uio->uio_loffset);
+			ASSERT(error == 0 || error == EFAULT);
+		}
+		/*
+		 * If we are replaying and eof is non zero then force
+		 * the file size to the specified eof. Note, there's no
+		 * concurrency during replay.
+		 */
+		if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
+			zp->z_size = zfsvfs->z_replay_eof;
+
+		if (error == 0)
+			error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		else
+			(void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+		zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes,
+		    ioflag, NULL, NULL);
+		dmu_tx_commit(tx);
+
+		if (error != 0)
+			break;
+		ASSERT(tx_bytes == nbytes);
+		n -= nbytes;
+
+	}
+
+	zfs_rangelock_exit(lr);
+
+	/*
+	 * If we're in replay mode, or we made no progress, return error.
+	 * Otherwise, it's at least a partial write, so it's successful.
+	 */
+	if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * EFAULT means that at least one page of the source buffer was not
+	 * available.  VFS will re-try remaining I/O upon this error.
+	 */
+	if (error == EFAULT) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (ioflag & (FSYNC | FDSYNC) ||
+	    zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, zp->z_id);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+/*
+ * Write len bytes from the kernel buffer data to zp at offset pos
+ * through vn_rdwr() (UIO_SYSSPACE, IO_SYNC, kcred).
+ *
+ * If presid is non-NULL the residual count is stored there on success;
+ * when it is NULL any non-zero residual is reported as EIO.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ */
+int
+zfs_write_simple(znode_t *zp, const void *data, size_t len,
+    loff_t pos, size_t *presid)
+{
+	int error = 0;
+	ssize_t resid;
+
+	error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos,
+	    UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread);
+
+	if (error) {
+		return (SET_ERROR(error));
+	} else if (presid == NULL) {
+		if (resid != 0) {
+			error = SET_ERROR(EIO);
+		}
+	} else {
+		*presid = resid;
+	}
+	return (error);
+}
+/*
+ * Release everything attached to a zgd_t: the dbuf hold (if any), the
+ * range lock, the vnode reference (asynchronously) and the zgd itself.
+ * zfs_get_data() hands this function to dmu_sync() as its callback and
+ * also calls it directly on every path that does not return right after
+ * a successful dmu_sync().  The error argument is not examined here.
+ */
+void
+zfs_get_done(zgd_t *zgd, int error)
+{
+	znode_t *zp = zgd->zgd_private;
+	objset_t *os = zp->z_zfsvfs->z_os;
+
+	if (zgd->zgd_db)
+		dmu_buf_rele(zgd->zgd_db, zgd);
+
+	zfs_rangelock_exit(zgd->zgd_lr);
+
+	/*
+	 * Release the vnode asynchronously as we currently have the
+	 * txg stopped from syncing.
+	 */
+	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+
+	kmem_free(zgd, sizeof (zgd_t));
+}
+
+#ifdef DEBUG
+static int zil_fault_io = 0;
+#endif
+
+/*
+ * Get data to generate a TX_WRITE intent log record.
+ *
+ *	IN:	arg	- the zfsvfs_t of the filesystem.
+ *		lr	- log record supplying object, offset and length;
+ *			  for indirect writes its lr_blkptr is handed to
+ *			  dmu_sync() through the zgd.
+ *		buf	- destination for an immediate write, NULL to
+ *			  request an indirect write.
+ *		lwb	- must be non-NULL (asserted); stored in the zgd.
+ *		zio	- must be non-NULL (asserted); passed to dmu_sync().
+ *
+ *	RETURN:	0 on success, ENOENT if the file cannot be obtained, is
+ *		unlinked, or the record starts at or past the current
+ *		file size; other error codes on failure.
+ */
+int
+zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
+{
+	zfsvfs_t *zfsvfs = arg;
+	objset_t *os = zfsvfs->z_os;
+	znode_t *zp;
+	uint64_t object = lr->lr_foid;
+	uint64_t offset = lr->lr_offset;
+	uint64_t size = lr->lr_length;
+	dmu_buf_t *db;
+	zgd_t *zgd;
+	int error = 0;
+
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3P(zio, !=, NULL);
+	ASSERT3U(size, !=, 0);
+
+	/*
+	 * Nothing to do if the file has been removed
+	 */
+	if (zfs_zget(zfsvfs, object, &zp) != 0)
+		return (SET_ERROR(ENOENT));
+	if (zp->z_unlinked) {
+		/*
+		 * Release the vnode asynchronously as we currently have the
+		 * txg stopped from syncing.
+		 */
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_zrele_taskq(dmu_objset_pool(os)));
+		return (SET_ERROR(ENOENT));
+	}
+
+	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
+	zgd->zgd_lwb = lwb;
+	zgd->zgd_private = zp;
+
+	/*
+	 * Write records come in two flavors: immediate and indirect.
+	 * For small writes it's cheaper to store the data with the
+	 * log record (immediate); for large writes it's cheaper to
+	 * sync the data and get a pointer to it (indirect) so that
+	 * we don't have to write the data twice.
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+		    size, RL_READER);
+		/* test for truncation needs to be done while range locked */
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
+		}
+		ASSERT(error == 0 || error == ENOENT);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+		 * that no one can change the data.  We need to re-check
+		 * blocksize after we get the lock in case it's changed!
+		 */
+		for (;;) {
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+			offset -= blkoff;
+			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
+			if (zp->z_blksz == size)
+				break;
+			offset += blkoff;
+			zfs_rangelock_exit(zgd->zgd_lr);
+		}
+		/* test for truncation needs to be done while range locked */
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
+#ifdef DEBUG
+		if (zil_fault_io) {
+			error = SET_ERROR(EIO);
+			zil_fault_io = 0;
+		}
+#endif
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= size);
+
+			/*
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf.  We will finish everything up
+			 * in the zfs_get_done() callback.
+			 */
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY.  We zero out the BP because
+				 * it is the old, currently-on-disk BP,
+				 * so there's no need to zio_flush() its
+				 * vdevs (flushing would needlessly hurt
+				 * performance, and doesn't work on
+				 * indirect vdevs).
+				 */
+				zgd->zgd_bp = NULL;
+				BP_ZERO(bp);
+				error = 0;
+			}
+		}
+	}
+
+	zfs_get_done(zgd, error);
+
+	return (error);
+}
+/*
+ * Check access to vp: when V_ACE_MASK is set in flag, mode is passed to
+ * zfs_zaccess(); otherwise it is passed to zfs_zaccess_rwx().
+ * The ct argument is not referenced.
+ */
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (flag & V_ACE_MASK)
+		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+	else
+		error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+/*
+ * Callback given to vn_vget_ino_gen() by zfs_lookup_lock() for ".."
+ * lookups: hands back the vnode passed in arg and locks it with lkflags,
+ * dropping the reference if the lock cannot be obtained.
+ */
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+	int error;
+
+	*vpp = arg;
+	error = vn_lock(*vpp, lkflags);
+	if (error != 0)
+		vrele(*vpp);
+	return (error);
+}
+/*
+ * Lock vp, the result of looking up name in dvp, according to lkflags.
+ * "" and "." take an extra reference on dvp and adjust its lock type;
+ * ".." goes through vn_vget_ino_gen(); any other name is a plain
+ * vn_lock().  When locking fails the vnode reference is released.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ */
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
+	int error;
+	int ltype;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+	if ((zdp->z_pflags & ZFS_XATTR) == 0)
+		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+		ASSERT3P(dvp, ==, vp);
+		vref(dvp);
+		ltype = lkflags & LK_TYPE_MASK;
+		if (ltype != VOP_ISLOCKED(dvp)) {
+			if (ltype == LK_EXCLUSIVE)
+				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+			else /* if (ltype == LK_SHARED) */
+				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+			/*
+			 * Relock for the "." case could leave us with
+			 * reclaimed vnode.
+			 */
+			if (VN_IS_DOOMED(dvp)) {
+				vrele(dvp);
+				return (SET_ERROR(ENOENT));
+			}
+		}
+		return (0);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		/*
+		 * Note that in this case, dvp is the child vnode, and we
+		 * are looking up the parent vnode - exactly reverse from
+		 * normal operation.  Unlocking dvp requires some rather
+		 * tricky unlock/relock dance to prevent mp from being freed;
+		 * use vn_vget_ino_gen() which takes care of all that.
+		 *
+		 * XXX Note that there is a time window when both vnodes are
+		 * unlocked.  It is possible, although highly unlikely, that
+		 * during that window the parent-child relationship between
+		 * the vnodes may change, for example, get reversed.
+		 * In that case we would have a wrong lock order for the vnodes.
+		 * All other filesystems seem to ignore this problem, so we
+		 * do the same here.
+		 * A potential solution could be implemented as follows:
+		 * - using LK_NOWAIT when locking the second vnode and retrying
+		 *   if necessary
+		 * - checking that the parent-child relationship still holds
+		 *   after locking both vnodes and retrying if it doesn't
+		 */
+		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+		return (error);
+	} else {
+		error = vn_lock(vp, lkflags);
+		if (error != 0)
+			vrele(vp);
+		return (error);
+	}
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ *	IN:	dvp	- vnode of directory to search.
+ *		nm	- name of entry to lookup.
+ *		cnp	- componentname of the lookup; its cn_flags and
+ *			  cn_lkflags are consulted and cn_flags may be
+ *			  updated (NOEXECCHECK cleared, SAVENAME set).
+ *		nameiop	- name operation (LOOKUP, CREATE, RENAME, DELETE).
+ *		cr	- credentials of caller.
+ *		td	- calling thread [UNUSED].
+ *		flags	- LOOKUP_XATTR set if looking for an attribute.
+ *		cached	- set when coming in via VOP_CACHEDLOOKUP; the
+ *			  directory execute check is skipped in that case.
+ *
+ *	OUT:	vpp	- vnode of located entry, NULL if not found.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+    int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
+{
+	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int error = 0;
+
+	/*
+	 * Fast path lookup, however we must skip DNLC lookup
+	 * for case folding or normalizing lookups because the
+	 * DNLC code only stores the passed in name.  This means
+	 * creating 'a' and removing 'A' on a case insensitive
+	 * file system would work, but DNLC still thinks 'a'
+	 * exists and won't let you create it again on the next
+	 * pass through fast path.
+	 */
+	if (!(flags & LOOKUP_XATTR)) {
+		if (dvp->v_type != VDIR) {
+			return (SET_ERROR(ENOTDIR));
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
+		}
+	}
+
+	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * If the xattr property is off, refuse the lookup request.
+		 */
+		if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EOPNOTSUPP));
+		}
+
+		/*
+		 * We don't allow recursive attributes..
+		 * Maybe someday we will.
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		*vpp = ZTOV(zp);
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 */
+		error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr);
+		if (error) {
+			vrele(ZTOV(zp));
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory if we're not coming in via
+	 * VOP_CACHEDLOOKUP.
+	 */
+	if (!cached) {
+#ifdef NOEXECCHECK
+		if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+			cnp->cn_flags &= ~NOEXECCHECK;
+		} else
+#endif
+		if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+
+	/*
+	 * First handle the special cases.
+	 */
+	if ((cnp->cn_flags & ISDOTDOT) != 0) {
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+			struct componentname cn;
+			vnode_t *zfsctl_vp;
+			int ltype;
+
+			ZFS_EXIT(zfsvfs);
+			ltype = VOP_ISLOCKED(dvp);
+			VOP_UNLOCK1(dvp);
+			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+			    &zfsctl_vp);
+			if (error == 0) {
+				cn.cn_nameptr = "snapshot";
+				cn.cn_namelen = strlen(cn.cn_nameptr);
+				cn.cn_nameiop = cnp->cn_nameiop;
+				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+				cn.cn_lkflags = cnp->cn_lkflags;
+				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+				vput(zfsctl_vp);
+			}
+			vn_lock(dvp, ltype | LK_RETRY);
+			return (error);
+		}
+	}
+	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+		ZFS_EXIT(zfsvfs);
+		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+			return (SET_ERROR(ENOTSUP));
+		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+		return (error);
+	}
+
+	/*
+	 * This loop retries the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+	for (;;) {
+		uint64_t parent;
+
+		error = zfs_dirlook(zdp, nm, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+
+		ZFS_EXIT(zfsvfs);
+		if (error != 0)
+			break;
+
+		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+		if (error != 0) {
+			/*
+			 * If we've got a locking error, then the vnode
+			 * got reclaimed because of a force unmount.
+			 * We never enter doomed vnodes into the name cache.
+			 */
+			*vpp = NULL;
+			return (error);
+		}
+
+		if ((cnp->cn_flags & ISDOTDOT) == 0)
+			break;
+
+		ZFS_ENTER(zfsvfs);
+		if (zdp->z_sa_hdl == NULL) {
+			error = SET_ERROR(EIO);
+		} else {
+			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+			    &parent, sizeof (parent));
+		}
+		if (error != 0) {
+			ZFS_EXIT(zfsvfs);
+			vput(ZTOV(zp));
+			break;
+		}
+		if (zp->z_id == parent) {
+			ZFS_EXIT(zfsvfs);
+			break;
+		}
+		vput(ZTOV(zp));
+	}
+
+	if (error != 0)
+		*vpp = NULL;
+
+	/* Translate errors and add SAVENAME when needed. */
+	if (cnp->cn_flags & ISLASTCN) {
+		switch (nameiop) {
+		case CREATE:
+		case RENAME:
+			if (error == ENOENT) {
+				error = EJUSTRETURN;
+				cnp->cn_flags |= SAVENAME;
+				break;
+			}
+			/* FALLTHROUGH */
+		case DELETE:
+			if (error == 0)
+				cnp->cn_flags |= SAVENAME;
+			break;
+		}
+	}
+
+	/* Insert name into cache (as non-existent) if appropriate. */
+	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(dvp, NULL, cnp);
+
+	/* Insert name into cache if appropriate. */
+	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+	    error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+		if (!(cnp->cn_flags & ISLASTCN) ||
+		    (nameiop != DELETE && nameiop != RENAME)) {
+			cache_enter(dvp, *vpp, cnp);
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Attempt to create a new entry in a directory.  If the entry
+ * already exists, truncate the file if permissible, else return
+ * an error.  Return the vp of the created or trunc'd file.
+ *
+ *	IN:	dvp	- vnode of directory to put new file entry in.
+ *		name	- name of new file entry.
+ *		vap	- attributes of new file.
+ *		excl	- flag indicating exclusive or non-exclusive mode.
+ *		mode	- mode to open file with.
+ *		cr	- credentials of caller.
+ *		flag	- large file flag [UNUSED].
+ *		ct	- caller context
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	vpp	- vnode of created or trunc'd entry.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated if new entry created
+ *	 vp - ctime|mtime always, atime if new
+ */
+
+/* ARGSUSED */
+int
+zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode,
+    znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp)
+{
+	znode_t *zp;
+	zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+	zilog_t *zilog;
+	objset_t *os;
+	dmu_tx_t *tx;
+	int error;
+	ksid_t *ksid;
+	uid_t uid;
+	gid_t gid = crgetgid(cr);
+	uint64_t projid = ZFS_DEFAULT_PROJID;
+	zfs_acl_ids_t acl_ids;
+	boolean_t fuid_dirtied;
+	uint64_t txtype;
+#ifdef DEBUG_VFS_LOCKS
+	vnode_t *dvp = ZTOV(dzp);
+#endif
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 *
+	 * This check runs before ZFS_ENTER(), so it returns without a
+	 * matching ZFS_EXIT().  The excl, mode and flag arguments are not
+	 * referenced anywhere in this implementation.
+	 */
+
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
+
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    (vsecp || (vap->va_mask & AT_XVATTR) ||
+	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	os = zfsvfs->z_os;
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (vap->va_mask & AT_XVATTR) {
+		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_type)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	*zpp = NULL;
+
+	if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
+		vap->va_mode &= ~S_ISVTX;
+	/*
+	 * The lookup uses ZNEW: on success no znode is returned (asserted
+	 * below) and any lookup error is handed straight back to the
+	 * caller, so there is no truncate-existing path in this function.
+	 */
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ASSERT3P(zp, ==, NULL);
+
+	/*
+	 * Create a new file object and update the directory
+	 * to reference it.
+	 */
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		goto out;
+	}
+
+	/*
+	 * We only support the creation of regular files in
+	 * extended attribute directories.
+	 */
+
+	if ((dzp->z_pflags & ZFS_XATTR) &&
+	    (vap->va_type != VREG)) {
+		error = SET_ERROR(EINVAL);
+		goto out;
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0, vap,
+	    cr, vsecp, &acl_ids)) != 0)
+		goto out;
+
+	if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode))
+		projid = zfs_inherit_projid(dzp);
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) {
+		zfs_acl_ids_free(&acl_ids);
+		error = SET_ERROR(EDQUOT);
+		goto out;
+	}
+	/* Paired with getnewvnode_drop_reserve() on both paths below. */
+	getnewvnode_reserve_();
+
+	tx = dmu_tx_create(os);
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa &&
+	    acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+		    0, acl_ids.z_aclp->z_acl_bytes);
+	}
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+	/* The result of zfs_link_create() is deliberately discarded. */
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+	txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+	    vsecp, acl_ids.z_fuidp, vap);
+	zfs_acl_ids_free(&acl_ids);
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+out:
+	VNCHECKREF(dvp);
+	if (error == 0) {
+		*zpp = zp;
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Remove an entry from a directory.
+ *
+ *	IN:	dvp	- vnode of directory to remove entry from.
+ *		name	- name of entry to remove.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ + +/*ARGSUSED*/ +static int +zfs_remove_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t xattr_obj; + uint64_t obj = 0; + dmu_tx_t *tx; + boolean_t unlinked; + uint64_t txtype; + int error; + + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zp = VTOZ(vp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + xattr_obj = 0; + xzp = NULL; + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = SET_ERROR(EPERM); + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + obj = zp->z_id; + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + + if (xzp) { + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. 
+ */ + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + zfs_unlinked_add(zp, tx); + vp->v_vflag |= VV_NOSYNC; + } + /* XXX check changes to linux vnops */ + txtype = TX_REMOVE; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + + if (xzp) + vrele(ZTOV(xzp)); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + + ZFS_EXIT(zfsvfs); + return (error); +} + + +int +zfs_lookup_internal(znode_t *dzp, char *name, vnode_t **vpp, + struct componentname *cnp, int nameiop) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + cnp->cn_nameptr = name; + cnp->cn_namelen = strlen(name); + cnp->cn_nameiop = nameiop; + cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cnp->cn_cred = kcred; + cnp->cn_thread = curthread; + + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { + struct vop_lookup_args a; + + a.a_gen.a_desc = &vop_lookup_desc; + a.a_dvp = ZTOV(dzp); + a.a_vpp = vpp; + a.a_cnp = cnp; + error = vfs_cache_lookup(&a); + } else { + error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, + curthread, 0, B_FALSE); + } +#ifdef ZFS_DEBUG + if (error) { + printf("got error %d on name %s on op %d\n", error, name, + nameiop); + kdb_backtrace(); + } +#endif + return (error); +} + +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + vnode_t *vp; + int error; + struct componentname cn; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_remove_(ZTOV(dzp), vp, name, cr); + vput(vp); + return (error); +} +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. 
+ *		ct	- caller context
+ *		flags	- case flags
+ *		vsecp	- ACL to be set
+ *
+ *	OUT:	vpp	- vnode of created directory.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ *	 vp - ctime|mtime|atime updated
+ */
+/*ARGSUSED*/
+int
+zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr,
+    int flags, vsecattr_t *vsecp)
+{
+	znode_t		*zp;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	txtype;
+	dmu_tx_t	*tx;
+	int		error;
+	ksid_t		*ksid;
+	uid_t		uid;
+	gid_t		gid = crgetgid(cr);
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
+
+	ASSERT(vap->va_type == VDIR);
+
+	/*
+	 * If we have an ephemeral id, ACL, or XVATTR then
+	 * make sure file system is at proper version
+	 *
+	 * This check runs before ZFS_ENTER(), which is why its
+	 * failure return has no matching ZFS_EXIT().
+	 */
+
+	ksid = crgetsid(cr, KSID_OWNER);
+	if (ksid)
+		uid = ksid_getid(ksid);
+	else
+		uid = crgetuid(cr);
+	if (zfsvfs->z_use_fuids == B_FALSE &&
+	    ((vap->va_mask & AT_XVATTR) ||
+	    IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
+		return (SET_ERROR(EINVAL));
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (dzp->z_pflags & ZFS_XATTR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(dirname,
+	    strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (vap->va_mask & AT_XVATTR) {
+		if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap,
+		    crgetuid(cr), cr, vap->va_type)) != 0) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
+	    NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * First make sure the new directory doesn't exist.
+	 *
+	 * Existence is checked first to make sure we don't return
+	 * EACCES instead of EEXIST which can cause some applications
+	 * to fail.
+	 *
+	 * *zpp is cleared here, so every failure return from this
+	 * point on leaves it NULL.  acl_ids is live from now on and
+	 * each of those returns frees it.
+	 */
+	*zpp = NULL;
+
+	if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	ASSERT3P(zp, ==, NULL);
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	/*
+	 * Add a new entry to the directory.
+	 *
+	 * The vnode reservation taken by getnewvnode_reserve_() is
+	 * dropped again on the dmu_tx_assign() failure path and,
+	 * on success, after dmu_tx_commit().
+	 */
+	getnewvnode_reserve_();
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE);
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create new node.  zfs_mknode() hands it back through zp.
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	/*
+	 * Now put new name in parent dir.  The result of
+	 * zfs_link_create() is deliberately ignored here (note the
+	 * (void) cast).
+	 */
+	(void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
+
+	*zpp = zp;	/* hand the new znode to the caller */
+
+	txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+	zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
+	    acl_ids.z_fuidp, vap);
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Remove a directory subdir entry.  If the current working
+ * directory is the same as the subdir to be removed, the
+ * remove will fail.
+ *
+ *	IN:	dvp	- vnode of directory to remove from.
+ *		name	- name of directory to be removed.
+ *		cwd	- vnode of current working directory.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rmdir_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
+{
+	znode_t		*dzp = VTOZ(dvp);
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	ZFS_VERIFY_ZP(zp);
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * The permission and type checks come first; both fail through
+	 * "out", before any transaction has been created.
+	 */
+	if ((error = zfs_zaccess_delete(dzp, zp, cr))) {
+		goto out;
+	}
+
+	if (vp->v_type != VDIR) {
+		error = SET_ERROR(ENOTDIR);
+		goto out;
+	}
+	/*
+	 * NOTE(review): no "ct" is declared in this function, so
+	 * vnevent_rmdir() is presumably a macro that discards its
+	 * arguments on this platform -- confirm.
+	 */
+	vnevent_rmdir(vp, dvp, name, ct);
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	zfs_sa_upgrade_txholds(tx, zp);
+	zfs_sa_upgrade_txholds(tx, dzp);
+	dmu_tx_mark_netfree(tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	cache_purge(dvp);
+
+	error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
+
+	if (error == 0) {	/* only log a removal that happened */
+		uint64_t txtype = TX_RMDIR;
+		zfs_log_remove(zilog, tx, txtype, dzp, name,
+		    ZFS_NO_OBJECT, B_FALSE);
+	}
+
+	dmu_tx_commit(tx);	/* committed whether or not the unlink worked */
+
+	cache_purge(vp);
+out:
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+/*
+ * Remove the directory "name" from directory dzp.
+ *
+ * The entry is looked up with a DELETE nameiop, handed to zfs_rmdir_(),
+ * and the vnode returned by the lookup is released with vput() whatever
+ * the outcome.  "cwd" and "flags" are not referenced in this function.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ */
+int
+zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags)
+{
+	struct componentname cn;
+	vnode_t		*vp;
+	int		error;
+
+	if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE)))
+		return (error);
+
+	error = zfs_rmdir_(ZTOV(dzp), vp, name, cr);
+	vput(vp);
+	return (error);
+}
+
+/*
+ * Read as many directory entries as will fit into the provided
+ * buffer from the given directory cursor position
(specified in
+ * the uio structure).
+ *
+ *	IN:	vp	- vnode of directory to read.
+ *		uio	- structure supplying read location, range info,
+ *			  and return buffer.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	OUT:	uio	- updated offset and range, buffer filled.
+ *		eofp	- set to true if end-of-file detected.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ *
+ * Note that the low 4 bits of the cookie returned by zap is always zero.
+ * This allows us to use the low range for "special" directory entries:
+ * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
+ * we use the offset 2 for the '.zfs' directory.
+ */
+/* ARGSUSED */
+static int
+zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
+    int *ncookies, ulong_t **cookies)
+{
+	znode_t		*zp = VTOZ(vp);
+	iovec_t		*iovp;
+	edirent_t	*eodp;
+	dirent64_t	*odp;
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	objset_t	*os;
+	caddr_t		outbuf;
+	size_t		bufsize;
+	zap_cursor_t	zc;
+	zap_attribute_t	zap;
+	uint_t		bytes_wanted;
+	uint64_t	offset; /* must be unsigned; checks for < 1 */
+	uint64_t	parent;
+	int		local_eof;
+	int		outcount;
+	int		error;
+	uint8_t		prefetch;
+	boolean_t	check_sysattrs;
+	uint8_t		type;
+	int		ncooks;
+	ulong_t		*cooks = NULL;
+	int		flags = 0;	/* never modified below */
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (parent))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * If we are not given an eof variable,
+	 * use a local one.
+	 */
+	if (eofp == NULL)
+		eofp = &local_eof;
+
+	/*
+	 * Check for valid iov_len.
+	 */
+	if (uio->uio_iov->iov_len <= 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Quit if directory has been removed (posix)
+	 */
+	if ((*eofp = zp->z_unlinked) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (0);
+	}
+
+	error = 0;
+	os = zfsvfs->z_os;
+	offset = uio->uio_loffset;
+	prefetch = zp->z_zn_prefetch;
+
+	/*
+	 * Initialize the iterator cursor.  Any offset of 3 or less
+	 * starts from a fresh cursor; anything larger is decoded as a
+	 * serialized ZAP cursor.
+	 */
+	if (offset <= 3) {
+		/*
+		 * Start iteration from the beginning of the directory.
+		 */
+		zap_cursor_init(&zc, os, zp->z_id);
+	} else {
+		/*
+		 * The offset is a serialized cursor.
+		 */
+		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
+	}
+
+	/*
+	 * Get space to change directory entries into fs independent format.
+	 *
+	 * A bounce buffer is allocated unless the request is a single
+	 * UIO_SYSSPACE iovec, in which case entries are written straight
+	 * into iov_base and outbuf stays NULL.
+	 */
+	iovp = uio->uio_iov;
+	bytes_wanted = iovp->iov_len;
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
+		bufsize = bytes_wanted;
+		outbuf = kmem_alloc(bufsize, KM_SLEEP);
+		odp = (struct dirent64 *)outbuf;
+	} else {
+		bufsize = bytes_wanted;
+		outbuf = NULL;
+		odp = (struct dirent64 *)iovp->iov_base;
+	}
+	eodp = (struct edirent *)odp;
+
+	if (ncookies != NULL) {
+		/*
+		 * Minimum entry size is dirent size and 1 byte for a file name.
+		 *
+		 * The cookie array is sized for that worst case and handed
+		 * back through *cookies; it is freed again at the end of
+		 * this function if an error is being returned.
+		 */
+		ncooks = uio->uio_resid / (sizeof (struct dirent) -
+		    sizeof (((struct dirent *)NULL)->d_name) + 1);
+		cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK);
+		*cookies = cooks;
+		*ncookies = ncooks;
+	}
+	/*
+	 * If this VFS supports the system attribute view interface; and
+	 * we're looking at an extended attribute directory; and we care
+	 * about normalization conflicts on this vfs; then we must check
+	 * for normalization conflicts with the sysattr name space.
+	 */
+#ifdef TODO
+	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
+	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
+	    (flags & V_RDDIR_ENTFLAGS);
+#else
+	check_sysattrs = 0;
+#endif
+
+	/*
+	 * Transform to file-system independent format
+	 */
+	outcount = 0;
+	while (outcount < bytes_wanted) {
+		ino64_t objnum;
+		ushort_t reclen;
+		off64_t *next = NULL;
+
+		/*
+		 * Special case `.', `..', and `.zfs'.
+		 */
+		if (offset == 0) {
+			(void) strcpy(zap.za_name, ".");
+			zap.za_normalization_conflict = 0;
+			objnum = zp->z_id;
+			type = DT_DIR;
+		} else if (offset == 1) {
+			(void) strcpy(zap.za_name, "..");
+			zap.za_normalization_conflict = 0;
+			objnum = parent;
+			type = DT_DIR;
+		} else if (offset == 2 && zfs_show_ctldir(zp)) {
+			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
+			zap.za_normalization_conflict = 0;
+			objnum = ZFSCTL_INO_ROOT;
+			type = DT_DIR;
+		} else {
+			/*
+			 * Grab next entry.  ENOENT from the cursor means
+			 * end of directory: *eofp is set and the loop ends
+			 * normally.  Any other error skips to "update".
+			 */
+			if ((error = zap_cursor_retrieve(&zc, &zap))) {
+				if ((*eofp = (error == ENOENT)) != 0)
+					break;
+				else
+					goto update;
+			}
+
+			if (zap.za_integer_length != 8 ||
+			    zap.za_num_integers != 1) {
+				cmn_err(CE_WARN, "zap_readdir: bad directory "
+				    "entry, obj = %lld, offset = %lld\n",
+				    (u_longlong_t)zp->z_id,
+				    (u_longlong_t)offset);
+				error = SET_ERROR(ENXIO);
+				goto update;
+			}
+
+			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
+			/*
+			 * MacOS X can extract the object type here such as:
+			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+			 */
+			type = ZFS_DIRENT_TYPE(zap.za_first_integer);
+
+			if (check_sysattrs && !zap.za_normalization_conflict) {
+#ifdef TODO
+				zap.za_normalization_conflict =
+				    xattr_sysattr_casechk(zap.za_name);
+#else
+				panic("%s:%u: TODO", __func__, __LINE__);
+#endif
+			}
+		}
+
+		if (flags & V_RDDIR_ACCFILTER) {
+			/*
+			 * If we have no access at all, don't include
+			 * this entry in the returned information
+			 * (Not reached while the local flags stays 0.)
+			 */
+			znode_t	*ezp;
+			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
+				goto skip_entry;
+			if (!zfs_has_access(ezp, cr)) {
+				vrele(ZTOV(ezp));
+				goto skip_entry;
+			}
+			vrele(ZTOV(ezp));
+		}
+
+		if (flags & V_RDDIR_ENTFLAGS)
+			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
+		else
+			reclen = DIRENT64_RECLEN(strlen(zap.za_name));
+
+		/*
+		 * Will this entry fit in the buffer?
+		 */
+		if (outcount + reclen > bufsize) {
+			/*
+			 * Did we manage to fit anything in the buffer?
+			 * If not, the buffer cannot hold even one entry
+			 * and the call fails with EINVAL.
+			 */
+			if (!outcount) {
+				error = SET_ERROR(EINVAL);
+				goto update;
+			}
+			break;
+		}
+		if (flags & V_RDDIR_ENTFLAGS) {
+			/*
+			 * Add extended flag entry:
+			 * (Not reached while the local flags stays 0.)
+			 */
+			eodp->ed_ino = objnum;
+			eodp->ed_reclen = reclen;
+			/* NOTE: ed_off is the offset for the *next* entry */
+			next = &(eodp->ed_off);
+			eodp->ed_eflags = zap.za_normalization_conflict ?
+			    ED_CASE_CONFLICT : 0;
+			(void) strncpy(eodp->ed_name, zap.za_name,
+			    EDIRENT_NAMELEN(reclen));
+			eodp = (edirent_t *)((intptr_t)eodp + reclen);
+		} else {
+			/*
+			 * Add normal entry:
+			 */
+			odp->d_ino = objnum;
+			odp->d_reclen = reclen;
+			odp->d_namlen = strlen(zap.za_name);
+			/* NOTE: d_off is the offset for the *next* entry. */
+			next = &odp->d_off;
+			strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1);
+			odp->d_type = type;
+			dirent_terminate(odp);
+			odp = (dirent64_t *)((intptr_t)odp + reclen);
+		}
+		outcount += reclen;
+
+		ASSERT(outcount <= bufsize);
+
+		/* Prefetch znode */
+		if (prefetch)
+			dmu_prefetch(os, objnum, 0, 0, 0,
+			    ZIO_PRIORITY_SYNC_READ);
+
+	skip_entry:
+		/*
+		 * Move to the next entry, fill in the previous offset.
+		 *
+		 * Offsets 0 and 1, and 2 when the .zfs entry is shown,
+		 * are synthetic and simply count up; past those the
+		 * offset is the serialized ZAP cursor.
+		 */
+		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
+			zap_cursor_advance(&zc);
+			offset = zap_cursor_serialize(&zc);
+		} else {
+			offset += 1;
+		}
+
+		/* Fill the offset right after advancing the cursor. */
+		if (next != NULL)
+			*next = offset;
+		if (cooks != NULL) {
+			*cooks++ = offset;
+			ncooks--;
+			KASSERT(ncooks >= 0, ("ncookies=%d", ncooks));
+		}
+	}
+	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
+
+	/* Subtract unused cookies */
+	if (ncookies != NULL)
+		*ncookies -= ncooks;
+
+	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
+		iovp->iov_base += outcount;
+		iovp->iov_len -= outcount;
+		uio->uio_resid -= outcount;
+	} else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) {
+		/*
+		 * Reset the pointer.  The copy-out failed, so the cursor
+		 * progress made above is discarded in favour of the
+		 * offset currently recorded in the uio.
+		 */
+		offset = uio->uio_loffset;
+	}
+
+update:
+	zap_cursor_fini(&zc);
+	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
+		kmem_free(outbuf, bufsize);
+
+	if (error == ENOENT)
+		error = 0;
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+	uio->uio_loffset = offset;
+	ZFS_EXIT(zfsvfs);
+	if (error != 0 && cookies != NULL) {	/* no cookies on failure */
+		free(*cookies, M_TEMP);
+		*cookies = NULL;
+		*ncookies = 0;
+	}
+	return (error);
+}
+
+ulong_t zfs_fsync_sync_cnt = 4;
+/*
+ * Commit this file's intent-log records.
+ *
+ * The thread-specific value under zfs_fsyncer_key is set to
+ * zfs_fsync_sync_cnt for the duration of the call and cleared again
+ * before returning.  zil_commit() is called for zp->z_id unless
+ * zfsvfs->z_os->os_sync is ZFS_SYNC_DISABLED.  syncflag, cr and ct are
+ * not referenced in this function.
+ *
+ *	RETURN:	0 from every return statement visible here.
+ *		NOTE(review): ZFS_ENTER()/ZFS_VERIFY_ZP() may return
+ *		early with an error of their own -- confirm.
+ */
+static int
+zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	(void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
+
+	if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+		zil_commit(zfsvfs->z_log, zp->z_id);
+		ZFS_EXIT(zfsvfs);
+	}
+	tsd_set(zfs_fsyncer_key, NULL);
+	return (0);
+}
+
+
+/*
+ * Get the requested file attributes and place them in the provided
+ * vattr structure.
+ *
+ *	IN:	vp	- vnode of file.
+ *		vap	- va_mask identifies requested attributes.
+ *			  If AT_XVATTR set, then optional attrs are requested
+ *		flags	- ATTR_NOACLCHECK (CIFS server context)
+ *		cr	- credentials of caller.
+ *
+ *	OUT:	vap	- attribute values.
+ *
+ *	RETURN:	0 on success, error code on failure: a failed
+ *		sa_bulk_lookup() or a failed ACE_READ_ATTRIBUTES check
+ *		is passed back to the caller.
+ */
+/* ARGSUSED */
+static int
+zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int	error = 0;
+	uint32_t blksize;
+	u_longlong_t nblocks;
+	uint64_t mtime[2], ctime[2], crtime[2], rdev;
+	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
+	xoptattr_t *xoap = NULL;
+	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
+
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16);
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL,
+		    &rdev, 8);
+
+	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
+	 * Also, if we are the owner don't bother, since owner should
+	 * always be allowed to read basic attributes of file.
+	 *
+	 * The owner test uses the va_uid that zfs_fuid_map_ids() stored
+	 * in vap above.
+	 */
+	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
+	    (vap->va_uid != crgetuid(cr))) {
+		if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
+		    skipaclchk, cr))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	/*
+	 * Return all attributes.  It's cheaper to provide the answer
+	 * than to determine whether we were asked the question.
+	 */
+
+	vap->va_type = IFTOVT(zp->z_mode);
+	vap->va_mode = zp->z_mode & ~S_IFMT;
+	vn_fsid(vp, vap);
+	vap->va_nodeid = zp->z_id;
+	vap->va_nlink = zp->z_links;
+	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) &&
+	    zp->z_links < ZFS_LINK_MAX)
+		vap->va_nlink++;
+	vap->va_size = zp->z_size;
+	if (vp->v_type == VBLK || vp->v_type == VCHR)
+		vap->va_rdev = zfs_cmpldev(rdev);
+	vap->va_seq = zp->z_seq;
+	vap->va_flags = 0;	/* FreeBSD: Reset chflags(2) flags. */
+	vap->va_filerev = zp->z_seq;
+
+	/*
+	 * Add in any requested optional attributes and the create time.
+	 * Also set the corresponding bits in the returned attribute bitmap.
+	 *
+	 * This is only done when xva_getxoptattr() returns a non-NULL
+	 * optional-attribute block and zfsvfs->z_use_fuids is set.
+	 */
+	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
+		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+			xoap->xoa_archive =
+			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
+			XVA_SET_RTN(xvap, XAT_ARCHIVE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+			xoap->xoa_readonly =
+			    ((zp->z_pflags & ZFS_READONLY) != 0);
+			XVA_SET_RTN(xvap, XAT_READONLY);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+			xoap->xoa_system =
+			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
+			XVA_SET_RTN(xvap, XAT_SYSTEM);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+			xoap->xoa_hidden =
+			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
+			XVA_SET_RTN(xvap, XAT_HIDDEN);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+			xoap->xoa_nounlink =
+			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
+			XVA_SET_RTN(xvap, XAT_NOUNLINK);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+			xoap->xoa_immutable =
+			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
+			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+			xoap->xoa_appendonly =
+			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
+			XVA_SET_RTN(xvap, XAT_APPENDONLY);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+			xoap->xoa_nodump =
+			    ((zp->z_pflags & ZFS_NODUMP) != 0);
+			XVA_SET_RTN(xvap, XAT_NODUMP);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+			xoap->xoa_opaque =
+			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
+			XVA_SET_RTN(xvap, XAT_OPAQUE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+			xoap->xoa_av_quarantined =
+			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
+			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+			xoap->xoa_av_modified =
+			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
+			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
+		    vp->v_type == VREG) {
+			zfs_sa_get_scanstamp(zp, xvap);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
+			XVA_SET_RTN(xvap, XAT_REPARSE);
+		}
+		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
+			xoap->xoa_generation = zp->z_gen;
+			XVA_SET_RTN(xvap, XAT_GEN);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+			xoap->xoa_offline =
+			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
+			XVA_SET_RTN(xvap, XAT_OFFLINE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+			xoap->xoa_sparse =
+			    ((zp->z_pflags & ZFS_SPARSE) != 0);
+			XVA_SET_RTN(xvap, XAT_SPARSE);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+			xoap->xoa_projinherit =
+			    ((zp->z_pflags & ZFS_PROJINHERIT) != 0);
+			XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+		}
+
+		if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
+			xoap->xoa_projid = zp->z_projid;
+			XVA_SET_RTN(xvap, XAT_PROJID);
+		}
+	}
+
+	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
+	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
+	ZFS_TIME_DECODE(&vap->va_ctime, ctime);
+	ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
+
+
+	sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+	vap->va_blksize = blksize;
+	vap->va_bytes = nblocks << 9;	/* nblocks * 512 */
+
+	if (zp->z_blksz == 0) {
+		/*
+		 * Block size hasn't been set; suggest maximal I/O transfers.
+		 */
+		vap->va_blksize = zfsvfs->z_max_blksz;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Set the file attributes to the values contained in the
+ * vattr structure.
+ *
+ *	IN:	zp	- znode of file to be modified.
+ *		vap	- new attribute values.
+ * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * ct - caller context + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. + */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + uint64_t saved_mode; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + xva_init(&tmpxvattr); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Note: ZFS_READONLY is handled in zfs_zaccess_common. + */ + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. 
+ */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + } + if (xoap != NULL && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && + TIMESPEC_OVERFLOW(&vap->va_birthtime)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + } + + attrzp = NULL; + aclp = NULL; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? 
*/ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, vp, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + if 
(mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) { + vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). 
+ */ + vap->va_mode = saved_mode; + } + } + } + + /* + * secpolicy_vnode_setattr, or take ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err == 0) { + err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); + if (err != 0) + vrele(ZTOV(attrzp)); + } + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_uid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_gid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = SET_ERROR(EPERM); + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? 
+ */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For the existed object that is upgraded from old system, + * its on-disk layout has no slot for the project ID attribute. + * But quota accounting logic needs to access related slots by + * offset directly. So we need to adjust old objects' layout + * to make the project ID to some unified and fixed offset. 
+ */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & AT_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3U((uintptr_t)aclp, !=, 0); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if 
(zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + } + + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime); + } + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + xoap->xoa_createtime = vap->va_birthtime; + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(vp->v_type == VREG); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + } +out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (attrzp) + vput(ZTOV(attrzp)); + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. 
On error release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
+ */
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+    struct vnode *tdvp, struct vnode **tvpp,
+    const struct componentname *scnp, const struct componentname *tcnp)
+{
+	zfsvfs_t *zfsvfs;
+	struct vnode *nvp, *svp, *tvp;
+	znode_t *sdzp, *tdzp, *szp, *tzp;
+	const char *snm = scnp->cn_nameptr;
+	const char *tnm = tcnp->cn_nameptr;
+	int error;
+	/*
+	 * Start from a clean slate: release tdvp and, when it is a distinct
+	 * vnode, *tvpp (presumably both arrive locked from the caller --
+	 * confirm), so that everything below can be locked beginning with
+	 * sdvp.  On success (return 0) sdvp, tdvp, *svpp and, if the target
+	 * exists, *tvpp are held exclusively, and *svpp / *tvpp refer to the
+	 * vnodes found by the fresh directory lookups.
+	 */
+	VOP_UNLOCK1(tdvp);
+	if (*tvpp != NULL && *tvpp != tdvp)
+		VOP_UNLOCK1(*tvpp);
+
+relock:		/* every blocking wait below restarts from here */
+	error = vn_lock(sdvp, LK_EXCLUSIVE);
+	if (error)
+		goto out;
+	sdzp = VTOZ(sdvp);
+
+	error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK1(sdvp);
+		if (error != EBUSY)
+			goto out;
+		error = vn_lock(tdvp, LK_EXCLUSIVE);
+		if (error)
+			goto out;
+		VOP_UNLOCK1(tdvp);	/* only waited for it; retry in order */
+		goto relock;
+	}
+	tdzp = VTOZ(tdvp);
+
+	/*
+	 * Before using sdzp and tdzp we must ensure that they are live.
+	 * As a porting legacy from illumos we have two things to worry
+	 * about.  One is typical for FreeBSD and it is that the vnode is
+	 * not reclaimed (doomed).  The other is that the znode is live.
+	 * The current code can invalidate the znode without acquiring the
+	 * corresponding vnode lock if the object represented by the znode
+	 * and vnode is no longer valid after a rollback or receive operation.
+	 * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+	 * that protects the znodes from the invalidation.
+	 */
+	zfsvfs = sdzp->z_zfsvfs;
+	ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
+	 * bypassing the cleanup code in the case of an error.
+	 */
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		error = SET_ERROR(EIO);
+		goto out;
+	}
+
+	/*
+	 * Re-resolve svp to be certain it still exists and fetch the
+	 * correct vnode.
+	 */
+	error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+	if (error != 0) {
+		/* Source entry invalid or not there. */
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+		    (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+			error = SET_ERROR(EINVAL);	/* "." / ".." source */
+		goto out;
+	}
+	svp = ZTOV(szp);
+
+	/*
+	 * Re-resolve tvp, if it disappeared we just carry on.
+	 */
+	error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+	if (error != 0) {
+		ZFS_EXIT(zfsvfs);
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		vrele(svp);	/* give back the source found just above */
+		if ((tcnp->cn_flags & ISDOTDOT) != 0)
+			error = SET_ERROR(EINVAL);
+		goto out;
+	}
+	if (tzp != NULL)
+		tvp = ZTOV(tzp);
+	else
+		tvp = NULL;
+
+	/*
+	 * At present the vnode locks must be acquired before z_teardown_lock,
+	 * although it would be more logical to use the opposite order.
+	 */
+	ZFS_EXIT(zfsvfs);
+
+	/*
+	 * Now try acquire locks on svp and tvp.
+	 */
+	nvp = svp;
+	error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+	if (error != 0) {
+		VOP_UNLOCK1(sdvp);
+		VOP_UNLOCK1(tdvp);
+		if (tvp != NULL)
+			vrele(tvp);
+		if (error != EBUSY) {
+			vrele(nvp);
+			goto out;
+		}
+		error = vn_lock(nvp, LK_EXCLUSIVE);
+		if (error != 0) {
+			vrele(nvp);
+			goto out;
+		}
+		VOP_UNLOCK1(nvp);
+		/*
+		 * Concurrent rename race.
+		 * XXX ?
+		 */
+		if (nvp == tdvp) {
+			vrele(nvp);
+			error = SET_ERROR(EINVAL);
+			goto out;
+		}
+		vrele(*svpp);	/* trade the caller's ref for the new vnode */
+		*svpp = nvp;
+		goto relock;
+	}
+	vrele(*svpp);
+	*svpp = nvp;
+
+	if (*tvpp != NULL)
+		vrele(*tvpp);
+	*tvpp = NULL;
+	if (tvp != NULL) {
+		nvp = tvp;
+		error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+		if (error != 0) {
+			VOP_UNLOCK1(sdvp);
+			VOP_UNLOCK1(tdvp);
+			VOP_UNLOCK1(*svpp);
+			if (error != EBUSY) {
+				vrele(nvp);
+				goto out;
+			}
+			error = vn_lock(nvp, LK_EXCLUSIVE);
+			if (error != 0) {
+				vrele(nvp);
+				goto out;
+			}
+			vput(nvp);	/* unlock + release, then start over */
+			goto relock;
+		}
+		*tvpp = nvp;
+	}
+
+	return (0);	/* sdvp, tdvp, *svpp and any *tvpp are locked */
+
+out:
+	return (error);
+}
+
+/*
+ * Check that moving szp (currently an entry of sdzp) into tdzp is legal.
+ * Starting at tdzp the SA_ZPL_PARENT attribute is followed upwards:
+ * EINVAL is returned when tdzp is szp itself or szp is met on the way
+ * up; the walk ends successfully at the filesystem root or at sdzp.
+ * Errors from sa_lookup() and zfs_zget() are returned unchanged, except
+ * that ENOTDIR panics.  Every intermediate znode obtained through
+ * zfs_zget() is released again; tdzp itself is never released here.
+ *
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if a last reference to a vnode is dropped.
+ * (The macro actually used below is VN_RELE_ASYNC.)
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+	zfsvfs_t *zfsvfs;
+	znode_t *zp, *zp1;
+	uint64_t parent;
+	int error;
+
+	zfsvfs = tdzp->z_zfsvfs;
+	if (tdzp == szp)
+		return (SET_ERROR(EINVAL));
+	if (tdzp == sdzp)
+		return (0);
+	if (tdzp->z_id == zfsvfs->z_root)
+		return (0);
+	zp = tdzp;
+	for (;;) {
+		ASSERT(!zp->z_unlinked);
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+			break;
+
+		if (parent == szp->z_id) {
+			error = SET_ERROR(EINVAL);
+			break;
+		}
+		if (parent == zfsvfs->z_root)
+			break;
+		if (parent == sdzp->z_id)
+			break;
+
+		error = zfs_zget(zfsvfs, parent, &zp1);
+		if (error != 0)
+			break;
+
+		if (zp != tdzp)
+			VN_RELE_ASYNC(ZTOV(zp),
+			    dsl_pool_zrele_taskq(
+			    dmu_objset_pool(zfsvfs->z_os)));
+		zp = zp1;
+	}
+
+	if (error == ENOTDIR)
+		panic("checkpath: .. not a directory\n");
+	if (zp != tdzp)
+		VN_RELE_ASYNC(ZTOV(zp),
+		    dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+	return (error);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory.  Change the entry name as indicated.
+ *
+ *	IN:	sdvp	- Source directory containing the "old entry".
+ *		snm	- Old entry name.
+ *		tdvp	- Target directory to contain the "new entry".
+ *		tnm	- New entry name.
+ *		cr	- credentials of caller.
+ *		ct	- caller context
+ *		flags	- case flags
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	sdvp,tdvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+static int
+zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+    vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+    cred_t *cr, int log)
+{
+	zfsvfs_t *zfsvfs;
+	znode_t *sdzp, *tdzp, *szp, *tzp;
+	zilog_t *zilog = NULL;
+	dmu_tx_t *tx;
+	char *snm = scnp->cn_nameptr;
+	char *tnm = tcnp->cn_nameptr;
+	int error = 0;
+	/*
+	 * NOTE(review): the header comment above predates this signature.
+	 * The old and new names come from scnp and tcnp (snm and tnm here);
+	 * the source and target entries are passed by reference in svpp and
+	 * tvpp and may be replaced by zfs_rename_relock().  'log' is never
+	 * referenced in this function.  'ct' is not declared here although
+	 * it is handed to the vnevent_rename_ calls below -- presumably
+	 * those are macros that discard their arguments; confirm.
+	 */
+	/* Reject renames across filesystems. */
+	if ((*svpp)->v_mount != tdvp->v_mount ||
+	    ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
+	if (zfsctl_is_node(tdvp)) {
+		error = SET_ERROR(EXDEV);
+		goto out;
+	}
+
+	/*
+	 * Lock all four vnodes to ensure safety and semantics of renaming.
+	 */
+	error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+	if (error != 0) {
+		/* no vnodes are locked in the case of error here */
+		return (error);
+	}
+
+	tdzp = VTOZ(tdvp);
+	sdzp = VTOZ(sdvp);
+	zfsvfs = tdzp->z_zfsvfs;
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * After we re-enter ZFS_ENTER() we will have to revalidate all
+	 * znodes involved.
+	 */
+	ZFS_ENTER(zfsvfs);
+
+	if (zfsvfs->z_utf8 && u8_validate(tnm,
+	    strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		error = SET_ERROR(EILSEQ);
+		goto unlockout;
+	}
+
+	/* If source and target are the same file, there is nothing to do. */
+	if ((*svpp) == (*tvpp)) {
+		error = 0;
+		goto unlockout;
+	}
+
+	if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+	    ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+	    (*tvpp)->v_mountedhere != NULL)) {
+		error = SET_ERROR(EXDEV);
+		goto unlockout;
+	}
+
+	/*
+	 * We can not use ZFS_VERIFY_ZP() here because it could directly return
+	 * bypassing the cleanup code in the case of an error.
+	 */
+	if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
+	}
+
+	szp = VTOZ(*svpp);
+	tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+	if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+		error = SET_ERROR(EIO);
+		goto unlockout;
+	}
+
+	/*
+	 * This is to prevent the creation of links into attribute space
+	 * by renaming a linked file into/outof an attribute directory.
+	 * See the comment in zfs_link() for why this is considered bad.
+	 */
+	if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+		error = SET_ERROR(EINVAL);
+		goto unlockout;
+	}
+
+	/*
+	 * If we are using project inheritance, means if the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+	 * such case, we only allow renames into our tree when the project
+	 * IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		error = SET_ERROR(EXDEV);
+		goto unlockout;
+	}
+
+	/*
+	 * Must have write access at the source to remove the old entry
+	 * and write access at the target to create the new entry.
+	 * Note that if target and source are the same, this can be
+	 * done in a single check.
+	 */
+	if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+		goto unlockout;
+
+	if ((*svpp)->v_type == VDIR) {
+		/*
+		 * Avoid ".", "..", and aliases of "." for obvious reasons.
+		 */
+		if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+		    sdzp == szp ||
+		    (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+			error = EINVAL;	/* NOTE(review): no SET_ERROR() */
+			goto unlockout;
+		}
+
+		/*
+		 * Check to make sure rename is valid.
+		 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+		 */
+		if ((error = zfs_rename_check(szp, sdzp, tdzp)))
+			goto unlockout;
+	}
+
+	/*
+	 * Does target exist?
+	 */
+	if (tzp) {
+		/*
+		 * Source and target must be the same type.
+		 */
+		if ((*svpp)->v_type == VDIR) {
+			if ((*tvpp)->v_type != VDIR) {
+				error = SET_ERROR(ENOTDIR);
+				goto unlockout;
+			} else {
+				cache_purge(tdvp);
+				if (sdvp != tdvp)
+					cache_purge(sdvp);
+			}
+		} else {
+			if ((*tvpp)->v_type == VDIR) {
+				error = SET_ERROR(EISDIR);
+				goto unlockout;
+			}
+		}
+	}
+
+	vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
+	if (tzp)
+		vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
+
+	/*
+	 * notify the target directory if it is not the same
+	 * as source directory.
+	 */
+	if (tdvp != sdvp) {
+		vnevent_rename_dest_dir(tdvp, ct);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+	if (sdzp != tdzp) {
+		dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tdzp);
+	}
+	if (tzp) {
+		dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, tzp);
+	}
+
+	zfs_sa_upgrade_txholds(tx, szp);
+	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		goto unlockout;
+	}
+
+
+	if (tzp)	/* Attempt to remove the existing target */
+		error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
+
+	if (error == 0) {
+		error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
+		if (error == 0) {
+			szp->z_pflags |= ZFS_AV_MODIFIED;
+
+			error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+			    (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+			ASSERT0(error);
+
+			error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+			    NULL);
+			if (error == 0) {
+				zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+				    snm, tdzp, tnm, szp);
+
+				/*
+				 * Update path information for the target vnode
+				 */
+				vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
+			} else {
+				/*
+				 * At this point, we have successfully created
+				 * the target name, but have failed to remove
+				 * the source name.  Since the create was done
+				 * with the ZRENAMING flag, there are
+				 * complications; for one, the link count is
+				 * wrong.  The easiest way to deal with this
+				 * is to remove the newly created target, and
+				 * return the original error.  This must
+				 * succeed; fortunately, it is very unlikely to
+				 * fail, since we just created it.
+				 */
+				VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
+				    ZRENAMING, NULL), ==, 0);
+			}
+		}
+		if (error == 0) {
+			cache_purge(*svpp);
+			if (*tvpp != NULL)
+				cache_purge(*tvpp);
+			cache_purge_negative(tdvp);
+		}
+	}
+
+	dmu_tx_commit(tx);	/* committed even when a step above failed */
+
+unlockout:			/* all 4 vnodes are locked, ZFS_ENTER called */
+	ZFS_EXIT(zfsvfs);
+	VOP_UNLOCK1(*svpp);
+	VOP_UNLOCK1(sdvp);
+
+out:	/*
+	 * original two vnodes are locked
+	 *
+	 * The two early "goto out" paths get here before zfsvfs has been
+	 * assigned, but always with error set to EXDEV, so the error == 0
+	 * test below keeps zfsvfs and zilog from being dereferenced.
+	 */
+	if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	if (*tvpp != NULL)
+		VOP_UNLOCK1(*tvpp);
+	if (tdvp != *tvpp)
+		VOP_UNLOCK1(tdvp);
+	return (error);
+}
+/*
+ * Rename entry 'sname' of directory sdzp to 'tname' in directory tdzp.
+ *
+ * Both names are looked up with zfs_lookup_internal() (DELETE for the
+ * source, RENAME for the target) and the resulting vnodes and component
+ * names are handed to zfs_rename_().  An EJUSTRETURN result from the
+ * target lookup is treated as "no existing target" (tvp stays NULL).
+ * sdvp is unlocked after the source lookup unless the filesystem is in
+ * replay (z_replay), and tdvp is locked here before the target lookup.
+ * Whatever references are left in svp/tvp are released before returning.
+ * 'flags' is not used.  Returns 0 or an error number.
+ */
+int
+zfs_rename(znode_t *sdzp, char *sname, znode_t *tdzp, char *tname,
+    cred_t *cr, int flags)
+{
+	struct componentname scn, tcn;
+	vnode_t *sdvp, *tdvp;
+	vnode_t *svp, *tvp;
+	int error;
+	svp = tvp = NULL;
+
+	sdvp = ZTOV(sdzp);
+	tdvp = ZTOV(tdzp);
+	error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE);
+	if (sdzp->z_zfsvfs->z_replay == B_FALSE)
+		VOP_UNLOCK1(sdvp);
+	if (error != 0)
+		goto fail;
+	VOP_UNLOCK1(svp);
+
+	vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY);	/* result not checked */
+	error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME);
+	if (error == EJUSTRETURN)
+		tvp = NULL;
+	else if (error != 0) {
+		VOP_UNLOCK1(tdvp);
+		goto fail;
+	}
+
+	error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0);
+fail:
+	if (svp != NULL)
+		vrele(svp);
+	if (tvp != NULL)
+		vrele(tvp);
+
+	return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ *	IN:	dzp	- Directory to contain new symbolic link.
+ *		name	- Name for new symlink entry.
+ *		vap	- Attributes of new entry (va_type must be VLNK).
+ *		link	- Path the symlink points to; at most MAXPATHLEN
+ *			  bytes, otherwise ENAMETOOLONG is returned.
+ *		cr	- credentials of caller.
+ *		flags	- case flags (not referenced by this function)
+ *
+ *	OUT:	zpp	- znode of the newly created symlink.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+    const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	len = strlen(link);
+	int		error;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
+	uint64_t	txtype = TX_SYMLINK;
+
+	ASSERT(vap->va_type == VLNK);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (len > MAXPATHLEN) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+	    0 /* projid */)) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	getnewvnode_reserve_();	/* paired with getnewvnode_drop_reserve() */
+	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	if (zp->z_is_sa)
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    __DECONST(void *, link), len, tx);
+	else
+		zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+	/*
+	 * NOTE(review): a failure of the sa_update() above is what this
+	 * function eventually returns, yet the size update, the directory
+	 * entry, the log record and the commit below are all still carried
+	 * out, and their own results are discarded.
+	 */
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
+	/*
+	 * Insert the new object into the directory.
+	 */
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+	zfs_log_symlink(zilog, tx, txtype, dzp, zp,
+	    __DECONST(char *, name), __DECONST(char *, link));
+	*zpp = zp;
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * The path is read from the SA_ZPL_SYMLINK attribute when the znode
+ * is marked z_is_sa, and through zfs_sa_readlink() otherwise.
+ *
+ *	IN:	vp	- vnode of symbolic link.
+ *		uio	- structure to contain the link path.
+ *		cr	- credentials of caller (not referenced).
+ *		ct	- caller context (not referenced).
+ *
+ *	OUT:	uio	- structure containing the link path.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (zp->z_is_sa)
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);	/* stamped even on error */
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdzp referencing szp.
+ *
+ *	IN:	tdzp	- Directory to contain new entry.
+ *		szp	- znode of new entry; must not be a directory
+ *			  (EPERM).
+ *		name	- name of new entry.
+ *		cr	- credentials of caller.
+ *		flags	- not referenced by this function.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	tdzp - ctime|mtime updated
+ *	 szp - ctime updated
+ *
+ * NOTE(review): 'ct' is passed to vnevent_link() below but is not declared
+ * in this function -- presumably vnevent_link() is a macro that ignores
+ * its arguments; confirm.
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+    int flags)
+{
+	znode_t		*tzp;
+	zfsvfs_t	*zfsvfs = tdzp->z_zfsvfs;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	int		error;
+	uint64_t	parent;
+	uid_t		owner;
+
+	ASSERT(ZTOV(tdzp)->v_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(tdzp);
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (ZTOV(szp)->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	ZFS_VERIFY_ZP(szp);
+
+	/*
+	 * If we are using project inheritance, means if the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories will inherit
+	 * not only the project ID, but also the ZFS_PROJINHERIT flag. Under
+	 * such case, we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	if (szp->z_pflags & (ZFS_APPENDONLY |
+	    ZFS_IMMUTABLE | ZFS_READONLY)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(name,
+	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+
+	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW);
+	if (error) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+	dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+	zfs_sa_upgrade_txholds(tx, szp);
+	zfs_sa_upgrade_txholds(tx, tdzp);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	error = zfs_link_create(tdzp, name, szp, tx, 0);
+
+	if (error == 0) {
+		uint64_t txtype = TX_LINK;
+		zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+	}
+
+	dmu_tx_commit(tx);	/* committed whether or not the link was made */
+
+	if (error == 0) {
+		vnevent_link(ZTOV(szp), ct);
+	}
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Free or allocate space in a file.  Currently, this function only
+ * supports the `F_FREESP' command.  However, this command is somewhat
+ * misnamed, as its functionality includes the ability to allocate as
+ * well as free space.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		cmd	- action to take (only F_FREESP supported).
+ *		bfp	- section of file to free/alloc; l_start is the
+ *			  offset and l_len the length, which must not be
+ *			  negative.
+ *		flag	- current file open mode flags.
+ *		offset	- current file offset (not referenced by this
+ *			  function).
+ *		cr	- credentials of caller.
+ *
+ *	RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	zp - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag,
+    offset_t offset, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	uint64_t off, len;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (cmd != F_FREESP) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Callers might not be able to detect properly that we are read-only,
+	 * so check it explicitly here.
+	 */
+	if (zfs_is_readonly(zfsvfs)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EROFS));
+	}
+
+	if (bfp->l_len < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * Permissions aren't checked on Solaris because on this OS
+	 * zfs_space() can only be called with an opened file handle.
+	 * On Linux we can get here through truncate_range() which
+	 * operates directly on inodes, so we need to check access rights.
+	 *
+	 * NOTE(review): this wording appears to be inherited from another
+	 * platform's copy of the function; confirm which callers make the
+	 * check necessary here.
+	 */
+	if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	off = bfp->l_start;
+	len = bfp->l_len; /* 0 means from off to end of file */
+
+	error = zfs_freesp(zp, off, len, flag, TRUE);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+/*
+ * Inactive processing for vp, done while holding
+ * z_teardown_inactive_lock as a reader.
+ *
+ * The vnode is handed to vrecycle() when its znode has no SA handle
+ * left or has been unlinked.  Otherwise, if z_atime_dirty is set, the
+ * in-core atime is written to SA_ZPL_ATIME in a transaction of its own.
+ * The function returns nothing: when dmu_tx_assign() fails the
+ * transaction is aborted and z_atime_dirty stays set.
+ * cr and ct are not referenced.
+ */
+/*ARGSUSED*/
+void
+zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
+{
+	znode_t	*zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL) {
+		/*
+		 * The fs has been unmounted, or we did a
+		 * suspend/resume and this file no longer exists.
+		 */
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+
+	if (zp->z_unlinked) {
+		/*
+		 * Fast path to recycle a vnode of a removed file.
+		 */
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		vrecycle(vp);
+		return;
+	}
+
+	if (zp->z_atime_dirty && zp->z_unlinked == 0) {	/* re-tested */
+		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
+
+		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+		zfs_sa_upgrade_txholds(tx, zp);
+		error = dmu_tx_assign(tx, TXG_WAIT);
+		if (error) {
+			dmu_tx_abort(tx);
+		} else {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
+			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
+			zp->z_atime_dirty = 0;
+			dmu_tx_commit(tx);
+		}
+	}
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+}
+
+/* zfs_fid() below writes either ZFS fid layout over a struct fid. */
+CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid));
+CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid));
+/*
+ * Fill in the file identifier fidp for vp.
+ *
+ * The object number and the low 32 bits of the SA_ZPL_GEN attribute
+ * are stored least-significant byte first; a generation of 0 is
+ * replaced by 1.  When z_parent differs from zfsvfs the long layout is
+ * used, which additionally carries the objset id and a zero-filled
+ * zf_setgen.  Returns 0, or the error from the generation lookup.
+ * ct is not referenced.
+ */
+/*ARGSUSED*/
+static int
+zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	uint32_t	gen;
+	uint64_t	gen64;
+	uint64_t	object = zp->z_id;
+	zfid_short_t	*zfid;
+	int		size, i, error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
+	    &gen64, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	gen = (uint32_t)gen64;
+
+	size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
+	fidp->fid_len = size;
+
+	zfid = (zfid_short_t *)fidp;
+
+	zfid->zf_len = size;
+
+	for (i = 0; i < sizeof (zfid->zf_object); i++)
+		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
+
+	/* Must have a non-zero generation number to distinguish from .zfs */
+	if (gen == 0)
+		gen = 1;
+	for (i = 0; i < sizeof (zfid->zf_gen); i++)
+		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
+
+	if (size == LONG_FID_LEN) {
+		uint64_t	objsetid = dmu_objset_id(zfsvfs->z_os);
+		zfid_long_t	*zlfid;
+
+		zlfid = (zfid_long_t *)fidp;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
+
+		/* XXX - this should be the generation number for the objset */
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			zlfid->zf_setgen[i] = 0;
+	}
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+/*
+ * Report the filesystem limit selected by cmd through *valp.
+ *
+ * Handles _PC_LINK_MAX, _PC_FILESIZEBITS, _PC_MIN_HOLE_SIZE,
+ * _PC_ACL_EXTENDED, _PC_ACL_NFS4 and _PC_ACL_PATH_MAX and returns 0 for
+ * them; every other cmd gets EOPNOTSUPP with *valp left untouched.
+ * vp, cr and ct are not referenced.
+ */
+static int
+zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
+    caller_context_t *ct)
+{
+
+	switch (cmd) {
+	case _PC_LINK_MAX:
+		*valp = MIN(LONG_MAX, ZFS_LINK_MAX);
+		return (0);
+
+	case _PC_FILESIZEBITS:
+		*valp = 64;
+		return (0);
+	case _PC_MIN_HOLE_SIZE:
+		*valp = (int)SPA_MINBLOCKSIZE;
+		return (0);
+	case _PC_ACL_EXTENDED:
+		*valp = 0;
+		return (0);
+
+	case _PC_ACL_NFS4:
+		*valp = 1;
+		return (0);
+
+	case _PC_ACL_PATH_MAX:
+		*valp = ACL_MAX_ENTRIES;
+		return (0);
+
+	default:
+		return (EOPNOTSUPP);
+	}
+}
+/*
+ * Read the ACL of vp into vsecp via zfs_getacl().  ATTR_NOACLCHECK in
+ * flag is forwarded as the skipaclchk argument.  ct is not referenced.
+ */
+/*ARGSUSED*/
+static int
+zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+	error = zfs_getacl(zp, vsecp, skipaclchk, cr);
+	ZFS_EXIT(zfsvfs);
+
+	return (error);
+}
+/*
+ * Replace the ACL of zp with vsecp via zfs_setacl().  ATTR_NOACLCHECK
+ * in flag is forwarded as the skipaclchk argument.  With sync=always
+ * the intent log is committed before returning, whether or not
+ * zfs_setacl() succeeded.
+ */
+/*ARGSUSED*/
+int
+zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+	boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
+	zilog_t	*zilog = zfsvfs->z_log;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	error = zfs_setacl(zp, vsecp, skipaclchk, cr);
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+/*
+ * Read the 'count' pages in ma[] of vp from the DMU.
+ *
+ * A reader range lock is taken over the block-aligned span that
+ * contains the pages.  The request is refused with zfs_vm_pagerret_bad
+ * when the last page starts at or beyond the VM object's size.
+ *
+ * rbehind and rahead are optional (may be NULL) in/out page counts: the
+ * value passed in is an upper bound, further limited to the enclosing
+ * block (and, for rahead, to the page-rounded object size); on success
+ * they are overwritten with the counts left by dmu_read_pages().
+ *
+ * Returns zfs_vm_pagerret_ok on success and zfs_vm_pagerret_error when
+ * dmu_read_pages() fails.
+ */
+static int
+zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind,
+    int *rahead)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zp->z_zfsvfs->z_os;
+	zfs_locked_range_t *lr;
+	vm_object_t object;
+	off_t start, end, obj_size;
+	uint_t blksz;
+	int pgsin_b, pgsin_a;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	start = IDX_TO_OFF(ma[0]->pindex);
+	end = IDX_TO_OFF(ma[count - 1]->pindex + 1);	/* one past last page */
+
+	/*
+	 * Lock a range covering all required and optional pages.
+	 * Note that we need to handle the case of the block size growing.
+	 */
+	for (;;) {
+		blksz = zp->z_blksz;
+		lr = zfs_rangelock_enter(&zp->z_rangelock,
+		    rounddown(start, blksz),
+		    roundup(end, blksz) - rounddown(start, blksz), RL_READER);
+		if (blksz == zp->z_blksz)
+			break;
+		zfs_rangelock_exit(lr);	/* block size changed; lock again */
+	}
+
+	object = ma[0]->object;
+	zfs_vmobject_wlock(object);
+	obj_size = object->un_pager.vnp.vnp_size;
+	zfs_vmobject_wunlock(object);
+	if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) {
+		zfs_rangelock_exit(lr);
+		ZFS_EXIT(zfsvfs);
+		return (zfs_vm_pagerret_bad);
+	}
+
+	pgsin_b = 0;
+	if (rbehind != NULL) {
+		pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz));
+		pgsin_b = MIN(*rbehind, pgsin_b);
+	}
+
+	pgsin_a = 0;
+	if (rahead != NULL) {
+		pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end);
+		if (end + IDX_TO_OFF(pgsin_a) >= obj_size)
+			pgsin_a = OFF_TO_IDX(round_page(obj_size) - end);
+		pgsin_a = MIN(*rahead, pgsin_a);
+	}
+
+	/*
+	 * NB: we need to pass the exact byte size of the data that we expect
+	 * to read after accounting for the file size.  This is required because
+	 * ZFS will panic if we request DMU to read beyond the end of the last
+	 * allocated block.
+	 */
+	error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a,
+	    MIN(end, obj_size) - (end - PAGE_SIZE));
+
+	zfs_rangelock_exit(lr);
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+	ZFS_EXIT(zfsvfs);
+
+	if (error != 0)
+		return (zfs_vm_pagerret_error);
+
+	VM_CNT_INC(v_vnodein);
+	VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a);
+	if (rbehind != NULL)
+		*rbehind = pgsin_b;
+	if (rahead != NULL)
+		*rahead = pgsin_a;
+	return (zfs_vm_pagerret_ok);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getpages_args {
+	struct vnode *a_vp;
+	vm_page_t *a_m;
+	int a_count;
+	int *a_rbehind;
+	int *a_rahead;
+};
+#endif
+/*
+ * Adapter that unpacks a struct vop_getpages_args and calls
+ * zfs_getpages() with its fields, returning that function's result.
+ */
+static int
+zfs_freebsd_getpages(struct vop_getpages_args *ap)
+{
+
+	return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind,
+	    ap->a_rahead));
+}
+
+static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+    int *rtvals)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	zfs_locked_range_t		*lr;
+	dmu_tx_t	*tx;
+	struct sf_buf	*sf;
+	vm_object_t	object;
+	vm_page_t	m;
+	caddr_t		va;
+	size_t		tocopy;
+	size_t		lo_len;
+	vm_ooffset_t	lo_off;
+	vm_ooffset_t	off;
+	uint_t		blksz;
+	int		ncount;
+	int		pcount;
+	int		err;
+	int		i;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	object = vp->v_object;
+	pcount = btoc(len);
+	ncount = pcount;
+
+	KASSERT(ma[0]->object == object, ("mismatching object"));
+	KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+	for (i = 0; i < pcount; i++)
+		rtvals[i] = zfs_vm_pagerret_error;
+
+	off = IDX_TO_OFF(ma[0]->pindex);
+	blksz = zp->z_blksz;
+	lo_off = rounddown(off, blksz);
+	lo_len = roundup(len + (off - lo_off), blksz);
+	lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER);
+
+	zfs_vmobject_wlock(object);
+	if (len + off > object->un_pager.vnp.vnp_size) {
+		if (object->un_pager.vnp.vnp_size > off) {
+			int pgoff;
+
+			len = object->un_pager.vnp.vnp_size - off;
+			ncount = btoc(len);
+			if ((pgoff = (int)len & PAGE_MASK) != 0) {
+				/*
+				 * If the
object is locked and the following + * conditions hold, then the page's dirty + * field cannot be concurrently changed by a + * pmap operation. + */ + m = ma[ncount - 1]; + vm_page_assert_sbusied(m); + KASSERT(!pmap_page_is_write_mapped(m), + ("zfs_putpages: page %p is not read-only", + m)); + vm_page_clear_dirty(m, pgoff, PAGE_SIZE - + pgoff); + } + } else { + len = 0; + ncount = 0; + } + if (ncount < pcount) { + for (i = ncount; i < pcount; i++) { + rtvals[i] = zfs_vm_pagerret_bad; + } + } + } + zfs_vmobject_wunlock(object); + + if (ncount == 0) + goto out; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + goto out; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz < PAGE_SIZE) { + for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { + tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; + va = zfs_map_page(ma[i], &sf); + dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + zfs_unmap_page(sf); + } + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); + } + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + /* + * XXX we should be passing a callback to undirty + * but that would make the locking messier + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, + len, 0, NULL, NULL); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); + } + dmu_tx_commit(tx); + +out: + zfs_rangelock_exit(lr); + if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + return (rtvals[0]); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_putpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; +}; +#endif + +int +zfs_freebsd_putpages(struct vop_putpages_args *ap) +{ + + return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, + ap->a_rtvals)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_bmap_args { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +}; +#endif + +static int +zfs_freebsd_bmap(struct vop_bmap_args *ap) +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + 
if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_open_args { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_open(struct vop_open_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + int error; + + error = zfs_open(&vp, ap->a_mode, ap->a_cred); + if (error == 0) + vnode_create_vobject(vp, zp->z_size, ap->a_td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_close_args { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_close(struct vop_close_args *ap) +{ + + return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_ioctl_args { + struct vnode *a_vp; + ulong_t a_command; + caddr_t a_data; + int a_fflag; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_ioctl(struct vop_ioctl_args *ap) +{ + + return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, + ap->a_fflag, ap->a_cred, NULL)); +} + +static int +ioflags(int ioflags) +{ + int flags = 0; + + if (ioflags & IO_APPEND) + flags |= FAPPEND; + if (ioflags & IO_NDELAY) + flags |= FNONBLOCK; + if (ioflags & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_read_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_read(struct vop_read_args *ap) +{ + + return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_write_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_write(struct vop_write_args *ap) +{ + + return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), + ap->a_cred)); +} + +#ifndef 
_SYS_SYSPROTO_H_ +struct vop_access_args { + struct vnode *a_vp; + accmode_t a_accmode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_access(struct vop_access_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + accmode_t accmode; + int error = 0; + + + if (ap->a_accmode == VEXEC) { + if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) + return (0); + } + + /* + * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND. + */ + accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); + if (accmode != 0) + error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); + + /* + * VADMIN has to be handled by vaccess(). + */ + if (error == 0) { + accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); + if (accmode != 0) { + error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, + zp->z_gid, accmode, ap->a_cred, NULL); + } + } + + /* + * For VEXEC, ensure that at least one execute bit is set for + * non-directories. + */ + if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && + (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { + error = EACCES; + } + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) +{ + struct componentname *cnp = ap->a_cnp; + char nm[NAME_MAX + 1]; + + ASSERT(cnp->cn_namelen < sizeof (nm)); + strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); + + return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, + cnp->cn_cred, cnp->cn_thread, 0, cached)); +} + +static int +zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) +{ + + return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; +}; +#endif + +static int 
+zfs_cache_lookup(struct vop_lookup_args *ap) +{ + zfsvfs_t *zfsvfs; + + zfsvfs = ap->a_dvp->v_mount->mnt_data; + if (zfsvfs->z_use_namecache) + return (vfs_cache_lookup(ap)); + else + return (zfs_freebsd_lookup(ap, B_FALSE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_create_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; +}; +#endif + +static int +zfs_freebsd_create(struct vop_create_args *ap) +{ + zfsvfs_t *zfsvfs; + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc, mode; + + ASSERT(cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + mode = vap->va_mode & ALLPERMS; + zfsvfs = ap->a_dvp->v_mount->mnt_data; + *ap->a_vpp = NULL; + + rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode, + &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + if (zfsvfs->z_use_namecache && + rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(ap->a_dvp, *ap->a_vpp, cnp); + + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_remove_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_remove(struct vop_remove_args *ap) +{ + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, + ap->a_cnp->cn_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; +}; +#endif + +static int +zfs_freebsd_mkdir(struct vop_mkdir_args *ap) +{ + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + *ap->a_vpp = NULL; + + rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, + ap->a_cnp->cn_cred, 0, NULL); + + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_rmdir_args 
{ + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_rmdir(struct vop_rmdir_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_readdir_args { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + ulong_t **a_cookies; +}; +#endif + +static int +zfs_freebsd_readdir(struct vop_readdir_args *ap) +{ + + return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, + ap->a_ncookies, ap->a_cookies)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_fsync_args { + struct vnode *a_vp; + int a_waitfor; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_fsync(struct vop_fsync_args *ap) +{ + + vop_stdfsync(ap); + return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_getattr(struct vop_getattr_args *ap) +{ + vattr_t *vap = ap->a_vap; + xvattr_t xvap; + ulong_t fflags = 0; + int error; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + xvap.xva_vattr.va_mask |= AT_XVATTR; + + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + XVA_SET_REQ(&xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&xvap, XAT_APPENDONLY); + XVA_SET_REQ(&xvap, XAT_NOUNLINK); + XVA_SET_REQ(&xvap, XAT_NODUMP); + XVA_SET_REQ(&xvap, XAT_READONLY); + XVA_SET_REQ(&xvap, XAT_ARCHIVE); + XVA_SET_REQ(&xvap, XAT_SYSTEM); + XVA_SET_REQ(&xvap, XAT_HIDDEN); + XVA_SET_REQ(&xvap, XAT_REPARSE); + XVA_SET_REQ(&xvap, XAT_OFFLINE); + XVA_SET_REQ(&xvap, XAT_SPARSE); + + error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); + if (error != 0) + return (error); + + /* Convert ZFS xattr into chflags. 
*/ +#define FLAG_CHECK(fflag, xflag, xfield) do { \ + if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ + fflags |= (fflag); \ +} while (0) + FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHECK(UF_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHECK(UF_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHECK(UF_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHECK(UF_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); + +#undef FLAG_CHECK + *vap = xvap.xva_vattr; + vap->va_flags = fflags; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_setattr(struct vop_setattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cred = ap->a_cred; + xvattr_t xvap; + ulong_t fflags; + uint64_t zflags; + + vattr_init_mask(vap); + vap->va_mask &= ~AT_NOSET; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + + zflags = VTOZ(vp)->z_pflags; + + if (vap->va_flags != VNOVAL) { + zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; + int error; + + if (zfsvfs->z_use_fuids == B_FALSE) + return (EOPNOTSUPP); + + fflags = vap->va_flags; + /* + * XXX KDM + * We need to figure out whether it makes sense to allow + * UF_REPARSE through, since we don't really have other + * facilities to handle reparse points and zfs_setattr() + * doesn't currently allow setting that attribute anyway. 
+ */ + if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| + UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| + UF_OFFLINE|UF_SPARSE)) != 0) + return (EOPNOTSUPP); + /* + * Unprivileged processes are not permitted to unset system + * flags, or modify flags if any system flags are set. + * Privileged non-jail processes may not modify system flags + * if securelevel > 0 and any existing system flags are set. + * Privileged jail processes behave like privileged non-jail + * processes if the PR_ALLOW_CHFLAGS permission bit is set; + * otherwise, they behave like unprivileged processes. + */ + if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || + spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { + error = securelevel_gt(cred, 0); + if (error != 0) + return (error); + } + } else { + /* + * Callers may only modify the file flags on + * objects they have VADMIN rights for. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, + curthread)) != 0) + return (error); + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | + ZFS_NOUNLINK)) { + return (EPERM); + } + if (fflags & + (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { + return (EPERM); + } + } + +#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ + if (((fflags & (fflag)) && !(zflags & (zflag))) || \ + ((zflags & (zflag)) && !(fflags & (fflag)))) { \ + XVA_SET_REQ(&xvap, (xflag)); \ + (xfield) = ((fflags & (fflag)) != 0); \ + } \ +} while (0) + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. 
*/ + FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); +#undef FLAG_CHANGE + } + if (vap->va_birthtime.tv_sec != VNOVAL) { + xvap.xva_vattr.va_mask |= AT_XVATTR; + XVA_SET_REQ(&xvap, XAT_CREATETIME); + } + return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_rename_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; +}; +#endif + +static int +zfs_freebsd_rename(struct vop_rename_args *ap) +{ + vnode_t *fdvp = ap->a_fdvp; + vnode_t *fvp = ap->a_fvp; + vnode_t *tdvp = ap->a_tdvp; + vnode_t *tvp = ap->a_tvp; + int error; + + ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); + ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); + + error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, + ap->a_tcnp, ap->a_fcnp->cn_cred, 1); + + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_symlink_args { + struct vnode *a_dvp; + struct 
vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; +}; +#endif + +static int +zfs_freebsd_symlink(struct vop_symlink_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc; + + ASSERT(cnp->cn_flags & SAVENAME); + + vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ + vattr_init_mask(vap); + *ap->a_vpp = NULL; + + rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, + ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_readlink_args { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_readlink(struct vop_readlink_args *ap) +{ + + return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_link_args { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_link(struct vop_link_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *vp = ap->a_vp; + vnode_t *tdvp = ap->a_tdvp; + + if (tdvp->v_mount != vp->v_mount) + return (EXDEV); + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_link(VTOZ(tdvp), VTOZ(vp), + cnp->cn_nameptr, cnp->cn_cred, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_inactive_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + zfs_inactive(vp, ap->a_td->td_ucred, NULL); + return (0); +} + +#if __FreeBSD_version >= 1300042 +#ifndef _SYS_SYSPROTO_H_ +struct vop_need_inactive_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + bool need; + + if (!rw_tryenter(&zfsvfs->z_teardown_inactive_lock, 
RW_READER)) + return (true); + need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + + return (need); +} +#endif + +#ifndef _SYS_SYSPROTO_H_ +struct vop_reclaim_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp != NULL); + +#if __FreeBSD_version < 1300042 + /* Destroy the vm object and flush associated pages. */ + vnode_destroy_vobject(vp); +#endif + /* + * z_teardown_inactive_lock protects from a race with + * zfs_znode_dmu_fini in zfsvfs_teardown during + * force unmount. + */ + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) + zfs_znode_free(zp); + else + zfs_zinactive(zp); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + + vp->v_data = NULL; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_fid_args { + struct vnode *a_vp; + struct fid *a_fid; +}; +#endif + +static int +zfs_freebsd_fid(struct vop_fid_args *ap) +{ + + return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); +} + + +#ifndef _SYS_SYSPROTO_H_ +struct vop_pathconf_args { + struct vnode *a_vp; + int a_name; + register_t *a_retval; +} *ap; +#endif + +static int +zfs_freebsd_pathconf(struct vop_pathconf_args *ap) +{ + ulong_t val; + int error; + + error = zfs_pathconf(ap->a_vp, ap->a_name, &val, + curthread->td_ucred, NULL); + if (error == 0) { + *ap->a_retval = val; + return (error); + } + if (error != EOPNOTSUPP) + return (error); + + switch (ap->a_name) { + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + case _PC_PIPE_BUF: + if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { + *ap->a_retval = PIPE_BUF; + return (0); + } + return (EINVAL); + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * FreeBSD's extended attributes namespace defines file name prefix for ZFS' + * extended attribute 
name: + * + * NAMESPACE PREFIX + * system freebsd:system: + * user (none, can be used to access ZFS fsattr(5) attributes + * created on Solaris) + */ +static int +zfs_create_attrname(int attrnamespace, const char *name, char *attrname, + size_t size) +{ + const char *namespace, *prefix, *suffix; + + /* We don't allow '/' character in attribute name. */ + if (strchr(name, '/') != NULL) + return (EINVAL); + /* We don't allow attribute names that start with "freebsd:" string. */ + if (strncmp(name, "freebsd:", 8) == 0) + return (EINVAL); + + bzero(attrname, size); + + switch (attrnamespace) { + case EXTATTR_NAMESPACE_USER: +#if 0 + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_USER_STRING; + suffix = ":"; +#else + /* + * This is the default namespace by which we can access all + * attributes created on Solaris. + */ + prefix = namespace = suffix = ""; +#endif + break; + case EXTATTR_NAMESPACE_SYSTEM: + prefix = "freebsd:"; + namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; + suffix = ":"; + break; + case EXTATTR_NAMESPACE_EMPTY: + default: + return (EINVAL); + } + if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, + name) >= size) { + return (ENAMETOOLONG); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to retrieve a named extended attribute. + */ +static int +zfs_getextattr(struct vop_getextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + /* + * If the xattr property is off, refuse the request. 
+ */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FREAD; + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, VN_OPEN_INVFS, 0, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + if (ap->a_size != NULL) { + error = VOP_GETATTR(vp, &va, ap->a_cred); + if (error == 0) + *ap->a_size = (size_t)va.va_size; + } else if (ap->a_uio != NULL) + error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK1(vp); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to remove a named attribute. + */ +int +zfs_deleteextattr(struct vop_deleteextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + vnode_t *xvp = NULL, *vp; + int error; + + /* + * If the xattr property is off, refuse the request. 
+ */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, + UIO_SYSSPACE, attrname, xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + if (error != 0) { + ZFS_EXIT(zfsvfs); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + NDFREE(&nd, NDF_ONLY_PNBUF); + + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to set a named attribute. + */ +static int +zfs_setextattr(struct vop_setextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + /* + * If the xattr property is off, refuse the request. 
+ */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FFLAGS(O_WRONLY | O_CREAT); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, + NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + VATTR_NULL(&va); + va.va_size = 0; + error = VOP_SETATTR(vp, &va, ap->a_cred); + if (error == 0) + VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK1(vp); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to retrieve extended attributes on a vnode. + */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrprefix[16]; + uint8_t dirbuf[sizeof (struct dirent)]; + struct dirent *dp; + struct iovec aiov; + struct uio auio, *uio = ap->a_uio; + size_t *sizep = ap->a_size; + size_t plen; + vnode_t *xvp = NULL, *vp; + int done, error, eof, pos; + + /* + * If the xattr property is off, refuse the request. 
+ */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof (attrprefix)); + if (error != 0) + return (error); + plen = strlen(attrprefix); + + ZFS_ENTER(zfsvfs); + + if (sizep != NULL) + *sizep = 0; + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + /* + * ENOATTR means that the EA directory does not yet exist, + * i.e. there are no extended attributes there. + */ + if (error == ENOATTR) + error = 0; + return (error); + } + + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, + UIO_SYSSPACE, ".", xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + + do { + uint8_t nlen; + + aiov.iov_base = (void *)dirbuf; + aiov.iov_len = sizeof (dirbuf); + auio.uio_resid = sizeof (dirbuf); + error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); + done = sizeof (dirbuf) - auio.uio_resid; + if (error != 0) + break; + for (pos = 0; pos < done; ) { + dp = (struct dirent *)(dirbuf + pos); + pos += dp->d_reclen; + /* + * XXX: Temporarily we also accept DT_UNKNOWN, as this + * is what we get when attribute was created on Solaris. 
+ */ + if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) + continue; + if (plen == 0 && + strncmp(dp->d_name, "freebsd:", 8) == 0) + continue; + else if (strncmp(dp->d_name, attrprefix, plen) != 0) + continue; + nlen = dp->d_namlen - plen; + if (sizep != NULL) + *sizep += 1 + nlen; + else if (uio != NULL) { + /* + * Format of extattr name entry is one byte for + * length and the rest for name. + */ + error = uiomove(&nlen, 1, uio->uio_rw, uio); + if (error == 0) { + error = uiomove(dp->d_name + plen, nlen, + uio->uio_rw, uio); + } + if (error != 0) + break; + } + } + } while (!eof && error == 0); + + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +int +zfs_freebsd_getacl(struct vop_getacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; + if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))) + return (error); + + error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt); + if (vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +int +zfs_freebsd_setacl(struct vop_setacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + int aclbsize; /* size of acl list in bytes */ + aclent_t *aaclp; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + if (ap->a_aclp == NULL) + return (EINVAL); + + if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) + return (EINVAL); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries, + * splitting every entry into two and appending "canonical six" + * entries at 
the end. Don't allow for setting an ACL that would + * cause chmod(2) to run out of ACL entries. + */ + if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) + return (ENOSPC); + + error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); + if (error != 0) + return (error); + + vsecattr.vsa_mask = VSA_ACE; + aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + + aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); + error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); + kmem_free(aaclp, aclbsize); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_aclcheck_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +int +zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) +{ + + return (EOPNOTSUPP); +} + +static int +zfs_vptocnp(struct vop_vptocnp_args *ap) +{ + vnode_t *covered_vp; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *zp = VTOZ(vp); + int ltype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are a snapshot mounted under .zfs, run the operation + * on the covered vnode. 
+ */ + if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { + char name[MAXNAMLEN + 1]; + znode_t *dzp; + size_t len; + + error = zfs_znode_parent_and_name(zp, &dzp, name); + if (error == 0) { + len = strlen(name); + if (*ap->a_buflen < len) + error = SET_ERROR(ENOMEM); + } + if (error == 0) { + *ap->a_buflen -= len; + bcopy(name, ap->a_buf + *ap->a_buflen, len); + *ap->a_vpp = ZTOV(dzp); + } + ZFS_EXIT(zfsvfs); + return (error); + } + ZFS_EXIT(zfsvfs); + + covered_vp = vp->v_mount->mnt_vnodecovered; +#if __FreeBSD_version >= 1300045 + enum vgetstate vs = vget_prep(covered_vp); +#else + vhold(covered_vp); +#endif + ltype = VOP_ISLOCKED(vp); + VOP_UNLOCK1(vp); +#if __FreeBSD_version >= 1300045 + error = vget_finish(covered_vp, LK_SHARED, vs); +#else + error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); +#endif + if (error == 0) { + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); + vput(covered_vp); + } + vn_lock(vp, ltype | LK_RETRY); + if (VN_IS_DOOMED(vp)) + error = SET_ERROR(ENOENT); + return (error); +} + +#ifdef DIAGNOSTIC +#ifndef _SYS_SYSPROTO_H_ +struct vop_lock1_args { + struct vnode *a_vp; + int a_flags; + char *file; + int line; +}; +#endif + +static int +zfs_lock(struct vop_lock1_args *ap) +{ + vnode_t *vp; + znode_t *zp; + int err; + +#if __FreeBSD_version >= 1300064 + err = vop_lock(ap); +#else + err = vop_stdlock(ap); +#endif + if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { + vp = ap->a_vp; + zp = vp->v_data; + if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) && + zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); + } + return (err); +} +#endif + +struct vop_vector zfs_vnodeops; +struct vop_vector zfs_fifoops; +struct vop_vector zfs_shareops; + +struct vop_vector zfs_vnodeops = { + .vop_default = &default_vnodeops, + .vop_inactive = zfs_freebsd_inactive, +#if __FreeBSD_version >= 1300042 + .vop_need_inactive = zfs_freebsd_need_inactive, +#endif + 
.vop_reclaim = zfs_freebsd_reclaim, + .vop_access = zfs_freebsd_access, + .vop_allocate = VOP_EINVAL, + .vop_lookup = zfs_cache_lookup, + .vop_cachedlookup = zfs_freebsd_cachedlookup, + .vop_getattr = zfs_freebsd_getattr, + .vop_setattr = zfs_freebsd_setattr, + .vop_create = zfs_freebsd_create, + .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, + .vop_mkdir = zfs_freebsd_mkdir, + .vop_readdir = zfs_freebsd_readdir, + .vop_fsync = zfs_freebsd_fsync, + .vop_open = zfs_freebsd_open, + .vop_close = zfs_freebsd_close, + .vop_rmdir = zfs_freebsd_rmdir, + .vop_ioctl = zfs_freebsd_ioctl, + .vop_link = zfs_freebsd_link, + .vop_symlink = zfs_freebsd_symlink, + .vop_readlink = zfs_freebsd_readlink, + .vop_read = zfs_freebsd_read, + .vop_write = zfs_freebsd_write, + .vop_remove = zfs_freebsd_remove, + .vop_rename = zfs_freebsd_rename, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_bmap = zfs_freebsd_bmap, + .vop_fid = zfs_freebsd_fid, + .vop_getextattr = zfs_getextattr, + .vop_deleteextattr = zfs_deleteextattr, + .vop_setextattr = zfs_setextattr, + .vop_listextattr = zfs_listextattr, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, + .vop_getpages = zfs_freebsd_getpages, + .vop_putpages = zfs_freebsd_putpages, + .vop_vptocnp = zfs_vptocnp, +#if __FreeBSD_version >= 1300064 +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#else + .vop_lock1 = vop_lock, +#endif + .vop_unlock = vop_unlock, + .vop_islocked = vop_islocked, +#else +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#endif +#endif +}; +VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); + +struct vop_vector zfs_fifoops = { + .vop_default = &fifo_specops, + .vop_fsync = zfs_freebsd_fsync, + .vop_access = zfs_freebsd_access, + .vop_getattr = zfs_freebsd_getattr, + .vop_inactive = zfs_freebsd_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_setattr = zfs_freebsd_setattr, + .vop_write = VOP_PANIC, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_fid = 
zfs_freebsd_fid, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, +}; +VFS_VOP_VECTOR_REGISTER(zfs_fifoops); + +/* + * special share hidden files vnode operations template + */ +struct vop_vector zfs_shareops = { + .vop_default = &default_vnodeops, + .vop_access = zfs_freebsd_access, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_fid = zfs_freebsd_fid, + .vop_pathconf = zfs_freebsd_pathconf, +}; +VFS_VOP_VECTOR_REGISTER(zfs_shareops); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c new file mode 100644 index 000000000000..b9d5472b25b0 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -0,0 +1,1987 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
+ * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, + SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); + +/* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. + */ +#ifdef DEBUG +#define ZNODE_STATS +#endif /* DEBUG */ + +#ifdef ZNODE_STATS +#define ZNODE_STAT_ADD(stat) ((stat)++) +#else +#define ZNODE_STAT_ADD(stat) /* nothing */ +#endif /* ZNODE_STATS */ + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL +/* + * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to + * be freed before it can be safely accessed. + */ +krwlock_t zfsvfs_lock; + +static kmem_cache_t *znode_cache = NULL; + +extern struct vop_vector zfs_vnodeops; +extern struct vop_vector zfs_fifoops; +extern struct vop_vector zfs_shareops; + + +/* + * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on + * z_rangelock. It will modify the offset and length of the lock to reflect + * znode-specific information, and convert RL_APPEND to RL_WRITER. This is + * called with the rangelock_t's rl_lock held, which avoids races. 
+ */ +static void +zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) +{ + znode_t *zp = arg; + + /* + * If in append mode, convert to writer and lock starting at the + * current end of file. + */ + if (new->lr_type == RL_APPEND) { + new->lr_offset = zp->z_size; + new->lr_type = RL_WRITER; + } + + /* + * If we need to grow the block size then lock the whole file range. + */ + uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); + if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || + zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { + new->lr_offset = 0; + new->lr_length = UINT64_MAX; + } +} + +static int +zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) +{ + znode_t *zp = buf; + + POINTER_INVALIDATE(&zp->z_zfsvfs); + + list_link_init(&zp->z_link_node); + + mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); + + zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); + + zp->z_acl_cached = NULL; + zp->z_vnode = NULL; + zp->z_moved = 0; + return (0); +} + +/*ARGSUSED*/ +static void +zfs_znode_cache_destructor(void *buf, void *arg) +{ + znode_t *zp = buf; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + ASSERT3P(zp->z_vnode, ==, NULL); + ASSERT(!list_link_active(&zp->z_link_node)); + mutex_destroy(&zp->z_acl_lock); + zfs_rangelock_fini(&zp->z_rangelock); + + ASSERT(zp->z_acl_cached == NULL); +} + +void +zfs_znode_init(void) +{ + /* + * Initialize zcache + */ + rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); + // kmem_cache_set_move(znode_cache, zfs_znode_move); +} + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ + if (znode_cache) + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + rw_destroy(&zfsvfs_lock); +} + + +static int +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t 
*sharezp; + znode_t *zp; + int error; + + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); + sharezp->z_moved = 0; + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + sa_handle_destroy(sharezp->z_sa_hdl); + kmem_cache_free(znode_cache, sharezp); + + return (error); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. 
+ */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) +{ + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + /* + * Slap on VROOT if we are the root znode unless we are the root + * node of a snapshot mounted under .zfs. + */ + if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) + ZTOV(zp)->v_flag |= VROOT; + + vn_exists(ZTOV(zp)); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +static void +zfs_vnode_forget(vnode_t *vp) +{ + + /* copied from insmntque_stddtr */ + vp->v_data = NULL; + vp->v_op = &dead_vnodeops; + vgone(vp); + vput(vp); +} + +/* + * Construct a new znode/vnode and initialize.
+ * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + vnode_t *vp; + uint64_t mode; + uint64_t parent; +#ifdef notyet + uint64_t mtime[2], ctime[2]; +#endif + uint64_t projid = ZFS_DEFAULT_PROJID; + sa_bulk_attr_t bulk[9]; + int count = 0; + int error; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + +#if __FreeBSD_version >= 1300076 + KASSERT(curthread->td_vp_reserved != NULL, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); +#else + KASSERT(curthread->td_vp_reserv > 0, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); +#endif + error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); + if (error != 0) { + kmem_cache_free(znode_cache, zp); + return (NULL); + } + zp->z_vnode = vp; + vp->v_data = zp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. 
+ */ + zp->z_sa_hdl = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + vp = ZTOV(zp); + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); +#ifdef notyet + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); +#endif + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, 8); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zfs_vnode_forget(vp); + zp->z_vnode = NULL; + kmem_cache_free(znode_cache, zp); + return (NULL); + } + + zp->z_projid = projid; + zp->z_mode = mode; + + /* Cache the xattr parent id */ + if (zp->z_pflags & ZFS_XATTR) + zp->z_xattr_parent = parent; + + vp->v_type = IFTOVT((mode_t)mode); + + switch (vp->v_type) { + case VDIR: + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VFIFO: + vp->v_op = &zfs_fifoops; + break; + case VREG: + if (parent == zfsvfs->z_shares_dir) { + ASSERT(zp->z_uid == 0 && zp->z_gid == 0); + vp->v_op = &zfs_shareops; + } + 
break; + default: + break; + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * Acquire vnode lock before making it available to the world. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VN_LOCK_AREC(vp); + if (vp->v_type != VFIFO) + VN_LOCK_ASHARE(vp); + + return (zp); +} + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. 
+ * + * OUT: zpp - allocated znode + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + dmu_buf_t *db; + timestruc_t now; + uint64_t gen, obj; + int err; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + + ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE)); + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + vfs_timestamp(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. 
+ */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + /* + * No execs denied will be determined when zfs_mode_compute() is called.
+ */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 
8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + ASSERT(*zpp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. 
+ */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx); + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + } + if (!(flag & IS_ROOT_NODE)) { + vnode_t *vp; + + vp = ZTOV(*zpp); + vp->v_vflag |= VV_FORCEINSMQ; + err = insmntque(vp, zfsvfs->z_vfs); + vp->v_vflag &= ~VV_FORCEINSMQ; + KASSERT(err == 0, ("insmntque() failed: error %d", err)); + } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. + */ +void +zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + xoptattr_t *xoap; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { + uint64_t times[2]; + ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs), + &times, sizeof (times), tx); + XVA_SET_RTN(xvap, XAT_CREATETIME); + } + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_READONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + if
(XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, + xoap->xoa_av_quarantined, zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + zfs_sa_set_scanstamp(zp, xvap, tx); + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, + zp->z_pflags, tx); + XVA_SET_RTN(xvap, XAT_SPARSE); + } +} + +int +zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) +{ + dmu_object_info_t doi; + dmu_buf_t *db; + znode_t *zp; + vnode_t *vp; + sa_handle_t *hdl; + struct thread *td; + int locked; + int err; + + td = curthread; + getnewvnode_reserve_(); +again: + *zpp = NULL; + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + 
return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + ASSERT3P(zp, !=, NULL); + ASSERT3U(zp->z_id, ==, obj_num); + if (zp->z_unlinked) { + err = SET_ERROR(ENOENT); + } else { + vp = ZTOV(zp); + /* + * Don't let the vnode disappear after + * ZFS_OBJ_HOLD_EXIT. + */ + VN_HOLD(vp); + *zpp = zp; + err = 0; + } + + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + if (err) { + getnewvnode_drop_reserve(); + return (err); + } + + locked = VOP_ISLOCKED(vp); + VI_LOCK(vp); + if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) { + /* + * The vnode is doomed and this thread doesn't + * hold the exclusive lock on it, so the vnode + * must be being reclaimed by another thread. + * Otherwise the doomed vnode is being reclaimed + * by this thread and zfs_zget is called from + * ZIL internals. + */ + VI_UNLOCK(vp); + + /* + * XXX vrele() locks the vnode when the last reference + * is dropped. Although in this case the vnode is + * doomed / dead and so no inactivation is required, + * the vnode lock is still acquired. That could result + * in a LOR with z_teardown_lock if another thread holds + * the vnode's lock and tries to take z_teardown_lock. + * But that is only possible if the other thread performs + * a ZFS vnode operation on the vnode. That either + * should not happen if the vnode is dead or the thread + * should also have a reference to the vnode and thus + * our reference is not last.
+ */ + VN_RELE(vp); + goto again; + } + VI_UNLOCK(vp); + getnewvnode_drop_reserve(); + return (err); + } + + /* + * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + if (err == 0) { + vnode_t *vp = ZTOV(zp); + + err = insmntque(vp, zfsvfs->z_vfs); + if (err == 0) { + vp->v_hash = obj_num; + VOP_UNLOCK1(vp); + } else { + zp->z_vnode = NULL; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + *zpp = NULL; + } + } + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + vnode_t *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + + /* + * Remove cached pages before reloading the znode, so that they are not + * lingering after we run into any error. Ideally, we should vgone() + * the vnode in case of error, but currently we cannot do that + * because of the LOR between the vnode lock and z_teardown_lock. + * So, instead, we have to "doom" the znode in the illumos style. 
+ */ + vp = ZTOV(zp); + vn_pages_remove(vp, 0, 0); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + mutex_exit(&zp->z_acl_lock); + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * It is highly improbable but still quite possible that two + * objects in different datasets are created with the 
same + * object numbers and in transaction groups with the same + * numbers. znodes corresponding to those objects would + * have the same z_id and z_gen, but their other attributes + * may be different. + * zfs recv -F may replace one of such objects with the other. + * As a result file properties recorded in the replaced + * object's vnode may no longer match the received object's + * properties. At present the only cached property is the + * files type recorded in v_type. + * So, handle this case by leaving the old vnode and znode + * disassociated from the actual object. A new vnode and a + * znode will be created if the object is accessed + * (e.g. via a look-up). The old vnode and znode will be + * recycled when the last vnode reference is dropped. + */ + if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatical removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. 
+ */ + zp->z_unlinked = (zp->z_links == 0); + if (zp->z_unlinked) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (0); + } + + zp->z_blksz = doi.doi_data_block_size; + if (zp->z_size != size) + vnode_pager_setsize(vp, zp->z_size); + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); + zfs_znode_free(zp); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t z_id = zp->z_id; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while were trying to release this znode + */ + ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. 
+ */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_rmnode(zp); + return; + } + } + + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + zfs_znode_free(zp); +} + +void +zfs_znode_free(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_sa_hdl == NULL); + zp->z_vnode = NULL; + mutex_enter(&zfsvfs->z_znodes_lock); + POINTER_INVALIDATE(&zp->z_zfsvfs); + list_remove(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes--; + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); + +} + +void +zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2], boolean_t have_tx) +{ + timestruc_t now; + + vfs_timestamp(&now); + + if (have_tx) { /* will sa_bulk_update happen really soon? */ + zp->z_atime_dirty = 0; + zp->z_seq++; + } else { + zp->z_atime_dirty = 1; + } + + if (flag & AT_ATIME) { + ZFS_TIME_ENCODE(&now, zp->z_atime); + } + + if (flag & AT_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + if (zp->z_zfsvfs->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & AT_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + if (zp->z_zfsvfs->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + + +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE); +} +/* + * Grow the block size for a file. + * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. 
+ */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT0(error); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. 
+ */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs), + &zp->z_size, sizeof (zp->z_size), tx)); + + vnode_pager_setsize(ZTOV(zp), end); + + zfs_rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Free space in a file. + * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free block in the middle of a file, + * but only at the end of a file, so this code path should + * never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); + } + + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. 
+ * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. 
+ * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int error; + int i; + znode_t *rootzp = NULL; + zfsvfs_t *zfsvfs; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. 
+ */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. 
+ */ + VATTR_NULL(&vattr); + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + rootzp->z_zfsvfs = zfsvfs; + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t 
**sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. + */ +static int +zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table, + uint64_t *pobjp, int *is_xattrdir) +{ + uint64_t parent; + uint64_t pflags; + uint64_t mode; + uint64_t parent_mode; + sa_bulk_attr_t bulk[3]; + sa_handle_t *sa_hdl; + dmu_buf_t *sa_db; + int count = 0; + int error; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL, + &parent, sizeof (parent)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL, + &pflags, sizeof (pflags)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &mode, sizeof (mode)); + + if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0) + return (error); + + /* + * When a link is removed its parent pointer is not changed and will + * be invalid. 
There are two cases where a link is removed but the + * file stays around, when it goes to the delete queue and when there + * are additional links. + */ + error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG); + if (error != 0) + return (error); + + error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode)); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + if (error != 0) + return (error); + + *is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode); + + /* + * Extended attributes can be applied to files, directories, etc. + * Otherwise the parent must be a directory. + */ + if (!*is_xattrdir && !S_ISDIR(parent_mode)) + return (SET_ERROR(EINVAL)); + + *pobjp = parent; + + return (0); +} + +/* + * Given an object number, return some zpl level statistics + */ +static int +zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table, + zfs_stat_t *sb) +{ + sa_bulk_attr_t bulk[4]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL, + &sb->zs_mode, sizeof (sb->zs_mode)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL, + &sb->zs_gen, sizeof (sb->zs_gen)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL, + &sb->zs_links, sizeof (sb->zs_links)); + SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL, + &sb->zs_ctime, sizeof (sb->zs_ctime)); + + return (sa_bulk_lookup(hdl, bulk, count)); +} + +static int +zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl, + sa_attr_type_t *sa_table, char *buf, int len) +{ + sa_handle_t *sa_hdl; + sa_handle_t *prevhdl = NULL; + dmu_buf_t *prevdb = NULL; + dmu_buf_t *sa_db = NULL; + char *path = buf + len - 1; + int error; + + *path = '\0'; + sa_hdl = hdl; + + uint64_t deleteq_obj; + VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + error = zap_lookup_int(osp, deleteq_obj, obj); + if (error == 0) { + return (ESTALE); + } else if (error != ENOENT) { + return (error); + } + error = 0; + + for (;;) { + 
uint64_t pobj; + char component[MAXNAMELEN + 2]; + size_t complen; + int is_xattrdir; + + if (prevdb) + zfs_release_sa_handle(prevhdl, prevdb, FTAG); + + if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj, + &is_xattrdir)) != 0) + break; + + if (pobj == obj) { + if (path[0] != '/') + *--path = '/'; + break; + } + + component[0] = '/'; + if (is_xattrdir) { + (void) sprintf(component + 1, ""); + } else { + error = zap_value_search(osp, pobj, obj, + ZFS_DIRENT_OBJ(-1ULL), component + 1); + if (error != 0) + break; + } + + complen = strlen(component); + path -= complen; + ASSERT(path >= buf); + bcopy(component, path, complen); + obj = pobj; + + if (sa_hdl != hdl) { + prevhdl = sa_hdl; + prevdb = sa_db; + } + error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG); + if (error != 0) { + sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + 
zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +#ifdef _KERNEL +int +zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t parent; + int is_xattrdir; + int err; + + /* Extended attributes should not be visible as regular files. */ + if ((zp->z_pflags & ZFS_XATTR) != 0) + return (SET_ERROR(EINVAL)); + + err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, + &parent, &is_xattrdir); + if (err != 0) + return (err); + ASSERT0(is_xattrdir); + + /* No name as this is a root object. */ + if (parent == zp->z_id) + return (SET_ERROR(EINVAL)); + + err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), buf); + if (err != 0) + return (err); + err = zfs_zget(zfsvfs, parent, dzpp); + return (err); +} +#endif /* _KERNEL */ diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c new file mode 100644 index 000000000000..d6496a7eff23 --- /dev/null +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -0,0 +1,1882 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. 
+ * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. 
We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authentication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. 
+ * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearranged + * the order of level 0 blocks. Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends.
Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpreted. + * + * At the objset level, we maintain 2 separate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintained in the metadnode's bps. The second, is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaintext as + * well all that we end up with is a duplicate of the original ciphertext we + * already had. As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaintext. 
We use an HMAC + * here so that a reproducible checksum of the plaintext is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= 1 - e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +/* + * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many + * calls, to test decryption error handling code paths. 
+ */ +uint64_t zio_decrypt_fail_fraction = 0; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +static void +zio_crypt_key_destroy_early(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + bzero(&key->zk_session, sizeof (key->zk_session)); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + + freebsd_crypt_freesession(&key->zk_session); + zio_crypt_key_destroy_early(key); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech __unused; + uint_t keydata_len; + zio_crypt_info_t *ci = NULL; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + 
if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + ret = freebsd_crypt_newsession(&key->zk_session, ci, + &key->zk_current_key); + if (ret) + goto error; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech __unused; + + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, 
ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + freebsd_crypt_freesession(&key->zk_session); + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[key->zk_crypt], &key->zk_current_key); + if (ret != 0) + goto out_unlock; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* + * Copy the key's current salt into 'salt' and count one more use of it. + * Once the use count reaches ZFS_CURRENT_MAX_SALT_USES the salt is rotated + * via zio_crypt_key_change_salt(), whose error (if any) is returned. + * See comment above zfs_key_max_salt_uses definition for details. + */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +void *failed_decrypt_buf; +int failed_decrypt_size; + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the ciphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. + */ +/* + * The implementation for FreeBSD's OpenCrypto. + * + * The big difference between ICP and FOC is that FOC uses a single + * buffer for input and output. This means that (for AES-GCM, the + * only one supported right now) the source must be copied into the + * destination, and the destination must have the AAD, and the tag/MAC, + * already associated with it. (Both implementations can use a uio.) + * + * Since the auth data is part of the iovec array, all we need to know + * is the length: 0 means there's no AAD. 
+ * + */ +static int +zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, + uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, + uio_t *uio, uint_t auth_len) +{ + zio_crypt_info_t *ci; + int ret; + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + + ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, + datalen, auth_len); + if (ret != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Returning error %s\n", + __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); +#endif + ret = SET_ERROR(encrypt ? EIO : ECKSUM); + } + + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GCM at least) + * needs to logically go in front. + */ + uio_t cuio; + iovec_t iovecs[4]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* + * Since we only support one buffer, we need to copy + * the plain text (source) to the cipher buffer (dest). + * We set iovecs[0] -- the authentication data -- below. 
+ */ + bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len); + bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out, + SHA512_HMAC_KEYLEN); + iovecs[1].iov_base = keydata_out; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = hmac_keydata_out; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. + */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + + cuio.uio_iov = iovecs; + cuio.uio_iovcnt = 4; + cuio.uio_segflg = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GCM at least) + * needs to logically go in front. 
+ */ + uio_t cuio; + iovec_t iovecs[4]; + void *src, *dst; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* + * Since we only support one buffer, we need to copy + * the encrypted buffer (source) to the plain buffer + * (dest). We set iovecs[0] -- the authentication data -- + * below. + */ + dst = key->zk_master_keydata; + src = keydata; + + bcopy(src, dst, keydata_len); + + dst = key->zk_hmac_keydata; + src = hmac_keydata; + bcopy(src, dst, SHA512_HMAC_KEYLEN); + + iovecs[1].iov_base = key->zk_master_keydata; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = key->zk_hmac_keydata; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + + cuio.uio_iov = iovecs; + cuio.uio_iovcnt = 4; + cuio.uio_segflg = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + 
key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[crypt], &key->zk_current_key); + if (ret != 0) + goto error; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + crypto_mac(&key->zk_hmac_key, data, datalen, + raw_digestbuf, SHA512_DIGEST_LENGTH); + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. 
However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. + */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, 
DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. + */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. 
datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + int avoidlint = SPA_MINBLOCKSIZE; + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, avoidlint); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpreted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. 
At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, avoidlint); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. 
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + crypto_mac_update(ctx, &bab, bab_len); + + return (0); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); + + for (i = 0; 
i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. 
+ */ +/* ARGSUSED */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + struct hmac_ctx hash_ctx; + struct hmac_ctx *ctx = &hash_ctx; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + + /* calculate the portable MAC from the portable fields and metadnode */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user, group and project accounting. + * If these objects are not present, the local MAC is zeroed out. 
+ */ + if ((datalen >= OBJSET_PHYS_SIZE_V3 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE && + osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || + (datalen >= OBJSET_PHYS_SIZE_V2 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || + (datalen <= OBJSET_PHYS_SIZE_V1)) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* XXX check dnode type ... */ + /* add in fields from the user accounting dnodes */ + if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + } + + if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + } + + if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && + datalen >= OBJSET_PHYS_SIZE_V3) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_projectused_dnode); + if (ret) + goto error; + } + + crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ + if (uio->uio_iov) + kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); +} + +/* + * This function parses an uncompressed indirect 
block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. + */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); +#endif + return (SET_ERROR(ECKSUM)); + } + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. 
+ */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +/* + * Variant of zio_crypt_do_indirect_mac_checksum() that takes an abd: it + * borrows a linear copy of the abd's first datalen bytes, runs the checksum + * on that copy, and returns the borrowed buffer before returning the result. + */ +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +/* + * The OpenCrypto used in FreeBSD does not use separate source and + * destination buffers; instead, the same buffer is used. Further, to + * accommodate some of the drivers, the authbuf needs to be logically before + * the data. This means that we need to copy the source to the destination, + * and set up an extra iovec_t at the beginning to handle the authbuf. + * It also means we'll only return one uio_t, which we do via the clumsy + * ifdef in the function declaration. 
+ *
+ * On success out_uio describes the destination buffer only: iovec 0 is the
+ * AAD buffer (also returned through authbuf / auth_len), the following
+ * iovecs cover the parts of each log record that get encrypted, and the
+ * final iovec is left zeroed for the MAC. puio is not filled in.
+ */
+
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio,
+    uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len,
+    boolean_t *no_crypt)
+{
+	int ret;
+	uint64_t txtype, lr_len;
+	uint_t nr_src, nr_dst, crypt_len;
+	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+	uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp;
+	zil_chain_t *zilc;
+	lr_t *lr;
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+	/* cipherbuf always needs an extra iovec for the MAC */
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+		nr_src = 0;
+		nr_dst = 1;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+		nr_src = 1;
+		nr_dst = 0;
+	}
+
+	/*
+	 * We need at least two iovecs -- one for the AAD,
+	 * one for the MAC. This overrides the nr_dst chosen above.
+	 */
+	bcopy(src, dst, datalen);
+	nr_dst = 2;
+
+	/* find the start and end record of the log block */
+	zilc = (zil_chain_t *)src;
+	slrp = src + sizeof (zil_chain_t);
+	aadp = aadbuf;
+	blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused);
+
+	/* calculate the number of encrypted iovecs we will need */
+	for (; slrp < blkend; slrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		nr_iovecs++;
+		if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t))
+			nr_iovecs++;
+	}
+
+	/* nr_src is forced to 0, so src_iovecs is never allocated below */
+	nr_src = 0;
+	nr_dst += nr_iovecs;
+
+	/* allocate the iovec arrays */
+	if (nr_src != 0) {
+		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+		if (src_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(src_iovecs, nr_src * sizeof (iovec_t));
+	}
+
+	if (nr_dst != 0) {
+		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+		if (dst_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(dst_iovecs, nr_dst * sizeof (iovec_t));
+	}
+
+	/*
+	 * Copy the plain zil header over and authenticate everything except
+	 * the checksum that will store our MAC. If we are writing the data
+	 * the embedded checksum will not have been calculated yet, so we don't
+	 * authenticate that.
+	 */
+	bcopy(src, dst, sizeof (zil_chain_t));
+	bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t));
+	aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+	aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t);
+
+	/* loop over records again, filling in iovecs */
+	/* The first one will contain the authbuf */
+	nr_iovecs = 1;
+
+	slrp = src + sizeof (zil_chain_t);
+	dlrp = dst + sizeof (zil_chain_t);
+
+	for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) {
+		lr = (lr_t *)slrp;
+
+		if (!byteswap) {
+			txtype = lr->lrc_txtype;
+			lr_len = lr->lrc_reclen;
+		} else {
+			txtype = BSWAP_64(lr->lrc_txtype);
+			lr_len = BSWAP_64(lr->lrc_reclen);
+		}
+
+		/* copy the common lr_t */
+		bcopy(slrp, dlrp, sizeof (lr_t));
+		bcopy(slrp, aadp, sizeof (lr_t));
+		aadp += sizeof (lr_t);
+		aad_len += sizeof (lr_t);
+
+		ASSERT3P(dst_iovecs, !=, NULL);
+
+		/*
+		 * If this is a TX_WRITE record we want to encrypt everything
+		 * except the bp if exists. If the bp does exist we want to
+		 * authenticate it.
+		 */
+		if (txtype == TX_WRITE) {
+			crypt_len = sizeof (lr_write_t) -
+			    sizeof (lr_t) - sizeof (blkptr_t);
+			dst_iovecs[nr_iovecs].iov_base = (char *)dlrp +
+			    sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			/* copy the bp now since it will not be encrypted */
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    dlrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    sizeof (blkptr_t));
+			bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t),
+			    aadp, sizeof (blkptr_t));
+			aadp += sizeof (blkptr_t);
+			aad_len += sizeof (blkptr_t);
+			nr_iovecs++;
+			total_len += crypt_len;
+
+			if (lr_len != sizeof (lr_write_t)) {
+				crypt_len = lr_len - sizeof (lr_write_t);
+				dst_iovecs[nr_iovecs].iov_base = (char *)
+				    dlrp + sizeof (lr_write_t);
+				dst_iovecs[nr_iovecs].iov_len = crypt_len;
+				nr_iovecs++;
+				total_len += crypt_len;
+			}
+		} else {
+			crypt_len = lr_len - sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_base = (char *)dlrp +
+			    sizeof (lr_t);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+			nr_iovecs++;
+			total_len += crypt_len;
+		}
+	}
+
+	/* nr_iovecs restarted at 1 for the AAD slot, so this is B_FALSE */
+	*no_crypt = (nr_iovecs == 0);
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+	dst_iovecs[0].iov_base = aadbuf;
+	dst_iovecs[0].iov_len = aad_len;
+
+	out_uio->uio_iov = dst_iovecs;
+	out_uio->uio_iovcnt = nr_dst;
+
+	return (0);
+
+error:
+	zio_buf_free(aadbuf, datalen);
+	if (src_iovecs != NULL)
+		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+	if (dst_iovecs != NULL)
+		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+	*enc_len = 0;
+	*authbuf = NULL;
+	*auth_len = 0;
+	*no_crypt = B_FALSE;
+	puio->uio_iov = NULL;
+	puio->uio_iovcnt = 0;
+	out_uio->uio_iov = NULL;
+	out_uio->uio_iovcnt = 0;
+
+	return (ret);
+}
+
+/*
+ * Special case handling routine for encrypting / decrypting dnode blocks.
+ * The whole block is first copied from src to dst. out_uio then gets the AAD
+ * buffer in iovec 0, one iovec per bonus buffer that needs encrypting, and a
+ * final iovec that is left zeroed. puio is not filled in.
+ */
+static int
+zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uio_t *puio, uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	uint_t nr_src, nr_dst, crypt_len;
+	uint_t aad_len = 0, nr_iovecs = 0, total_len = 0;
+	uint_t i, j, max_dnp = datalen >> DNODE_SHIFT;
+	iovec_t *src_iovecs = NULL, *dst_iovecs = NULL;
+	uint8_t *src, *dst, *aadp;
+	dnode_phys_t *dnp, *adnp, *sdnp, *ddnp;
+	uint8_t *aadbuf = zio_buf_alloc(datalen);
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+		nr_src = 0;
+		nr_dst = 1;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+		nr_src = 1;
+		nr_dst = 0;
+	}
+
+	bcopy(src, dst, datalen);
+	nr_dst = 2;
+
+	sdnp = (dnode_phys_t *)src;
+	ddnp = (dnode_phys_t *)dst;
+	aadp = aadbuf;
+
+	/*
+	 * Count the number of iovecs we will need to do the encryption by
+	 * counting the number of bonus buffers that need to be encrypted.
+	 */
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		/*
+		 * This block may still be byteswapped. However, all of the
+		 * values we use are either uint8_t's (for which byteswapping
+		 * is a noop) or are only compared against zero, which will
+		 * work regardless of whether or not we byteswap.
+		 */
+		if (sdnp[i].dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) &&
+		    sdnp[i].dn_bonuslen != 0) {
+			nr_iovecs++;
+		}
+	}
+
+	/* nr_src is forced to 0, so src_iovecs is never allocated below */
+	nr_src = 0;
+	nr_dst += nr_iovecs;
+
+	if (nr_src != 0) {
+		src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP);
+		if (src_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(src_iovecs, nr_src * sizeof (iovec_t));
+	}
+
+	if (nr_dst != 0) {
+		dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP);
+		if (dst_iovecs == NULL) {
+			ret = SET_ERROR(ENOMEM);
+			goto error;
+		}
+		bzero(dst_iovecs, nr_dst * sizeof (iovec_t));
+	}
+
+	/* slot 0 is filled with the AAD buffer after the loop */
+	nr_iovecs = 1;
+
+	/*
+	 * Iterate through the dnodes again, this time filling in the uios
+	 * we allocated earlier. We also concatenate any data we want to
+	 * authenticate onto aadbuf.
+	 */
+	for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) {
+		dnp = &sdnp[i];
+
+		/* copy over the core fields and blkptrs (kept as plaintext) */
+		bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp);
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]),
+			    sizeof (blkptr_t));
+		}
+
+		/*
+		 * Handle authenticated data. We authenticate everything in
+		 * the dnode that can be brought over when we do a raw send.
+		 * This includes all of the core fields as well as the MACs
+		 * stored in the bp checksums and all of the portable bits
+		 * from blk_prop. We include the dnode padding here in case it
+		 * ever gets used in the future. Some dn_flags and dn_used are
+		 * not portable so we mask those values out of the
+		 * authenticated data.
+		 */
+		crypt_len = offsetof(dnode_phys_t, dn_blkptr);
+		bcopy(dnp, aadp, crypt_len);
+		adnp = (dnode_phys_t *)aadp;
+		adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK;
+		adnp->dn_used = 0;
+		aadp += crypt_len;
+		aad_len += crypt_len;
+
+		for (j = 0; j < dnp->dn_nblkptr; j++) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, &dnp->dn_blkptr[j]);
+		}
+
+		if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+			zio_crypt_bp_do_aad_updates(&aadp, &aad_len,
+			    version, byteswap, DN_SPILL_BLKPTR(dnp));
+		}
+
+		/*
+		 * If this bonus buffer needs to be encrypted, we prepare an
+		 * iovec_t. The encryption / decryption functions will fill
+		 * this in for us with the encrypted or decrypted data.
+		 * Otherwise we add the bonus buffer to the authenticated
+		 * data buffer and copy it over to the destination. The
+		 * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that
+		 * we can guarantee alignment with the AES block size
+		 * (128 bits).
+		 */
+		crypt_len = DN_MAX_BONUS_LEN(dnp);
+		if (dnp->dn_type != DMU_OT_NONE &&
+		    DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) &&
+		    dnp->dn_bonuslen != 0) {
+			ASSERT3U(nr_iovecs, <, nr_dst);
+			ASSERT3P(dst_iovecs, !=, NULL);
+			dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]);
+			dst_iovecs[nr_iovecs].iov_len = crypt_len;
+
+			nr_iovecs++;
+			total_len += crypt_len;
+		} else {
+			bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len);
+			bcopy(DN_BONUS(dnp), aadp, crypt_len);
+			aadp += crypt_len;
+			aad_len += crypt_len;
+		}
+	}
+
+	/* nr_iovecs restarted at 1 for the AAD slot, so this is B_FALSE */
+	*no_crypt = (nr_iovecs == 0);
+	*enc_len = total_len;
+	*authbuf = aadbuf;
+	*auth_len = aad_len;
+
+	dst_iovecs[0].iov_base = aadbuf;
+	dst_iovecs[0].iov_len = aad_len;
+	out_uio->uio_iov = dst_iovecs;
+	out_uio->uio_iovcnt = nr_dst;
+
+	return (0);
+
+error:
+	zio_buf_free(aadbuf, datalen);
+	if (src_iovecs != NULL)
+		kmem_free(src_iovecs, nr_src * sizeof (iovec_t));
+	if (dst_iovecs != NULL)
+		kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t));
+
+	*enc_len = 0;
+	*authbuf = NULL;
+	*auth_len = 0;
+	*no_crypt = B_FALSE;
+	out_uio->uio_iov = NULL;
+	out_uio->uio_iovcnt = 0;
+
+	return (ret);
+}
+
+/*
+ * Handler for every other block type: the whole buffer is copied from src to
+ * dst and described by iovec 0 of a two-entry array; the second entry is left
+ * zeroed. *enc_len is set to datalen. puio is not filled in.
+ */
+/* ARGSUSED */
+static int
+zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf,
+    uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *out_uio,
+    uint_t *enc_len)
+{
+	int ret;
+	uint_t nr_plain = 1, nr_cipher = 2;
+	iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL;
+	void *src, *dst;
+
+	cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t),
+	    KM_SLEEP);
+	if (!cipher_iovecs) {
+		ret = SET_ERROR(ENOMEM);
+		goto error;
+	}
+	bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	if (encrypt) {
+		src = plainbuf;
+		dst = cipherbuf;
+	} else {
+		src = cipherbuf;
+		dst = plainbuf;
+	}
+	bcopy(src, dst, datalen);
+	cipher_iovecs[0].iov_base = dst;
+	cipher_iovecs[0].iov_len = datalen;
+
+	*enc_len = datalen;
+	out_uio->uio_iov = cipher_iovecs;
+	out_uio->uio_iovcnt = nr_cipher;
+
+	return (0);
+
+error:
+	/* plain_iovecs is never allocated in this function */
+	if (plain_iovecs != NULL)
+		kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t));
+	if (cipher_iovecs != NULL)
+		kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t));
+
+	*enc_len = 0;
+	out_uio->uio_iov = NULL;
+	out_uio->uio_iovcnt = 0;
+
+	return (ret);
+}
+
+/*
+ * This function builds up the plaintext (puio) and ciphertext (cuio) uios so
+ * that they can be used for encryption and decryption by zio_do_crypt_uio().
+ * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks
+ * requiring special handling to parse out pieces that are to be encrypted. The
+ * authbuf is used by these special cases to store additional authenticated
+ * data (AAD) for the encryption modes.
+ *
+ * The handler is chosen by object type. On success the last iovec of cuio is
+ * pointed at mac (ZIO_DATA_MAC_LEN bytes) and 0 is returned; otherwise the
+ * handler's error is returned unchanged.
+ */
+static int
+zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot,
+    uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap,
+    uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf,
+    uint_t *auth_len, boolean_t *no_crypt)
+{
+	int ret;
+	iovec_t *mac_iov;
+
+	ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE);
+
+	/* route to handler */
+	switch (ot) {
+	case DMU_OT_INTENT_LOG:
+		ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf,
+		    datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len,
+		    no_crypt);
+		break;
+	case DMU_OT_DNODE:
+		ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf,
+		    cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf,
+		    auth_len, no_crypt);
+		break;
+	default:
+		ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf,
+		    datalen, puio, cuio, enc_len);
+		/* the normal handler produces no AAD buffer */
+		*authbuf = NULL;
+		*auth_len = 0;
+		*no_crypt = B_FALSE;
+		break;
+	}
+
+	if (ret != 0)
+		goto error;
+
+	/* populate the uios */
+	cuio->uio_segflg = UIO_SYSSPACE;
+
+	/* every handler leaves the final iovec unused; it carries the MAC */
+	mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]);
+	mac_iov->iov_base = (void *)mac;
+	mac_iov->iov_len = ZIO_DATA_MAC_LEN;
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+/*
+ * Debugging aid filled in by zio_do_crypt_data(): a copy of the ciphertext
+ * from the most recent failed decryption, and its length in bytes.
+ */
+void *failed_decrypt_buf;
+int failed_decrypt_size;
+
+/*
+ * Primary encryption / decryption entrypoint for zio data.
+ *
+ * The uios built by zio_crypt_init_uios() describe the destination buffer
+ * (cipherbuf when encrypting, plainbuf when decrypting), so the crypto call
+ * works on it in place. Returns 0 on success, or the error from uio setup,
+ * hkdf_sha512() or zio_do_crypt_uio_opencrypto().
+ */
+int
+zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key,
+    dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+    uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+    boolean_t *no_crypt)
+{
+	int ret;
+	boolean_t locked = B_FALSE;
+	uint64_t crypt = key->zk_crypt;
+	uint_t keydata_len = zio_crypt_table[crypt].ci_keylen;
+	uint_t enc_len, auth_len;
+	uio_t puio, cuio;
+	uint8_t enc_keydata[MASTER_KEY_MAX_LEN];
+	crypto_key_t tmp_ckey, *ckey = NULL;
+	freebsd_crypt_session_t *tmpl = NULL;
+	uint8_t *authbuf = NULL;
+
+	bzero(&puio, sizeof (uio_t));
+	bzero(&cuio, sizeof (uio_t));
+
+#ifdef FCRYPTO_DEBUG
+	printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n",
+	    __FUNCTION__,
+	    encrypt ? "encrypt" : "decrypt",
+	    key, salt, ot, iv, mac, datalen,
+	    byteswap ? "byteswap" : "native_endian", plainbuf,
+	    cipherbuf, no_crypt);
+
+	printf("\tkey = {");
+	for (int i = 0; i < key->zk_current_key.ck_length/8; i++)
+		printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]);
+	printf("}\n");
+#endif
+	/* create uios for encryption */
+	ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf,
+	    cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len,
+	    &authbuf, &auth_len, no_crypt);
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * If the needed key is the current one, just use it. Otherwise we
+	 * need to generate a temporary one from the given salt + master key.
+	 * If we are encrypting, we must return a copy of the current salt
+	 * so that it can be stored in the blkptr_t.
+	 *
+	 * zk_salt_lock stays held (as reader) across the crypto call only
+	 * when the current key is used; the temporary-key path drops it
+	 * before deriving the key.
+	 */
+	rw_enter(&key->zk_salt_lock, RW_READER);
+	locked = B_TRUE;
+
+	if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) {
+		ckey = &key->zk_current_key;
+		tmpl = &key->zk_session;
+	} else {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+
+		ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+		    salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len);
+		if (ret != 0)
+			goto error;
+		tmp_ckey.ck_format = CRYPTO_KEY_RAW;
+		tmp_ckey.ck_data = enc_keydata;
+		tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+		ckey = &tmp_ckey;
+		tmpl = NULL;
+	}
+
+	/* perform the encryption / decryption */
+	ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt,
+	    ckey, iv, enc_len, &cuio, auth_len);
+	if (ret != 0)
+		goto error;
+	if (locked) {
+		rw_exit(&key->zk_salt_lock);
+		locked = B_FALSE;
+	}
+
+	/* authbuf was allocated with zio_buf_alloc(datalen) by the handler */
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	/* zero the derived key material held on the stack */
+	if (ckey == &tmp_ckey)
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+
+	return (0);
+
+error:
+	/* keep a copy of the ciphertext that failed to decrypt */
+	if (!encrypt) {
+		if (failed_decrypt_buf != NULL)
+			kmem_free(failed_decrypt_buf, failed_decrypt_size);
+		failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP);
+		failed_decrypt_size = datalen;
+		bcopy(cipherbuf, failed_decrypt_buf, datalen);
+	}
+	if (locked)
+		rw_exit(&key->zk_salt_lock);
+	if (authbuf != NULL)
+		zio_buf_free(authbuf, datalen);
+	if (ckey == &tmp_ckey)
+		bzero(enc_keydata, keydata_len);
+	zio_crypt_destroy_uio(&puio);
+	zio_crypt_destroy_uio(&cuio);
+	return (SET_ERROR(ret));
+}
+
+/*
+ * Simple wrapper around zio_do_crypt_data() to work with abd's instead of
+ * linear buffers.
+ */
+int
+zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot,
+    boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac,
+    uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt)
+{
+	void *plain, *cipher;
+	int err;
+
+	/* the input side uses abd_borrow_buf_copy(), the output side not */
+	if (!encrypt) {
+		plain = abd_borrow_buf(pabd, datalen);
+		cipher = abd_borrow_buf_copy(cabd, datalen);
+	} else {
+		plain = abd_borrow_buf_copy(pabd, datalen);
+		cipher = abd_borrow_buf(cabd, datalen);
+	}
+
+	err = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac,
+	    datalen, plain, cipher, no_crypt);
+
+	/* the buffers are handed back the same way on success and failure */
+	if (!encrypt) {
+		abd_return_buf_copy(pabd, plain, datalen);
+		abd_return_buf(cabd, cipher, datalen);
+	} else {
+		abd_return_buf(pabd, plain, datalen);
+		abd_return_buf_copy(cabd, cipher, datalen);
+	}
+
+	if (err != 0)
+		return (SET_ERROR(err));
+	return (0);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* BEGIN CSTYLED */
+module_param(zfs_key_max_salt_uses, ulong, 0644);
+MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value "
+	"can be used for generating encryption keys before it is rotated");
+/* END CSTYLED */
+#endif
diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c
new file mode 100644
index 000000000000..bef97a9b34ab
--- /dev/null
+++ b/module/os/freebsd/zfs/zvol_os.c
@@ -0,0 +1,1476 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2006-2010 Pawel Jakub Dawidek
+ * All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ */
+
+/* Portions Copyright 2011 Martin Matuska */
+
+/*
+ * ZFS volume emulation driver.
+ *
+ * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
+ * Volumes are accessed through the symbolic links named:
+ *
+ *	/dev/zvol/<pool_name>/<dataset_name>
+ *
+ * Volumes are persistent through reboot. No user command needs to be
+ * run before opening and using a device.
+ *
+ * On FreeBSD ZVOLs are simply GEOM providers like any other storage device
+ * in the system. Except when they're simply character devices (volmode=dev).
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zfs_namecheck.h" + +#define ZVOL_DUMPSIZE "dumpsize" + +#ifdef ZVOL_LOCK_DEBUG +#define ZVOL_RW_READER RW_WRITER +#define ZVOL_RW_READ_HELD RW_WRITE_HELD +#else +#define ZVOL_RW_READER RW_READER +#define ZVOL_RW_READ_HELD RW_READ_HELD +#endif + +enum zvol_geom_state { + ZVOL_GEOM_UNINIT, + ZVOL_GEOM_STOPPED, + ZVOL_GEOM_RUNNING, +}; + +struct zvol_state_os { + int zso_volmode; +#define zso_dev _zso_state._zso_dev +#define zso_geom _zso_state._zso_geom + union { + /* volmode=dev */ + struct zvol_state_dev { + struct cdev *zsd_cdev; + uint64_t zsd_sync_cnt; + } _zso_dev; + + /* volmode=geom */ + struct zvol_state_geom { + struct g_provider *zsg_provider; + struct bio_queue_head zsg_queue; + struct mtx zsg_queue_mtx; + enum zvol_geom_state zsg_state; + } _zso_geom; + } _zso_state; +}; + +struct proc *zfsproc; + +static uint32_t zvol_minors; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, + "Expose as GEOM providers (1), device files (2) or neither"); +static boolean_t zpool_on_zvol = B_FALSE; +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, + "Allow zpools to use zvols as vdevs (DANGEROUS)"); + +/* + * Toggle unmap functionality. + */ +boolean_t zvol_unmap_enabled = B_TRUE; + +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, + &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); + +/* + * zvol maximum transfer in one DMU tx. 
+ */ +int zvol_maxphys = DMU_MAX_ACCESS / 2; + +static void zvol_ensure_zilog(zvol_state_t *zv); + +static d_open_t zvol_cdev_open; +static d_close_t zvol_cdev_close; +static d_ioctl_t zvol_cdev_ioctl; +static d_read_t zvol_cdev_read; +static d_write_t zvol_cdev_write; +static d_strategy_t zvol_geom_bio_strategy; + +static struct cdevsw zvol_cdevsw = { + .d_name = "zvol", + .d_version = D_VERSION, + .d_flags = D_DISK | D_TRACKCLOSE, + .d_open = zvol_cdev_open, + .d_close = zvol_cdev_close, + .d_ioctl = zvol_cdev_ioctl, + .d_read = zvol_cdev_read, + .d_write = zvol_cdev_write, + .d_strategy = zvol_geom_bio_strategy, +}; + +extern uint_t zfs_geom_probe_vdev_key; + +struct g_class zfs_zvol_class = { + .name = "ZFS::ZVOL", + .version = G_VERSION, +}; + +DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); + +static int zvol_geom_open(struct g_provider *pp, int flag, int count); +static int zvol_geom_close(struct g_provider *pp, int flag, int count); +static void zvol_geom_run(zvol_state_t *zv); +static void zvol_geom_destroy(zvol_state_t *zv); +static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); +static void zvol_geom_worker(void *arg); +static void zvol_geom_bio_start(struct bio *bp); +static int zvol_geom_bio_getattr(struct bio *bp); +static void zvol_geom_bio_check_zilog(struct bio *bp); +/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ + +/* + * GEOM mode implementation + */ + +/*ARGSUSED*/ +static int +zvol_geom_open(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + int err = 0; + boolean_t drop_suspend = B_TRUE; + + if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { + /* + * if zfs_geom_probe_vdev_key is set, that means that zfs is + * attempting to probe geom providers while looking for a + * replacement for a missing VDEV. In this case, the + * spa_namespace_lock will not be held, but it is still illegal + * to use a zvol as a vdev. 
Deadlocks can result if another + * thread has spa_namespace_lock + */ + return (SET_ERROR(EOPNOTSUPP)); + } + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = pp->private; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flag & FWRITE)); + if (err) + goto out_mutex; + pp->mediasize = zv->zv_volsize; + pp->stripeoffset = 0; + pp->stripesize = zv->zv_volblocksize; + } + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. 
+ */ + if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || + dmu_objset_incompatible_encryption_version(zv->zv_objset))) { + err = EROFS; + goto out_open_count; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = EBUSY; + goto out_open_count; + } +#ifdef FEXCL + if (flag & FEXCL) { + if (zv->zv_open_count != 0) { + err = EBUSY; + goto out_open_count; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count += count; + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); +out_mutex: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(err)); +} + +/*ARGSUSED*/ +static int +zvol_geom_close(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = pp->private; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_open_count == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. 
+ */ + ASSERT(zv->zv_open_count > 0); + + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if ((zv->zv_open_count - count) == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. + */ + zv->zv_open_count -= count; + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static void +zvol_geom_run(zvol_state_t *zv) +{ + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + g_error_provider(pp, 0); + + kproc_kthread_add(zvol_geom_worker, zv, &zfsproc, NULL, 0, 0, + "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); +} + +static void +zvol_geom_destroy(zvol_state_t *zv) +{ + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + g_topology_assert(); + + mutex_enter(&zv->zv_state_lock); + VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); + mutex_exit(&zv->zv_state_lock); + zsg->zsg_provider = NULL; + pp->private = NULL; + g_wither_geom(pp->geom, ENXIO); +} + +static int +zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) +{ + int count, error, flags; + + g_topology_assert(); + + /* + * To make it easier we expect 
either open or close, but not both + * at the same time. + */ + KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || + (acr <= 0 && acw <= 0 && ace <= 0), + ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", + pp->name, acr, acw, ace)); + + if (pp->private == NULL) { + if (acr <= 0 && acw <= 0 && ace <= 0) + return (0); + return (pp->error); + } + + /* + * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if + * ace != 0, because GEOM already handles that and handles it a bit + * differently. GEOM allows for multiple read/exclusive consumers and + * ZFS allows only one exclusive consumer, no matter if it is reader or + * writer. I like better the way GEOM works so I'll leave it for GEOM + * to decide what to do. + */ + + count = acr + acw + ace; + if (count == 0) + return (0); + + flags = 0; + if (acr != 0 || ace != 0) + flags |= FREAD; + if (acw != 0) + flags |= FWRITE; + + g_topology_unlock(); + if (count > 0) + error = zvol_geom_open(pp, flags, count); + else + error = zvol_geom_close(pp, flags, -count); + g_topology_lock(); + return (error); +} + +static void +zvol_geom_worker(void *arg) +{ + zvol_state_t *zv = arg; + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct bio *bp; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + thread_lock(curthread); + sched_prio(curthread, PRIBIO); + thread_unlock(curthread); + + for (;;) { + mtx_lock(&zsg->zsg_queue_mtx); + bp = bioq_takefirst(&zsg->zsg_queue); + if (bp == NULL) { + if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { + zsg->zsg_state = ZVOL_GEOM_RUNNING; + wakeup(&zsg->zsg_state); + mtx_unlock(&zsg->zsg_queue_mtx); + kthread_exit(); + } + msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, + PRIBIO | PDROP, "zvol:io", 0); + continue; + } + mtx_unlock(&zsg->zsg_queue_mtx); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_geom_bio_check_zilog(bp); + switch (bp->bio_cmd) { + case BIO_FLUSH: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + break; + case BIO_READ: + 
case BIO_WRITE: + case BIO_DELETE: + zvol_geom_bio_strategy(bp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + rw_exit(&zv->zv_suspend_lock); + } +} + +static void +zvol_geom_bio_start(struct bio *bp) +{ + zvol_state_t *zv = bp->bio_to->private; + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + boolean_t first; + + if (bp->bio_cmd == BIO_GETATTR) { + if (zvol_geom_bio_getattr(bp)) + g_io_deliver(bp, EOPNOTSUPP); + return; + } + + if (!THREAD_CAN_SLEEP()) { + mtx_lock(&zsg->zsg_queue_mtx); + first = (bioq_first(&zsg->zsg_queue) == NULL); + bioq_insert_tail(&zsg->zsg_queue, bp); + mtx_unlock(&zsg->zsg_queue_mtx); + if (first) + wakeup_one(&zsg->zsg_queue); + return; + } + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_geom_bio_check_zilog(bp); + + switch (bp->bio_cmd) { + case BIO_FLUSH: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + g_io_deliver(bp, 0); + break; + case BIO_READ: + case BIO_WRITE: + case BIO_DELETE: + zvol_geom_bio_strategy(bp); + break; + default: + g_io_deliver(bp, EOPNOTSUPP); + break; + } + rw_exit(&zv->zv_suspend_lock); +} + +static int +zvol_geom_bio_getattr(struct bio *bp) +{ + zvol_state_t *zv; + + zv = bp->bio_to->private; + ASSERT(zv != NULL); + + spa_t *spa = dmu_objset_spa(zv->zv_objset); + uint64_t refd, avail, usedobjs, availobjs; + + if (g_handleattr_int(bp, "GEOM::candelete", 1)) + return (0); + if (strcmp(bp->bio_attribute, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc(spa_normal_class(spa)); + if 
(g_handleattr_off_t(bp, "poolblocksavail", + avail / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) + return (0); + } + return (1); +} + +static void +zvol_geom_bio_check_zilog(struct bio *bp) +{ + zvol_state_t *zv; + + zv = bp->bio_to->private; + ASSERT(zv != NULL); + + switch (bp->bio_cmd) { + case BIO_FLUSH: + case BIO_WRITE: + case BIO_DELETE: + zvol_ensure_zilog(zv); + default: + break; + } +} + +static void +zvol_geom_bio_strategy(struct bio *bp) +{ + zvol_state_t *zv; + uint64_t off, volsize; + size_t resid; + char *addr; + objset_t *os; + zfs_locked_range_t *lr; + int error = 0; + boolean_t doread = 0; + boolean_t is_dumpified; + boolean_t sync; + + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + goto out; + } + + if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) { + error = SET_ERROR(EROFS); + goto out; + } + + switch (bp->bio_cmd) { + case BIO_FLUSH: + goto sync; + case BIO_READ: + doread = 1; + case BIO_WRITE: + case BIO_DELETE: + break; + default: + error = EOPNOTSUPP; + goto out; + } + + off = bp->bio_offset; + volsize = zv->zv_volsize; + + os = zv->zv_objset; + ASSERT(os != NULL); + + addr = bp->bio_data; + resid = bp->bio_length; + + if (resid > 0 && (off < 0 || off >= volsize)) { + error = SET_ERROR(EIO); + goto out; + } + + is_dumpified = B_FALSE; + sync = !doread && !is_dumpified && + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + */ + lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, + doread ? 
RL_READER : RL_WRITER); + + if (bp->bio_cmd == BIO_DELETE) { + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, off, resid, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + off, resid); + resid = 0; + } + goto unlock; + } + while (resid != 0 && off < volsize) { + size_t size = MIN(resid, zvol_maxphys); + if (doread) { + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, sync); + dmu_tx_commit(tx); + } + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + off += size; + addr += size; + resid -= size; + } +unlock: + zfs_rangelock_exit(lr); + + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length && off > volsize) + error = EINVAL; + + if (sync) { +sync: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } +out: + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); +} + +/* + * Character device mode implementation + */ + +static int +zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + /* + * uio_loffset == volsize isn't an error as + * its required for EOF processing. 
+ */ + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, + uio->uio_resid, RL_READER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio->uio_loffset) + bytes = volsize - uio->uio_loffset; + + error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_rangelock_exit(lr); + + return (error); +} + +static int +zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + boolean_t sync; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + + sync = (ioflag & IO_SYNC) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + rw_exit(&zv->zv_suspend_lock); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, + uio->uio_resid, RL_WRITER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio->uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes, sync); + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + 
return (error); +} + +static int +zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + int err = 0; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flags & FWRITE)); + if (err) + goto out_locked; + } + + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = EROFS; + goto out_opened; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = EBUSY; + goto out_opened; + } +#ifdef FEXCL + if (flags & FEXCL) { + if (zv->zv_open_count != 0) { + err = EBUSY; + goto out_opened; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count++; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt++; + if (zsd->zsd_sync_cnt == 1) + zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); + } + + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); + +out_opened: + if (zv->zv_open_count == 0) + zvol_last_close(zv); +out_locked: + 
mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(err)); +} + +static int +zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_open_count == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count > 0); + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 1) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. 
+ */ + zv->zv_open_count--; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt--; + } + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static int +zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, + int fflag, struct thread *td) +{ + zvol_state_t *zv; + zfs_locked_range_t *lr; + off_t offset, length; + int i, error; + boolean_t sync; + + zv = dev->si_drv2; + + error = 0; + KASSERT(zv->zv_open_count > 0, + ("Device with zero access count in %s", __func__)); + + i = IOCPARM_LEN(cmd); + switch (cmd) { + case DIOCGSECTORSIZE: + *(uint32_t *)data = DEV_BSIZE; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = zv->zv_volsize; + break; + case DIOCGFLUSH: + if (zv->zv_zilog != NULL) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + break; + case DIOCGDELETE: + if (!zvol_unmap_enabled) + break; + + offset = ((off_t *)data)[0]; + length = ((off_t *)data)[1]; + if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || + offset < 0 || offset >= zv->zv_volsize || + length <= 0) { + printf("%s: offset=%jd length=%jd\n", __func__, offset, + length); + error = EINVAL; + break; + } + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, + RL_WRITER); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + sync = FALSE; + dmu_tx_abort(tx); + } else { + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + zvol_log_truncate(zv, tx, offset, length, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + offset, length); + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + break; + case DIOCGSTRIPESIZE: + *(off_t *)data = zv->zv_volblocksize; + break; + case 
DIOCGSTRIPEOFFSET: + *(off_t *)data = 0; + break; + case DIOCGATTR: { + spa_t *spa = dmu_objset_spa(zv->zv_objset); + struct diocgattr_arg *arg = (struct diocgattr_arg *)data; + uint64_t refd, avail, usedobjs, availobjs; + + if (strcmp(arg->name, "GEOM::candelete") == 0) + arg->value.i = 1; + else if (strcmp(arg->name, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = refd / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc( + spa_normal_class(spa)); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + arg->value.off = refd / DEV_BSIZE; + } else + error = ENOIOCTL; + break; + } + case FIOSEEKHOLE: + case FIOSEEKDATA: { + off_t *off = (off_t *)data; + uint64_t noff; + boolean_t hole; + + hole = (cmd == FIOSEEKHOLE); + noff = *off; + error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); + *off = noff; + break; + } + default: + error = ENOIOCTL; + } + + return (error); +} + +/* + * Misc. helpers + */ + +static void +zvol_ensure_zilog(zvol_state_t *zv) +{ + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. 
+	 */
+	if (zv->zv_zilog == NULL) {
+		rw_exit(&zv->zv_suspend_lock);
+		rw_enter(&zv->zv_suspend_lock, RW_WRITER);
+		if (zv->zv_zilog == NULL) {
+			zv->zv_zilog = zil_open(zv->zv_objset,
+			    zvol_get_data);
+			zv->zv_flags |= ZVOL_WRITTEN_TO;
+		}
+		rw_downgrade(&zv->zv_suspend_lock);
+	}
+}
+
+static boolean_t
+zvol_is_zvol_impl(const char *device)
+{
+	return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
+}
+
+/*
+ * Rename the minor node of zv to newname.
+ *
+ * The caller must hold zvol_state_lock and zv->zv_state_lock (asserted).
+ * The zvol is moved to the hash bucket of the new name, its GEOM provider
+ * or character device is torn down and recreated under the new name, and
+ * finally zv_name is updated.
+ *
+ * In ZFS_VOLMODE_DEV mode any outstanding opens are force-closed, and a
+ * failure of make_dev_s() is not reported: zsd_cdev is simply left NULL.
+ */
+static void
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
+{
+	ASSERT(RW_LOCK_HELD(&zvol_state_lock));
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	/*
+	 * Move to new hashtable entry.  zv_name still holds the old name
+	 * at this point (it is only overwritten at the end), so the hash
+	 * has to be computed from newname; hashing zv_name would re-insert
+	 * the zvol into its old bucket and lookups by the new name would
+	 * never find it.
+	 */
+	zv->zv_hash = zvol_name_hash(newname);
+	hlist_del(&zv->zv_hlink);
+	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp = zsg->zsg_provider;
+		struct g_geom *gp;
+
+		g_topology_lock();
+		gp = pp->geom;
+		ASSERT(gp != NULL);
+
+		/* Retire the old provider, then publish one with the new name. */
+		zsg->zsg_provider = NULL;
+		g_wither_provider(pp, ENXIO);
+
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = zv->zv_volsize;
+		pp->private = zv;
+		zsg->zsg_provider = pp;
+		g_error_provider(pp, 0);
+		g_topology_unlock();
+	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev;
+		struct make_dev_args args;
+
+		dev = zsd->zsd_cdev;
+		if (dev != NULL) {
+			destroy_dev(dev);
+			dev = zsd->zsd_cdev = NULL;
+			if (zv->zv_open_count > 0) {
+				zv->zv_flags &= ~ZVOL_EXCL;
+				zv->zv_open_count = 0;
+				/* XXX need suspend lock but lock order */
+				zvol_last_close(zv);
+			}
+		}
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname)
+		    == 0) {
+			dev->si_iosize_max = MAXPHYS;
+			zsd->zsd_cdev = dev;
+		}
+	}
+	strlcpy(zv->zv_name, newname, sizeof (zv->zv_name));
+}
+
+/*
+ * Remove minor node for the specified volume.
+ */
+static void
+zvol_free(zvol_state_t *zv)
+{
+	ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock));
+	ASSERT(!MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(zv->zv_open_count == 0);
+
+	ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+	rw_destroy(&zv->zv_suspend_lock);
+	zfs_rangelock_fini(&zv->zv_rangelock);
+
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+
+		g_topology_lock();
+		zvol_geom_destroy(zv);
+		g_topology_unlock();
+		mtx_destroy(&zsg->zsg_queue_mtx);
+	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev = zsd->zsd_cdev;
+
+		if (dev != NULL)
+			destroy_dev(dev);
+	}
+
+	mutex_destroy(&zv->zv_state_lock);
+	kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+	kmem_free(zv, sizeof (zvol_state_t));
+	zvol_minors--;
+}
+
+/*
+ * Create a minor node (plus a whole lot more) for the specified volume.
+ */
+static int
+zvol_create_minor_impl(const char *name)
+{
+	zvol_state_t *zv;
+	objset_t *os;
+	dmu_object_info_t *doi;
+	uint64_t volsize;
+	uint64_t volmode, hash;
+	int error;
+
+	/*
+	 * Returns 0 on success, EEXIST if a zvol with this name is already
+	 * registered, or the error from owning/inspecting the objset or
+	 * from make_dev_s().  On failure nothing is inserted into the zvol
+	 * hash table and all temporary resources are released.
+	 */
+	ZFS_LOG(1, "Creating ZVOL %s...", name);
+
+	hash = zvol_name_hash(name);
+	if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) {
+		ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+		mutex_exit(&zv->zv_state_lock);
+		return (SET_ERROR(EEXIST));
+	}
+	/* zv is NULL from here until the "zvol_alloc equivalent" below. */
+
+	DROP_GIANT();
+	/* lie and say we're read-only */
+	error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os);
+	doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP);
+
+	if (error)
+		goto out_doi;
+
+	error = dmu_object_info(os, ZVOL_OBJ, doi);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		goto out_dmu_objset_disown;
+
+	error = dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL);
+	if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT)
+		volmode = zvol_volmode;
+	/*
+	 * A failed property lookup is not fatal (we fall back to the
+	 * default volmode), so clear it.  Otherwise the stale error would
+	 * reach the success checks below, and the zvol allocated next
+	 * would be neither registered nor freed.
+	 */
+	error = 0;
+
+	/*
+	 * zvol_alloc equivalent ...
+	 */
+	zv = kmem_zalloc(sizeof (*zv), KM_SLEEP);
+	zv->zv_hash = hash;
+	mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL);
+	zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP);
+	zv->zv_zso->zso_volmode = volmode;
+	if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom;
+		struct g_provider *pp;
+		struct g_geom *gp;
+
+		zsg->zsg_state = ZVOL_GEOM_UNINIT;
+		mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF);
+
+		/*
+		 * The topology lock stays held until after zvol_geom_run()
+		 * at the end of the function; nothing between here and
+		 * there sets error.
+		 */
+		g_topology_lock();
+		gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+		gp->start = zvol_geom_bio_start;
+		gp->access = zvol_geom_access;
+		pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+		/* TODO: NULL check? */
+		pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+		pp->sectorsize = DEV_BSIZE;
+		pp->mediasize = 0;
+		pp->private = zv;
+
+		zsg->zsg_provider = pp;
+		bioq_init(&zsg->zsg_queue);
+	} else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) {
+		struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev;
+		struct cdev *dev;
+		struct make_dev_args args;
+
+		make_dev_args_init(&args);
+		args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+		args.mda_devsw = &zvol_cdevsw;
+		args.mda_cr = NULL;
+		args.mda_uid = UID_ROOT;
+		args.mda_gid = GID_OPERATOR;
+		args.mda_mode = 0640;
+		args.mda_si_drv2 = zv;
+		error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name);
+		if (error != 0) {
+			mutex_destroy(&zv->zv_state_lock);
+			kmem_free(zv->zv_zso, sizeof (struct zvol_state_os));
+			kmem_free(zv, sizeof (*zv));
+			dmu_objset_disown(os, B_TRUE, FTAG);
+			/* zv and os are gone, but doi must still be freed. */
+			goto out_doi;
+		}
+		dev->si_iosize_max = MAXPHYS;
+		zsd->zsd_cdev = dev;
+	}
+	(void) strlcpy(zv->zv_name, name, sizeof (zv->zv_name));
+	rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL);
+	zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL);
+
+	if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
+		zv->zv_flags |= ZVOL_RDONLY;
+
+	zv->zv_volblocksize = doi->doi_data_block_size;
+	zv->zv_volsize = volsize;
+	zv->zv_objset = os;
+
+	if (spa_writeable(dmu_objset_spa(os))) {
+		if (zil_replay_disable)
+			zil_destroy(dmu_objset_zil(os), B_FALSE);
+		else
+			zil_replay(os, zv, zvol_replay_vector);
+	}
+
+	/* XXX do prefetch */
+
+	zv->zv_objset = NULL;
+out_dmu_objset_disown:
+	dmu_objset_disown(os, B_TRUE, FTAG);
+
+	/*
+	 * The early error exits jump here with zv still NULL, so zv may
+	 * only be dereferenced on the success path.  In GEOM mode the
+	 * topology lock is held exactly when we arrive here with
+	 * error == 0.
+	 */
+	if (error == 0 && zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) {
+		zvol_geom_run(zv);
+		g_topology_unlock();
+	}
+out_doi:
+	kmem_free(doi, sizeof (dmu_object_info_t));
+	if (error == 0) {
+		rw_enter(&zvol_state_lock, RW_WRITER);
+		zvol_insert(zv);
+		zvol_minors++;
+		rw_exit(&zvol_state_lock);
+		/* Only claim success when the zvol was really registered. */
+		ZFS_LOG(1, "ZVOL %s created.", name);
+	}
+	PICKUP_GIANT();
+	return (error);
+}
+
+static void
+zvol_clear_private(zvol_state_t *zv) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp == NULL) /* XXX when? */ + return; + + mtx_lock(&zsg->zsg_queue_mtx); + zsg->zsg_state = ZVOL_GEOM_STOPPED; + pp->private = NULL; + wakeup_one(&zsg->zsg_queue); + while (zsg->zsg_state != ZVOL_GEOM_RUNNING) + msleep(&zsg->zsg_state, + &zsg->zsg_queue_mtx, + 0, "zvol:w", 0); + mtx_unlock(&zsg->zsg_queue_mtx); + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + } +} + +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + zv->zv_volsize = volsize; + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp == NULL) /* XXX when? */ + return (0); + + g_topology_lock(); + + /* + * Do not invoke resize event when initial size was zero. + * ZVOL initializes the size on first open, this is not + * real resizing. + */ + if (pp->mediasize == 0) + pp->mediasize = zv->zv_volsize; + else + g_resize_provider(pp, zv->zv_volsize); + + g_topology_unlock(); + } + return (0); +} + +static void +zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) +{ + // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) +{ + // XXX? 
set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +const static zvol_platform_ops_t zvol_freebsd_ops = { + .zv_free = zvol_free, + .zv_rename_minor = zvol_rename_minor, + .zv_create_minor = zvol_create_minor_impl, + .zv_update_volsize = zvol_update_volsize, + .zv_clear_private = zvol_clear_private, + .zv_is_zvol = zvol_is_zvol_impl, + .zv_set_disk_ro = zvol_set_disk_ro_impl, + .zv_set_capacity = zvol_set_capacity_impl, +}; + +/* + * Public interfaces + */ + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +int +zvol_init(void) +{ + zvol_init_impl(); + zvol_register_ops(&zvol_freebsd_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); +} diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh index e1e114128e38..1f142455a9cd 100755 --- a/scripts/zfs-tests.sh +++ b/scripts/zfs-tests.sh @@ -221,7 +221,7 @@ create_links() { [ ! -e "$STF_PATH/$i" ] || continue if [ ! -d "$j/$i" ] && [ -e "$j/$i" ]; then - ln -s "$j/$i" "$STF_PATH/$i" || \ + ln -sf "$j/$i" "$STF_PATH/$i" || \ fail "Couldn't link $i" break fi diff --git a/tests/runfiles/Makefile.am b/tests/runfiles/Makefile.am index cc630a5e9e6c..c7cf2a20c1d5 100644 --- a/tests/runfiles/Makefile.am +++ b/tests/runfiles/Makefile.am @@ -1,6 +1,7 @@ pkgdatadir = $(datadir)/@PACKAGE@/runfiles dist_pkgdata_DATA = \ common.run \ + freebsd.run \ linux.run \ longevity.run \ perf-regression.run \ diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ccf03af89dcb..b75be778bdc2 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -654,6 +654,12 @@ tests = ['online_offline_001_pos', 'online_offline_002_neg', 'online_offline_003_neg'] tags = ['functional', 'online_offline'] +[tests/functional/persist_l2arc] +tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', + 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', + 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] +tags = ['functional', 'persist_l2arc'] + 
[tests/functional/pool_checkpoint] tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 61df6d4208ce..897a6a95582e 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -164,9 +164,3 @@ tags = ['functional', 'user_namespace'] tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', 'userquota_013_pos', 'userspace_003_pos'] tags = ['functional', 'userquota'] - -[tests/functional/persist_l2arc:Linux] -tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', - 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', - 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] -tags = ['functional', 'persist_l2arc'] diff --git a/tests/test-runner/bin/zts-report.py b/tests/test-runner/bin/zts-report.py index 7fc84fcecf95..d74aa9d7aef8 100755 --- a/tests/test-runner/bin/zts-report.py +++ b/tests/test-runner/bin/zts-report.py @@ -244,9 +244,10 @@ if sys.platform.startswith('freebsd'): maybe.update({ 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], - 'cli_root/zpool_import/zpool_import_missing_003_pos': - ['FAIL', known_reason], + 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], + 'removal/removal_condense_export': ['FAIL', known_reason], + 'removal/removal_with_export': ['FAIL', known_reason], 'resilver/resilver_restart_001': ['FAIL', known_reason], }) diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index a641a0b7a7e3..f917c58129af 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -4056,13 +4056,19 @@ function ls_xattr # path function get_arcstat # stat { 
- if is_linux; then - typeset stat=$1 + typeset stat=$1 + + case $(uname) in + FreeBSD) + sysctl -n kstat.zfs.misc.arcstats.$stat + ;; + Linux) typeset zfs_arcstats="/proc/spl/kstat/zfs/arcstats" [[ -f "$zfs_arcstats" ]] || return 1 grep $stat $zfs_arcstats | awk '{print $3}' - return $? - else - return 1 - fi + ;; + *) + false + ;; + esac } diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 680fcf42cb23..d7bd44b60350 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -36,8 +36,8 @@ INITIALIZE_CHUNK_SIZE initialize_chunk_size zfs_initialize_chunk_size INITIALIZE_VALUE initialize_value zfs_initialize_value KEEP_LOG_SPACEMAPS_AT_EXPORT keep_log_spacemaps_at_export zfs_keep_log_spacemaps_at_export L2ARC_NOPREFETCH l2arc.noprefetch l2arc_noprefetch -L2ARC_REBUILD_BLOCKS_MIN_L2SIZE UNSUPPORTED l2arc_rebuild_blocks_min_l2size -L2ARC_REBUILD_ENABLED UNSUPPORTED l2arc_rebuild_enabled +L2ARC_REBUILD_BLOCKS_MIN_L2SIZE l2arc.rebuild_blocks_min_l2size l2arc_rebuild_blocks_min_l2size +L2ARC_REBUILD_ENABLED l2arc.rebuild_enabled l2arc_rebuild_enabled L2ARC_WRITE_BOOST l2arc.write_boost l2arc_write_boost L2ARC_WRITE_MAX l2arc.write_max l2arc_write_max LIVELIST_CONDENSE_NEW_ALLOC livelist.condense.new_alloc zfs_livelist_condense_new_alloc diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh index 815d409aa1a9..6fa55250a77d 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh @@ -59,8 +59,8 @@ verify_runnable "global" -# See issue: https://github.com/zfsonlinux/zfs/issues/6839 -if is_linux; then +# See issue: https://github.com/openzfs/zfs/issues/6839 +if ! is_illumos; then log_unsupported "Test case may be slow" fi