From 7ecd19cfdfcbb625cc059dfa5b267d2436732c1c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:41 -0800 Subject: [PATCH 01/55] mm: percpu: generalize percpu related config Patch series "mm: percpu: Cleanup percpu first chunk function". When adding support for the page-mapping percpu first chunk allocator on arm64, we found a lot of duplicated code in the percpu embed/page first chunk allocators. This patchset aims to clean it up and should introduce no functional change. The current support status of 'embed' and 'page' across architectures is shown below, embed: NEED_PER_CPU_EMBED_FIRST_CHUNK page: NEED_PER_CPU_PAGE_FIRST_CHUNK embed page ------------------------ arm64 Y Y mips Y N powerpc Y Y riscv Y N sparc Y Y x86 Y Y ------------------------ There are two interfaces to the percpu first chunk allocator, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); The pcpu_fc_alloc_fn_t/pcpu_fc_free_fn_t callbacks are removed; generic pcpu_fc_alloc() and pcpu_fc_free() functions are provided instead, and are called from pcpu_embed/page_first_chunk(). 1) For pcpu_embed_first_chunk(), a pcpu_fc_cpu_to_node_fn_t must be provided when the arch supports NUMA. 2) For pcpu_page_first_chunk(), the pcpu_fc_populate_pte_fn_t is removed too; a generic pcpu_populate_pte() marked '__weak' is provided. If an arch (like x86) needs a different function to populate the pte, it should provide its own implementation.
[1] https://github.com/kevin78/linux.git percpu-cleanup This patch (of 4): The HAVE_SETUP_PER_CPU_AREA/NEED_PER_CPU_EMBED_FIRST_CHUNK/ NEED_PER_CPU_PAGE_FIRST_CHUNK/USE_PERCPU_NUMA_NODE_ID configs have duplicate definitions on the platforms that use them. Move them into mm, drop the redundant definitions and instead just select them on applicable platforms. Link: https://lkml.kernel.org/r/20211216112359.103822-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20211216112359.103822-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Catalin Marinas [arm64] Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Dennis Zhou Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 20 ++++---------------- arch/ia64/Kconfig | 9 ++------- arch/mips/Kconfig | 10 ++-------- arch/powerpc/Kconfig | 17 ++++------------- arch/riscv/Kconfig | 10 ++-------- arch/sparc/Kconfig | 12 +++--------- arch/x86/Kconfig | 17 ++++------------- mm/Kconfig | 12 ++++++++++++ 8 files changed, 33 insertions(+), 74 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index c4207cf9bb17ff..4ff73299f8a9fd 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1135,6 +1135,10 @@ config NUMA select GENERIC_ARCH_NUMA select ACPI_NUMA if ACPI select OF_NUMA + select HAVE_SETUP_PER_CPU_AREA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK + select USE_PERCPU_NUMA_NODE_ID help Enable NUMA (Non-Uniform Memory Access) support. @@ -1151,22 +1155,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables.
-config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - -config HAVE_SETUP_PER_CPU_AREA - def_bool y - depends on NUMA - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - depends on NUMA - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y - depends on NUMA - source "kernel/Kconfig.hz" config ARCH_SPARSEMEM_ENABLE diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 1e33666fa679be..703952819e10e0 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -32,6 +32,7 @@ config IA64 select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE if (!ITANIUM) select HAVE_FUNCTION_TRACER + select HAVE_SETUP_PER_CPU_AREA select TTY select HAVE_ARCH_TRACEHOOK select HAVE_VIRT_CPU_ACCOUNTING @@ -88,9 +89,6 @@ config GENERIC_CALIBRATE_DELAY bool default y -config HAVE_SETUP_PER_CPU_AREA - def_bool y - config DMI bool default y @@ -292,6 +290,7 @@ config NUMA bool "NUMA support" depends on !FLATMEM select SMP + select USE_PERCPU_NUMA_NODE_ID help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). This option is for configuring high-end multiprocessor @@ -311,10 +310,6 @@ config HAVE_ARCH_NODEDATA_EXTENSION def_bool y depends on NUMA -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - config HAVE_MEMORYLESS_NODES def_bool NUMA diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 0215dc1529e9ae..9e77659641a2b7 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2666,6 +2666,8 @@ config NUMA bool "NUMA Support" depends on SYS_SUPPORTS_NUMA select SMP + select HAVE_SETUP_PER_CPU_AREA + select NEED_PER_CPU_EMBED_FIRST_CHUNK help Say Y to compile the kernel to support NUMA (Non-Uniform Memory Access). 
This option improves performance on systems with more @@ -2676,14 +2678,6 @@ config NUMA config SYS_SUPPORTS_NUMA bool -config HAVE_SETUP_PER_CPU_AREA - def_bool y - depends on NUMA - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - depends on NUMA - config RELOCATABLE bool "Relocatable kernel" depends on SYS_SUPPORTS_RELOCATABLE diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index dea74d7717c0d4..8badd39854a0ad 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -55,15 +55,6 @@ config ARCH_MMAP_RND_COMPAT_BITS_MIN default 9 if PPC_16K_PAGES # 9 = 23 (8MB) - 14 (16K) default 11 # 11 = 23 (8MB) - 12 (4K) -config HAVE_SETUP_PER_CPU_AREA - def_bool PPC64 - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y if PPC64 - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y if PPC64 - config NR_IRQS int "Number of virtual interrupt numbers" range 32 1048576 @@ -240,6 +231,7 @@ config PPC select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE select HAVE_RSEQ + select HAVE_SETUP_PER_CPU_AREA if PPC64 select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if PPC32 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r2) select HAVE_STACKPROTECTOR if PPC64 && $(cc-option,-mstack-protector-guard=tls -mstack-protector-guard-reg=r13) @@ -254,6 +246,8 @@ config PPC select MMU_GATHER_RCU_TABLE_FREE select MODULES_USE_ELF_RELA select NEED_DMA_MAP_STATE if PPC64 || NOT_COHERENT_CACHE + select NEED_PER_CPU_EMBED_FIRST_CHUNK if PPC64 + select NEED_PER_CPU_PAGE_FIRST_CHUNK if PPC64 select NEED_SG_DMA_LENGTH select OF select OF_DMA_DEFAULT_COHERENT if !NOT_COHERENT_CACHE @@ -659,6 +653,7 @@ config NUMA bool "NUMA Memory Allocation and Scheduler Support" depends on PPC64 && SMP default y if PPC_PSERIES || PPC_POWERNV + select USE_PERCPU_NUMA_NODE_ID help Enable NUMA (Non-Uniform Memory Access) support. 
@@ -672,10 +667,6 @@ config NODES_SHIFT default "4" depends on NUMA -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - config HAVE_MEMORYLESS_NODES def_bool y depends on NUMA diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 821252b65f8906..bf66bcbc5a39e0 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -334,6 +334,8 @@ config NUMA select GENERIC_ARCH_NUMA select OF_NUMA select ARCH_SUPPORTS_NUMA_BALANCING + select USE_PERCPU_NUMA_NODE_ID + select NEED_PER_CPU_EMBED_FIRST_CHUNK help Enable NUMA (Non-Uniform Memory Access) support. @@ -349,14 +351,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - depends on NUMA - config RISCV_ISA_C bool "Emit compressed instructions when building Linux" default y diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 66fc08646be5e5..1cab1b284f1a87 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -97,6 +97,9 @@ config SPARC64 select PCI_DOMAINS if PCI select ARCH_HAS_GIGANTIC_PAGE select HAVE_SOFTIRQ_ON_OWN_STACK + select HAVE_SETUP_PER_CPU_AREA + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK config ARCH_PROC_KCORE_TEXT def_bool y @@ -123,15 +126,6 @@ config AUDIT_ARCH bool default y -config HAVE_SETUP_PER_CPU_AREA - def_bool y if SPARC64 - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y if SPARC64 - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y if SPARC64 - config MMU bool default y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5c2ccb85f2efb8..1275bab8be2cba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -239,6 +239,7 @@ config X86 select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION select HAVE_FUNCTION_ARG_ACCESS_API 
+ select HAVE_SETUP_PER_CPU_AREA select HAVE_SOFTIRQ_ON_OWN_STACK select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR select HAVE_STACK_VALIDATION if X86_64 @@ -252,6 +253,8 @@ config X86 select HAVE_GENERIC_VDSO select HOTPLUG_SMT if SMP select IRQ_FORCED_THREADING + select NEED_PER_CPU_EMBED_FIRST_CHUNK + select NEED_PER_CPU_PAGE_FIRST_CHUNK select NEED_SG_DMA_LENGTH select PCI_DOMAINS if PCI select PCI_LOCKLESS_CONFIG if PCI @@ -331,15 +334,6 @@ config ARCH_HAS_CPU_RELAX config ARCH_HAS_FILTER_PGPROT def_bool y -config HAVE_SETUP_PER_CPU_AREA - def_bool y - -config NEED_PER_CPU_EMBED_FIRST_CHUNK - def_bool y - -config NEED_PER_CPU_PAGE_FIRST_CHUNK - def_bool y - config ARCH_HIBERNATION_POSSIBLE def_bool y @@ -1557,6 +1551,7 @@ config NUMA depends on SMP depends on X86_64 || (X86_32 && HIGHMEM64G && X86_BIGSMP) default y if X86_BIGSMP + select USE_PERCPU_NUMA_NODE_ID help Enable NUMA (Non-Uniform Memory Access) support. @@ -2431,10 +2426,6 @@ config ARCH_HAS_ADD_PAGES config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE def_bool y -config USE_PERCPU_NUMA_NODE_ID - def_bool y - depends on NUMA - menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e5f..9b5de3f54158c6 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -432,6 +432,18 @@ config NEED_PER_CPU_KM bool default y +config NEED_PER_CPU_EMBED_FIRST_CHUNK + bool + +config NEED_PER_CPU_PAGE_FIRST_CHUNK + bool + +config USE_PERCPU_NUMA_NODE_ID + bool + +config HAVE_SETUP_PER_CPU_AREA + bool + config CLEANCACHE bool "Enable cleancache driver to cache clean pages if tmem is present" help From 1ca3fb3abd2b615c4b61728de545760a6e2c2d8b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:45 -0800 Subject: [PATCH 02/55] mm: percpu: add pcpu_fc_cpu_to_node_fn_t typedef Add pcpu_fc_cpu_to_node_fn_t and pass it into pcpu_fc_alloc_fn_t, pcpu first chunk allocation will call it to alloc memblock on the corresponding node by it, this is prepare 
for the next patch. Link: https://lkml.kernel.org/r/20211216112359.103822-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 12 +++++++++--- arch/powerpc/kernel/setup_64.c | 15 +++++++++++---- arch/sparc/kernel/smp_64.c | 13 ++++++++++--- arch/x86/kernel/setup_percpu.c | 18 +++++++++++++----- drivers/base/arch_numa.c | 8 +++++--- include/linux/percpu.h | 7 +++++-- mm/percpu.c | 14 +++++++++----- 7 files changed, 62 insertions(+), 25 deletions(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 325e1552cbeada..1d8f2844704c30 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -519,12 +519,17 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(cpu_to_node(from), cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static int __init pcpu_cpu_to_node(int cpu) +{ + return cpu_to_node(cpu); +} + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, - cpu_to_node(cpu)); + cpu_to_nd_fn(cpu)); } static void __init pcpu_fc_free(void *ptr, size_t size) @@ -545,6 +550,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) panic("Failed to initialize percpu areas."); diff --git 
a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6052f5d5ded343..b79b10ae466f96 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -784,12 +784,12 @@ void __init emergency_stack_init(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); + int node = cpu_to_nd_fun(cpu); void *ptr; if (!node_online(node) || !NODE_DATA(node)) { @@ -823,6 +823,11 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } +static __init int pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); @@ -891,6 +896,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_PAGE) { rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_alloc_bootmem, pcpu_free_bootmem); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " @@ -899,7 +905,8 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) - rc = pcpu_page_first_chunk(0, pcpu_alloc_bootmem, pcpu_free_bootmem, + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, + pcpu_alloc_bootmem, pcpu_free_bootmem, pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index b98a7bbe6728a0..14d719aa318da9 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1539,12 +1539,12 @@ void smp_send_stop(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. 
*/ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA - int node = cpu_to_node(cpu); + int node = cpu_to_nd_fn(cpu); void *ptr; if (!node_online(node) || !NODE_DATA(node)) { @@ -1578,6 +1578,11 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return REMOTE_DISTANCE; } +static int __init pcpu_cpu_to_node(int cpu) +{ + return cpu_to_node(cpu); +} + static void __init pcpu_populate_pte(unsigned long addr) { pgd_t *pgd = pgd_offset_k(addr); @@ -1641,6 +1646,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, 4 << 20, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_alloc_bootmem, pcpu_free_bootmem); if (rc) @@ -1650,6 +1656,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + pcpu_cpu_to_node, pcpu_alloc_bootmem, pcpu_free_bootmem, pcpu_populate_pte); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7b65275544b2c3..1d41f48441492a 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -97,12 +97,12 @@ static bool __init pcpu_need_numa(void) * RETURNS: * Pointer to the allocated area on success, NULL on failure. 
*/ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, - unsigned long align) +static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { const unsigned long goal = __pa(MAX_DMA_ADDRESS); #ifdef CONFIG_NUMA - int node = early_cpu_to_node(cpu); + int node = cpu_to_nd_fn(cpu); void *ptr; if (!node_online(node) || !NODE_DATA(node)) { @@ -128,9 +128,10 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, /* * Helpers for first chunk memory allocation */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { - return pcpu_alloc_bootmem(cpu, size, align); + return pcpu_alloc_bootmem(cpu, size, align, cpu_to_nd_fn); } static void __init pcpu_fc_free(void *ptr, size_t size) @@ -150,6 +151,11 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) #endif } +static int __init pcpu_cpu_to_node(int cpu) +{ + return early_cpu_to_node(cpu); +} + static void __init pcpup_populate_pte(unsigned long addr) { populate_extra_pte(addr); @@ -205,6 +211,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, + pcpu_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", @@ -212,6 +219,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, + pcpu_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); if (rc < 0) diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index bc1876915457d4..dae8618385358e 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -155,10 +155,10 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return 
node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { - int nid = early_cpu_to_node(cpu); + int nid = cpu_to_nd_fn(cpu); return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); @@ -229,6 +229,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, + early_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) @@ -240,6 +241,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, + early_cpu_to_node, pcpu_fc_alloc, pcpu_fc_free, pcpu_populate_pte); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index ae4004e7957e18..e4078bf45fd52d 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -94,8 +94,9 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, - size_t align); +typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); +typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); @@ -111,12 +112,14 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t 
free_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); diff --git a/mm/percpu.c b/mm/percpu.c index f5b2c2ea5a548a..267a4d295fcf0d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3001,6 +3001,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional + * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * @alloc_fn: function to allocate percpu page * @free_fn: function to free percpu page * @@ -3030,6 +3031,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn) { @@ -3066,7 +3068,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ - ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; @@ -3143,6 +3145,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes + * @cpu_to_nd_fn: callback to convert cpu to it's node, optional * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte @@ -3157,6 +3160,7 @@ int __init 
pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, * 0 on success, -errno on failure. */ int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) @@ -3201,7 +3205,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); @@ -3278,8 +3282,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, - size_t align) +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); } @@ -3300,7 +3304,7 @@ void __init setup_per_cpu_areas(void) * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialize percpu areas."); From 23f917169ef157aa7a6bf80d8c4aad6f1282852c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:49 -0800 Subject: [PATCH 03/55] mm: percpu: add generic pcpu_fc_alloc/free function With the previous patch in place, we can add generic pcpu first chunk allocate and free functions to clean up the duplicated definitions in each architecture. Link: https://lkml.kernel.org/r/20211216112359.103822-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S.
Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: "Rafael J. Wysocki" Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/mm/init.c | 16 +------ arch/powerpc/kernel/setup_64.c | 51 +--------------------- arch/sparc/kernel/smp_64.c | 50 +--------------------- arch/x86/kernel/setup_percpu.c | 59 +------------------------ drivers/base/arch_numa.c | 19 +-------- include/linux/percpu.h | 9 +--- mm/percpu.c | 78 ++++++++++++++++++++-------------- 7 files changed, 54 insertions(+), 228 deletions(-) diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 1d8f2844704c30..5a8002839550e1 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -524,19 +524,6 @@ static int __init pcpu_cpu_to_node(int cpu) return cpu_to_node(cpu); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - return memblock_alloc_try_nid(size, align, __pa(MAX_DMA_ADDRESS), - MEMBLOCK_ALLOC_ACCESSIBLE, - cpu_to_nd_fn(cpu)); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -550,8 +537,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) panic("Failed to initialize percpu areas."); diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index b79b10ae466f96..a0c55c6e3023d5 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -771,50 +771,6 @@ void __init emergency_stack_init(void) } #ifdef CONFIG_SMP -/** - * pcpu_alloc_bootmem - NUMA friendly 
alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_nd_fun(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -static void __init pcpu_free_bootmem(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int pcpu_cpu_distance(unsigned int from, unsigned int to) { if (early_cpu_to_node(from) == early_cpu_to_node(to)) @@ -896,8 +852,7 @@ void __init setup_per_cpu_areas(void) if (pcpu_chosen_fc != PCPU_FC_PAGE) { rc = pcpu_embed_first_chunk(0, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_alloc_bootmem, pcpu_free_bootmem); + pcpu_cpu_to_node); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " "falling back to page size\n", @@ -905,9 +860,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) - rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, - pcpu_alloc_bootmem, pcpu_free_bootmem, - pcpu_populate_pte); + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu 
area (err=%d)", rc); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index 14d719aa318da9..ef815b3f0592c2 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1526,50 +1526,6 @@ void smp_send_stop(void) smp_call_function(stop_this_cpu, NULL, 0); } -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_nd_fn(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, node); - pr_debug("per cpu data for cpu%d %lu bytes on node%d at " - "%016lx\n", cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -static void __init pcpu_free_bootmem(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { if (cpu_to_node(from) == cpu_to_node(to)) @@ -1646,9 +1602,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, 4 << 20, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_alloc_bootmem, - pcpu_free_bootmem); + pcpu_cpu_to_node); if (rc) pr_warn("PERCPU: %s allocator failed (%d), " 
"falling back to page size\n", @@ -1657,8 +1611,6 @@ void __init setup_per_cpu_areas(void) if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, pcpu_cpu_to_node, - pcpu_alloc_bootmem, - pcpu_free_bootmem, pcpu_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1d41f48441492a..15c5bf3cbe5fe8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -84,61 +84,6 @@ static bool __init pcpu_need_numa(void) } #endif -/** - * pcpu_alloc_bootmem - NUMA friendly alloc_bootmem wrapper for percpu - * @cpu: cpu to allocate for - * @size: size allocation in bytes - * @align: alignment - * - * Allocate @size bytes aligned at @align for cpu @cpu. This wrapper - * does the right thing for NUMA regardless of the current - * configuration. - * - * RETURNS: - * Pointer to the allocated area on success, NULL on failure. - */ -static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size, unsigned long align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - const unsigned long goal = __pa(MAX_DMA_ADDRESS); -#ifdef CONFIG_NUMA - int node = cpu_to_nd_fn(cpu); - void *ptr; - - if (!node_online(node) || !NODE_DATA(node)) { - ptr = memblock_alloc_from(size, align, goal); - pr_info("cpu %d has no node %d or node-local memory\n", - cpu, node); - pr_debug("per cpu data for cpu%d %lu bytes at %016lx\n", - cpu, size, __pa(ptr)); - } else { - ptr = memblock_alloc_try_nid(size, align, goal, - MEMBLOCK_ALLOC_ACCESSIBLE, - node); - - pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n", - cpu, size, node, __pa(ptr)); - } - return ptr; -#else - return memblock_alloc_from(size, align, goal); -#endif -} - -/* - * Helpers for first chunk memory allocation - */ -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - return pcpu_alloc_bootmem(cpu, size, align, 
cpu_to_nd_fn); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) { #ifdef CONFIG_NUMA @@ -211,8 +156,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, dyn_size, atom_size, pcpu_cpu_distance, - pcpu_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free); + pcpu_cpu_to_node); if (rc < 0) pr_warn("%s allocator failed (%d), falling back to page size\n", pcpu_fc_names[pcpu_chosen_fc], rc); @@ -220,7 +164,6 @@ void __init setup_per_cpu_areas(void) if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, pcpu_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free, pcpup_populate_pte); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index dae8618385358e..23a10cc3616526 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -155,20 +155,6 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } -static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - int nid = cpu_to_nd_fn(cpu); - - return memblock_alloc_try_nid(size, align, - __pa(MAX_DMA_ADDRESS), MEMBLOCK_ALLOC_ACCESSIBLE, nid); -} - -static void __init pcpu_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK static void __init pcpu_populate_pte(unsigned long addr) { @@ -229,8 +215,7 @@ void __init setup_per_cpu_areas(void) rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, pcpu_cpu_distance, - early_cpu_to_node, - pcpu_fc_alloc, pcpu_fc_free); + early_cpu_to_node); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) pr_warn("PERCPU: %s allocator failed (%d), falling back to page size\n", @@ -242,8 +227,6 @@ void 
__init setup_per_cpu_areas(void) if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node, - pcpu_fc_alloc, - pcpu_fc_free, pcpu_populate_pte); #endif if (rc < 0) diff --git a/include/linux/percpu.h b/include/linux/percpu.h index e4078bf45fd52d..d73c97ef4ff40a 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -95,9 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); -typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); -typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size); typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); @@ -112,16 +109,12 @@ extern void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK extern int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn); #endif diff --git a/mm/percpu.c b/mm/percpu.c index 267a4d295fcf0d..0f79b6d9a6d699 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2992,6 +2992,42 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( return ai; } + +static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align, + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) +{ + const unsigned long goal = __pa(MAX_DMA_ADDRESS); +#ifdef CONFIG_NUMA + int node = NUMA_NO_NODE; + void *ptr; + + if (cpu_to_nd_fn) + node = cpu_to_nd_fn(cpu); + + if 
(node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) { + ptr = memblock_alloc_from(size, align, goal); + pr_info("cpu %d has no node %d or node-local memory\n", + cpu, node); + pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n", + cpu, size, (u64)__pa(ptr)); + } else { + ptr = memblock_alloc_try_nid(size, align, goal, + MEMBLOCK_ALLOC_ACCESSIBLE, + node); + + pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n", + cpu, size, node, (u64)__pa(ptr)); + } + return ptr; +#else + return memblock_alloc_from(size, align, goal); +#endif +} + +static void __init pcpu_fc_free(void *ptr, size_t size) +{ + memblock_free(ptr, size); +} #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) @@ -3002,14 +3038,12 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @cpu_to_nd_fn: callback to convert cpu to it's node, optional - * @alloc_fn: function to allocate percpu page - * @free_fn: function to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * by calling @alloc_fn and used as-is without being mapped into + * by calling pcpu_fc_alloc and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * @@ -3023,7 +3057,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned using @free_fn. + * size, the leftover is returned using pcpu_fc_free. * * RETURNS: * 0 on success, -errno on failure. 
@@ -3031,9 +3065,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info( int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn) + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; @@ -3068,7 +3100,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ - ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); + ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn); if (!ptr) { rc = -ENOMEM; goto out_free_areas; @@ -3107,12 +3139,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ - free_fn(ptr, ai->unit_size); + pcpu_fc_free(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); - free_fn(ptr + size_sum, ai->unit_size - size_sum); + pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum); } } @@ -3131,7 +3163,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) - free_fn(areas[group], + pcpu_fc_free(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); @@ -3146,8 +3178,6 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional - * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE - * @free_fn: function to free 
percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu @@ -3161,8 +3191,6 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, */ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_alloc_fn_t alloc_fn, - pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; @@ -3205,7 +3233,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, for (i = 0; i < unit_pages; i++) { void *ptr; - ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); + ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); @@ -3257,7 +3285,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, enomem: while (--j >= 0) - free_fn(page_address(pages[j]), PAGE_SIZE); + pcpu_fc_free(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free(pages, pages_size); @@ -3282,17 +3310,6 @@ int __init pcpu_page_first_chunk(size_t reserved_size, unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) -{ - return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); -} - -static void __init pcpu_dfl_fc_free(void *ptr, size_t size) -{ - memblock_free(ptr, size); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -3303,9 +3320,8 @@ void __init setup_per_cpu_areas(void) * Always reserve area for module percpu variables. That's * what the legacy allocator did. 
*/ - rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, - PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, NULL, - pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, + PAGE_SIZE, NULL, NULL); if (rc < 0) panic("Failed to initialize percpu areas."); From 20c035764626c56c4f6514936b9ee4be0f4cd962 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 19 Jan 2022 18:07:53 -0800 Subject: [PATCH 04/55] mm: percpu: add generic pcpu_populate_pte() function With NEED_PER_CPU_PAGE_FIRST_CHUNK enabled, we need a function to populate pte, this patch adds a generic pcpu populate pte function, pcpu_populate_pte(), which is marked __weak and used on most architectures, but it is overridden on x86, which has its own implementation. Link: https://lkml.kernel.org/r/20211216112359.103822-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Greg Kroah-Hartman Cc: "Rafael J. 
Wysocki" Cc: Dennis Zhou Cc: Tejun Heo Cc: Christoph Lameter Cc: Albert Ou Cc: Catalin Marinas Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Thomas Bogendoerfer Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/setup_64.c | 47 +-------------------- arch/sparc/kernel/smp_64.c | 56 +------------------------ arch/x86/kernel/setup_percpu.c | 5 +-- drivers/base/arch_numa.c | 51 +---------------------- include/linux/percpu.h | 5 +-- mm/percpu.c | 76 +++++++++++++++++++++++++++++++--- 6 files changed, 78 insertions(+), 162 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a0c55c6e3023d5..f7cf408217c563 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -787,51 +787,6 @@ static __init int pcpu_cpu_to_node(int cpu) unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); -static void __init pcpu_populate_pte(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); -} - - void __init setup_per_cpu_areas(void) { const size_t dyn_size = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE; @@ -860,7 +815,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 
0) - rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node, pcpu_populate_pte); + rc = pcpu_page_first_chunk(0, pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/sparc/kernel/smp_64.c b/arch/sparc/kernel/smp_64.c index ef815b3f0592c2..a1f78e9ddaf371 100644 --- a/arch/sparc/kernel/smp_64.c +++ b/arch/sparc/kernel/smp_64.c @@ -1539,59 +1539,6 @@ static int __init pcpu_cpu_to_node(int cpu) return cpu_to_node(cpu); } -static void __init pcpu_populate_pte(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - if (pgd_none(*pgd)) { - pud_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pgd_populate(&init_mm, pgd, new); - } - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc_from(PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); -} - void __init setup_per_cpu_areas(void) { unsigned long delta; @@ -1610,8 +1557,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, - pcpu_cpu_to_node, - pcpu_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 15c5bf3cbe5fe8..49325caa7307df 100644 --- 
a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -101,7 +101,7 @@ static int __init pcpu_cpu_to_node(int cpu) return early_cpu_to_node(cpu); } -static void __init pcpup_populate_pte(unsigned long addr) +void __init pcpu_populate_pte(unsigned long addr) { populate_extra_pte(addr); } @@ -163,8 +163,7 @@ void __init setup_per_cpu_areas(void) } if (rc < 0) rc = pcpu_page_first_chunk(PERCPU_FIRST_CHUNK_RESERVE, - pcpu_cpu_to_node, - pcpup_populate_pte); + pcpu_cpu_to_node); if (rc < 0) panic("cannot initialize percpu area (err=%d)", rc); diff --git a/drivers/base/arch_numa.c b/drivers/base/arch_numa.c index 23a10cc3616526..eaa31e567d1ece 100644 --- a/drivers/base/arch_numa.c +++ b/drivers/base/arch_numa.c @@ -14,7 +14,6 @@ #include #include -#include struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); @@ -155,52 +154,6 @@ static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) return node_distance(early_cpu_to_node(from), early_cpu_to_node(to)); } -#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK -static void __init pcpu_populate_pte(unsigned long addr) -{ - pgd_t *pgd = pgd_offset_k(addr); - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; - - p4d = p4d_offset(pgd, addr); - if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - p4d_populate(&init_mm, p4d, new); - } - - pud = pud_offset(p4d, addr); - if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pud_populate(&init_mm, pud, new); - } - - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - pte_t *new; - - new = memblock_alloc(PAGE_SIZE, PAGE_SIZE); - if (!new) - goto err_alloc; - pmd_populate_kernel(&init_mm, pmd, new); - } - - return; - -err_alloc: - panic("%s: Failed to allocate %lu bytes align=%lx from=%lx\n", - __func__, PAGE_SIZE, PAGE_SIZE, PAGE_SIZE); -} -#endif - void __init setup_per_cpu_areas(void) { unsigned long 
delta; @@ -225,9 +178,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK if (rc < 0) - rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, - early_cpu_to_node, - pcpu_populate_pte); + rc = pcpu_page_first_chunk(PERCPU_MODULE_RESERVE, early_cpu_to_node); #endif if (rc < 0) panic("Failed to initialize percpu areas (err=%d).", rc); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index d73c97ef4ff40a..f1ec5ad1351cc1 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -95,7 +95,6 @@ extern const char * const pcpu_fc_names[PCPU_FC_NR]; extern enum pcpu_fc pcpu_chosen_fc; typedef int (pcpu_fc_cpu_to_node_fn_t)(int cpu); -typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr); typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to); extern struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, @@ -113,9 +112,9 @@ extern int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK +void __init pcpu_populate_pte(unsigned long addr); extern int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn); + pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn); #endif extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __alloc_size(1); diff --git a/mm/percpu.c b/mm/percpu.c index 0f79b6d9a6d699..fc6f591cb54f6c 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -3174,11 +3174,79 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK +#include + +#ifndef P4D_TABLE_SIZE +#define P4D_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PUD_TABLE_SIZE +#define PUD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PMD_TABLE_SIZE +#define PMD_TABLE_SIZE PAGE_SIZE +#endif + +#ifndef PTE_TABLE_SIZE +#define PTE_TABLE_SIZE PAGE_SIZE +#endif +void __init __weak 
pcpu_populate_pte(unsigned long addr) +{ + pgd_t *pgd = pgd_offset_k(addr); + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + if (pgd_none(*pgd)) { + p4d_t *new; + + new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); + if (!new) + goto err_alloc; + pgd_populate(&init_mm, pgd, new); + } + + p4d = p4d_offset(pgd, addr); + if (p4d_none(*p4d)) { + pud_t *new; + + new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!new) + goto err_alloc; + p4d_populate(&init_mm, p4d, new); + } + + pud = pud_offset(p4d, addr); + if (pud_none(*pud)) { + pmd_t *new; + + new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!new) + goto err_alloc; + pud_populate(&init_mm, pud, new); + } + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) { + pte_t *new; + + new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE); + if (!new) + goto err_alloc; + pmd_populate_kernel(&init_mm, pmd, new); + } + + return; + +err_alloc: + panic("%s: Failed to allocate memory\n", __func__); +} + /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @cpu_to_nd_fn: callback to convert cpu to it's node, optional - * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. @@ -3189,9 +3257,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, * RETURNS: * 0 on success, -errno on failure. 
*/ -int __init pcpu_page_first_chunk(size_t reserved_size, - pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn, - pcpu_fc_populate_pte_fn_t populate_pte_fn) +int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; @@ -3255,7 +3321,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) - populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], From 25bc5b0de91bc5e7afa65f1face0087fb9e331c7 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 19 Jan 2022 18:07:57 -0800 Subject: [PATCH 05/55] proc/vmcore: don't fake reading zeroes on surprise vmcore_cb unregistration In commit cc5f2704c934 ("proc/vmcore: convert oldmem_pfn_is_ram callback to more generic vmcore callbacks"), we added detection of surprise vmcore_cb unregistration after the vmcore was already opened. Once detected, we warn the user and simulate reading zeroes from that point on when accessing the vmcore. The basic reason was that unexpected unregistration, for example, by manually unbinding a driver from a device after opening the vmcore, is not supported and could result in reading oldmem the vmcore_cb would have actually prohibited while registered. However, something like that can similarly be triggered by a user that's really looking for trouble simply by unbinding the relevant driver before opening the vmcore -- or by disallowing loading the driver in the first place. So it's actually of limited help.
Currently, unregistration can only be triggered via virtio-mem when manually unbinding the driver from the device inside the VM; there is no way to trigger it from the hypervisor, as hypervisors don't allow for unplugging virtio-mem devices -- ripping out system RAM from a VM without coordination with the guest is usually not a good idea. The important part is that unbinding the driver and unregistering the vmcore_cb while concurrently reading the vmcore won't crash the system, and that is handled by the rwsem. To make the mechanism more future proof, let's remove the "read zero" part, but leave the warning in place. For example, we could have a future driver (like virtio-balloon) that will contact the hypervisor to figure out if we already populated a page for a given PFN. Hotunplugging such a device and consequently unregistering the vmcore_cb could be triggered from the hypervisor without harming the system even while kdump is running. In that case, we don't want to silently end up with a vmcore that contains wrong data, because the user inside the VM might be unaware of the hypervisor action and might easily miss the warning in the log. Link: https://lkml.kernel.org/r/20211111192243.22002-1-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Baoquan He Cc: Dave Young Cc: Vivek Goyal Cc: Philipp Rudo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/vmcore.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 509f85148fee82..702754dd1daffb 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -65,8 +65,6 @@ static size_t vmcoredd_orig_sz; static DECLARE_RWSEM(vmcore_cb_rwsem); /* List of registered vmcore callbacks. */ static LIST_HEAD(vmcore_cb_list); -/* Whether we had a surprise unregistration of a callback. */ -static bool vmcore_cb_unstable; /* Whether the vmcore has been opened once. 
*/ static bool vmcore_opened; @@ -94,10 +92,8 @@ void unregister_vmcore_cb(struct vmcore_cb *cb) * very unusual (e.g., forced driver removal), but we cannot stop * unregistering. */ - if (vmcore_opened) { + if (vmcore_opened) pr_warn_once("Unexpected vmcore callback unregistration\n"); - vmcore_cb_unstable = true; - } up_write(&vmcore_cb_rwsem); } EXPORT_SYMBOL_GPL(unregister_vmcore_cb); @@ -108,8 +104,6 @@ static bool pfn_is_ram(unsigned long pfn) bool ret = true; lockdep_assert_held_read(&vmcore_cb_rwsem); - if (unlikely(vmcore_cb_unstable)) - return false; list_for_each_entry(cb, &vmcore_cb_list, next) { if (unlikely(!cb->pfn_is_ram)) @@ -581,7 +575,7 @@ static int vmcore_remap_oldmem_pfn(struct vm_area_struct *vma, * looping over all pages without a reason. */ down_read(&vmcore_cb_rwsem); - if (!list_empty(&vmcore_cb_list) || vmcore_cb_unstable) + if (!list_empty(&vmcore_cb_list)) ret = remap_oldmem_pfn_checked(vma, from, pfn, size, prot); else ret = remap_oldmem_pfn_range(vma, from, pfn, size, prot); From ae62fbe299629d3b2fa61d4cf5146258c4d99fdf Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 19 Jan 2022 18:08:00 -0800 Subject: [PATCH 06/55] proc: make the proc_create[_data]() stubs static inlines Change the proc_create[_data]() stubs which are used when CONFIG_PROC_FS is not set from #defines to a static inline stubs. This should fix clang -Werror builds failing due to errors like this: drivers/platform/x86/thinkpad_acpi.c:918:30: error: unused variable 'dispatch_proc_ops' [-Werror,-Wunused-const-variable] Fixing this in include/linux/proc_fs.h should ensure that the same issue is also fixed in any other drivers hitting the same -Werror issue. 
[akpm@linux-foundation.org: fix CONFIG_PROC_FS=n] [akpm@linux-foundation.org: fix arch/sparc/kernel/led.c] [akpm@linux-foundation.org: fix build] Link: https://lkml.kernel.org/r/20211116131112.508304-1-hdegoede@redhat.com Signed-off-by: Hans de Goede Reported-by: kernel test robot Acked-by: Christian Brauner Cc: Alexander Viro Cc: Hans de Goede Cc: David Howells Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/kernel/led.c | 8 +++----- include/linux/proc_fs.h | 12 ++++++++++-- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/arch/sparc/kernel/led.c b/arch/sparc/kernel/led.c index 3a66e62eb2a0e6..ab657b359789ea 100644 --- a/arch/sparc/kernel/led.c +++ b/arch/sparc/kernel/led.c @@ -114,18 +114,16 @@ static const struct proc_ops led_proc_ops = { }; #endif -static struct proc_dir_entry *led; - #define LED_VERSION "0.1" static int __init led_init(void) { timer_setup(&led_blink_timer, led_blink, 0); - led = proc_create("led", 0, NULL, &led_proc_ops); - if (!led) +#ifdef CONFIG_PROC_FS + if (!proc_create("led", 0, NULL, &led_proc_ops)) return -ENOMEM; - +#endif printk(KERN_INFO "led: version %s, Lars Kotthoff \n", LED_VERSION); diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 069c7fd953961a..01b9268451a8ec 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -178,8 +178,16 @@ static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, #define proc_create_seq(name, mode, parent, ops) ({NULL;}) #define proc_create_single(name, mode, parent, show) ({NULL;}) #define proc_create_single_data(name, mode, parent, show, data) ({NULL;}) -#define proc_create(name, mode, parent, proc_ops) ({NULL;}) -#define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;}) + +static inline struct proc_dir_entry * +proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, + const struct proc_ops *proc_ops) +{ return NULL; } + +static inline struct proc_dir_entry * 
+proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, + const struct proc_ops *proc_ops, void *data) +{ return NULL; } static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} From 51a18734402874382ccfab288342c72d7227e122 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 19 Jan 2022 18:08:03 -0800 Subject: [PATCH 07/55] proc: convert the return type of proc_fd_access_allowed() to be boolean Convert return type of proc_fd_access_allowed() and the 'allowed' in it to be boolean since the return type of ptrace_may_access() is boolean. Link: https://lkml.kernel.org/r/20211219024404.29779-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Kees Cook Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 13eda8de299819..d654ce7150fddb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -670,10 +670,10 @@ static int proc_pid_syscall(struct seq_file *m, struct pid_namespace *ns, /************************************************************************/ /* permission checks */ -static int proc_fd_access_allowed(struct inode *inode) +static bool proc_fd_access_allowed(struct inode *inode) { struct task_struct *task; - int allowed = 0; + bool allowed = false; /* Allow access to a task's file descriptors if it is us or we * may use ptrace attach to the process and find out that * information. 
From 153ee1c41a3ec707438ae0ca6b0061f72de334ef Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 19 Jan 2022 18:08:06 -0800 Subject: [PATCH 08/55] sysctl: fix duplicate path separator in printed entries sysctl_print_dir() always terminates the printed path name with a slash, so printing a slash before the file part causes a duplicate like in sysctl duplicate entry: /kernel//perf_user_access Fix this by dropping the extra slash. Link: https://lkml.kernel.org/r/e3054d605dc56f83971e4b6d2f5fa63a978720ad.1641551872.git.geert+renesas@glider.be Signed-off-by: Geert Uytterhoeven Acked-by: Christian Brauner Acked-by: Luis Chamberlain Cc: Iurii Zaikin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 5d66faecd4ef06..4f6168ec5079fc 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -163,7 +163,7 @@ static int insert_entry(struct ctl_table_header *head, struct ctl_table *entry) else { pr_err("sysctl duplicate entry: "); sysctl_print_dir(head->parent); - pr_cont("/%s\n", entry->procname); + pr_cont("%s\n", entry->procname); return -EEXIST; } } @@ -1020,8 +1020,8 @@ static struct ctl_dir *get_subdir(struct ctl_dir *dir, if (IS_ERR(subdir)) { pr_err("sysctl could not get directory: "); sysctl_print_dir(dir); - pr_cont("/%*.*s %ld\n", - namelen, namelen, name, PTR_ERR(subdir)); + pr_cont("%*.*s %ld\n", namelen, namelen, name, + PTR_ERR(subdir)); } drop_sysctl_table(&dir->header); if (new) @@ -1626,7 +1626,7 @@ static void put_links(struct ctl_table_header *header) else { pr_err("sysctl link missing during unregister: "); sysctl_print_dir(parent); - pr_cont("/%s\n", name); + pr_cont("%s\n", name); } } } From 7080cead5d45b79ec0c86fa285cf9b6abc413ed8 Mon Sep 17 00:00:00 2001 From: luo penghao Date: Wed, 19 Jan 2022 18:08:09 -0800 Subject: [PATCH 09/55] sysctl: remove redundant ret assignment 
Subsequent if statements will assign new values to ret, so the statement here should be deleted. The clang_analyzer complains as follows: fs/proc/proc_sysctl.c: Value stored to 'ret' is never read Link: https://lkml.kernel.org/r/20211230063622.586360-1-luo.penghao@zte.com.cn Signed-off-by: luo penghao Reported-by: Zeal Robot Acked-by: Luis Chamberlain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/proc_sysctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 4f6168ec5079fc..389e1e42e7d9a0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -1053,7 +1053,6 @@ static int sysctl_follow_link(struct ctl_table_header **phead, struct ctl_dir *dir; int ret; - ret = 0; spin_lock(&sysctl_lock); root = (*pentry)->data; set = lookup_header_set(root); From 22c033989c3eb9731ad0c497dfab4231b8e367d6 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:08:12 -0800 Subject: [PATCH 10/55] include/linux/unaligned: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot to dependency hell, especially when circular dependencies are involved. Replace kernel.h inclusion with the list of what is really being used. The rest of the changes are induced by the above and may not be split. Link: https://lkml.kernel.org/r/20211209123823.20425-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Arend van Spriel [brcmfmac] Acked-by: Kalle Valo Cc: Arend van Spriel Cc: Franky Lin Cc: Hante Meuleman Cc: Chi-hsien Lin Cc: Wright Feng Cc: Chung-hsien Hsu Cc: Kalle Valo Cc: David S.
Miller Cc: Jakub Kicinski Cc: Heikki Krogerus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/wireless/broadcom/brcm80211/brcmfmac/xtlv.c | 2 ++ include/linux/unaligned/packed_struct.h | 2 +- lib/lz4/lz4defs.h | 2 ++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/xtlv.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/xtlv.c index 2f3c451148db71..2f890807430379 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/xtlv.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/xtlv.c @@ -4,6 +4,8 @@ */ #include + +#include #include #include diff --git a/include/linux/unaligned/packed_struct.h b/include/linux/unaligned/packed_struct.h index c0d817de4df24e..f4c8eaf4d01293 100644 --- a/include/linux/unaligned/packed_struct.h +++ b/include/linux/unaligned/packed_struct.h @@ -1,7 +1,7 @@ #ifndef _LINUX_UNALIGNED_PACKED_STRUCT_H #define _LINUX_UNALIGNED_PACKED_STRUCT_H -#include +#include struct __una_u16 { u16 x; } __packed; struct __una_u32 { u32 x; } __packed; diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index 673bd206aa98b6..330aa539b46e64 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -36,6 +36,8 @@ */ #include + +#include #include /* memset, memcpy */ #define FORCE_INLINE __always_inline From 40cbf09f060c8febef64541c463d4dd526abe445 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:08:16 -0800 Subject: [PATCH 11/55] kernel.h: include a note to discourage people from including it in headers Include a note at the top to discourage people from including it in headers. 
Link: https://lkml.kernel.org/r/20211209150803.4473-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 77755ac3e189bf..36a612d8295656 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -1,4 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ +/* + * NOTE: + * + * This header has combined a lot of unrelated to each other stuff. + * The process of splitting its content is in progress while keeping + * backward compatibility. That's why it's highly recommended NOT to + * include this header inside another header file, especially under + * generic or architectural include/ directory. + */ #ifndef _LINUX_KERNEL_H #define _LINUX_KERNEL_H From 06c5088aeedafc06f8b33074d67e30077ba71b8b Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:19 -0800 Subject: [PATCH 12/55] fs/exec: replace strlcpy with strscpy_pad in __set_task_comm Patch series "task comm cleanups", v2. This patchset is part of the patchset "extend task comm from 16 to 24"[1]. Now we have different opinion that dynamically allocates memory to store kthread's long name into a separate pointer, so I decide to take the useful cleanups apart from the original patchset and send it separately[2]. These useful cleanups can make the usage around task comm less error-prone. Furthermore, it will be useful if we want to extend task comm in the future. [1]. https://lore.kernel.org/lkml/20211101060419.4682-1-laoar.shao@gmail.com/ [2]. https://lore.kernel.org/lkml/CALOAHbAx55AUo3bm8ZepZSZnw7A08cvKPdPyNTf=E_tPqmw5hw@mail.gmail.com/ This patch (of 7): strlcpy() can trigger out-of-bound reads on the source string[1], we'd better use strscpy() instead. To make it be robust against full tsk->comm copies that got noticed in other places, we should make sure it's zero padded. 
[1] https://github.com/KSPP/linux/issues/89 Link: https://lkml.kernel.org/r/20211120112738.45980-1-laoar.shao@gmail.com Link: https://lkml.kernel.org/r/20211120112738.45980-2-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Kees Cook Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Andrii Nakryiko Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 537d92c41105bc..51d3cb4e3cdfa2 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1222,7 +1222,7 @@ void __set_task_comm(struct task_struct *tsk, const char *buf, bool exec) { task_lock(tsk); trace_task_rename(tsk, buf); - strlcpy(tsk->comm, buf, sizeof(tsk->comm)); + strscpy_pad(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); perf_event_comm(tsk, exec); } From 503471ac36df60bba037c3b110d76f53a93f61b5 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:22 -0800 Subject: [PATCH 13/55] fs/exec: replace strncpy with strscpy_pad in __get_task_comm If the dest buffer size is smaller than sizeof(tsk->comm), the buffer will be left without a NUL terminator, which may cause problems. Using strscpy_pad() instead of strncpy() in __get_task_comm() can make the string always NUL terminated and zero padded. 
Link: https://lkml.kernel.org/r/20211120112738.45980-3-laoar.shao@gmail.com Suggested-by: Kees Cook Suggested-by: Steven Rostedt Signed-off-by: Yafang Shao Reviewed-by: Kees Cook Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Andrii Nakryiko Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/exec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 51d3cb4e3cdfa2..fa142638b191cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1207,7 +1207,8 @@ static int unshare_sighand(struct task_struct *me) char *__get_task_comm(char *buf, size_t buf_size, struct task_struct *tsk) { task_lock(tsk); - strncpy(buf, tsk->comm, buf_size); + /* Always NUL terminated and zero-padded */ + strscpy_pad(buf, tsk->comm, buf_size); task_unlock(tsk); return buf; } From 7b6397d7e5dfabf2ce1e77739d2a24af31b8a43f Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:26 -0800 Subject: [PATCH 14/55] drivers/infiniband: replace open-coded string copy with get_task_comm We'd better use the helper get_task_comm() rather than the open-coded strlcpy() to get task comm. As the comment above the hard-coded 16, we can replace it with TASK_COMM_LEN. 
Link: https://lkml.kernel.org/r/20211120112738.45980-4-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Dennis Dalessandro Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Andrii Nakryiko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/qib/qib.h | 2 +- drivers/infiniband/hw/qib/qib_file_ops.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h index 9363bccfc6e718..a8e1c30c370f1d 100644 --- a/drivers/infiniband/hw/qib/qib.h +++ b/drivers/infiniband/hw/qib/qib.h @@ -196,7 +196,7 @@ struct qib_ctxtdata { pid_t pid; pid_t subpid[QLOGIC_IB_MAX_SUBCTXT]; /* same size as task_struct .comm[], command that opened context */ - char comm[16]; + char comm[TASK_COMM_LEN]; /* pkeys set by this use of this ctxt */ u16 pkeys[4]; /* so file ops can get at unit */ diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index 63854f4b652455..aa290928cf9680 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -1321,7 +1321,7 @@ static int setup_ctxt(struct qib_pportdata *ppd, int ctxt, rcd->tid_pg_list = ptmp; rcd->pid = current->pid; init_waitqueue_head(&dd->rcd[ctxt]->wait); - strlcpy(rcd->comm, current->comm, sizeof(rcd->comm)); + get_task_comm(rcd->comm, current); ctxt_fp(fp) = rcd; qib_stats.sps_ctxts++; dd->freectxts--; From 95af469c4f609de011debc08e7a35b45201623a8 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:29 -0800 Subject: [PATCH 15/55] fs/binfmt_elf: replace open-coded string copy with get_task_comm It is better to use get_task_comm() instead of the open coded string copy as we do in other places. 
struct elf_prpsinfo is used to dump the task information in userspace coredump or kernel vmcore. Below is the verification of vmcore, crash> ps PID PPID CPU TASK ST %MEM VSZ RSS COMM 0 0 0 ffffffff9d21a940 RU 0.0 0 0 [swapper/0] > 0 0 1 ffffa09e40f85e80 RU 0.0 0 0 [swapper/1] > 0 0 2 ffffa09e40f81f80 RU 0.0 0 0 [swapper/2] > 0 0 3 ffffa09e40f83f00 RU 0.0 0 0 [swapper/3] > 0 0 4 ffffa09e40f80000 RU 0.0 0 0 [swapper/4] > 0 0 5 ffffa09e40f89f80 RU 0.0 0 0 [swapper/5] 0 0 6 ffffa09e40f8bf00 RU 0.0 0 0 [swapper/6] > 0 0 7 ffffa09e40f88000 RU 0.0 0 0 [swapper/7] > 0 0 8 ffffa09e40f8de80 RU 0.0 0 0 [swapper/8] > 0 0 9 ffffa09e40f95e80 RU 0.0 0 0 [swapper/9] > 0 0 10 ffffa09e40f91f80 RU 0.0 0 0 [swapper/10] > 0 0 11 ffffa09e40f93f00 RU 0.0 0 0 [swapper/11] > 0 0 12 ffffa09e40f90000 RU 0.0 0 0 [swapper/12] > 0 0 13 ffffa09e40f9bf00 RU 0.0 0 0 [swapper/13] > 0 0 14 ffffa09e40f98000 RU 0.0 0 0 [swapper/14] > 0 0 15 ffffa09e40f9de80 RU 0.0 0 0 [swapper/15] It works well as expected. Some comments are added to explain why we use the hard-coded 16. 
Link: https://lkml.kernel.org/r/20211120112738.45980-5-laoar.shao@gmail.com Suggested-by: Kees Cook Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 2 +- include/linux/elfcore-compat.h | 5 +++++ include/linux/elfcore.h | 5 +++++ 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8c7f26f1fbb3f..b9a33cc34d6bf6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1585,7 +1585,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } diff --git a/include/linux/elfcore-compat.h b/include/linux/elfcore-compat.h index e272c3d452ce78..54feb64e9b5df9 100644 --- a/include/linux/elfcore-compat.h +++ b/include/linux/elfcore-compat.h @@ -43,6 +43,11 @@ struct compat_elf_prpsinfo __compat_uid_t pr_uid; __compat_gid_t pr_gid; compat_pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; + /* + * The hard-coded 16 is derived from TASK_COMM_LEN, but it can't be + * changed as it is exposed to userspace. We'd better make it hard-coded + * here. 
+ */ char pr_fname[16]; char pr_psargs[ELF_PRARGSZ]; }; diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 957ebec35aad01..746e081879a5ab 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -65,6 +65,11 @@ struct elf_prpsinfo __kernel_gid_t pr_gid; pid_t pr_pid, pr_ppid, pr_pgrp, pr_sid; /* Lots missing */ + /* + * The hard-coded 16 is derived from TASK_COMM_LEN, but it can't be + * changed as it is exposed to userspace. We'd better make it hard-coded + * here. + */ char pr_fname[16]; /* filename of executable */ char pr_psargs[ELF_PRARGSZ]; /* initial part of arg list */ }; From d068144d3b2cae09062ed936a3865c093ff69590 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:33 -0800 Subject: [PATCH 16/55] samples/bpf/test_overhead_kprobe_kern: replace bpf_probe_read_kernel with bpf_probe_read_kernel_str to get task comm bpf_probe_read_kernel_str() will add a nul terminator to the dst, then we don't care about if the dst size is big enough. This patch also replaces the hard-coded 16 with TASK_COMM_LEN to make it grepable. 
Link: https://lkml.kernel.org/r/20211120112738.45980-6-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: Kees Cook Acked-by: Andrii Nakryiko Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- samples/bpf/offwaketime_kern.c | 4 ++-- samples/bpf/test_overhead_kprobe_kern.c | 11 ++++++----- samples/bpf/test_overhead_tp_kern.c | 5 +++-- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/samples/bpf/offwaketime_kern.c b/samples/bpf/offwaketime_kern.c index 4866afd054dab8..eb4d94742e6b41 100644 --- a/samples/bpf/offwaketime_kern.c +++ b/samples/bpf/offwaketime_kern.c @@ -113,11 +113,11 @@ static inline int update_counts(void *ctx, u32 pid, u64 delta) /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ struct sched_switch_args { unsigned long long pad; - char prev_comm[16]; + char prev_comm[TASK_COMM_LEN]; int prev_pid; int prev_prio; long long prev_state; - char next_comm[16]; + char next_comm[TASK_COMM_LEN]; int next_pid; int next_prio; }; diff --git a/samples/bpf/test_overhead_kprobe_kern.c b/samples/bpf/test_overhead_kprobe_kern.c index f6d593e47037d9..8fdd2c9c56b2b0 100644 --- a/samples/bpf/test_overhead_kprobe_kern.c +++ b/samples/bpf/test_overhead_kprobe_kern.c @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -22,17 +23,17 @@ int prog(struct pt_regs *ctx) { struct signal_struct *signal; struct task_struct *tsk; - char oldcomm[16] = {}; - char newcomm[16] = {}; + char oldcomm[TASK_COMM_LEN] = {}; + char newcomm[TASK_COMM_LEN] = {}; u16 oom_score_adj; u32 pid; tsk = (void *)PT_REGS_PARM1(ctx); pid = _(tsk->pid); - bpf_probe_read_kernel(oldcomm, sizeof(oldcomm), &tsk->comm); - 
bpf_probe_read_kernel(newcomm, sizeof(newcomm), - (void *)PT_REGS_PARM2(ctx)); + bpf_probe_read_kernel_str(oldcomm, sizeof(oldcomm), &tsk->comm); + bpf_probe_read_kernel_str(newcomm, sizeof(newcomm), + (void *)PT_REGS_PARM2(ctx)); signal = _(tsk->signal); oom_score_adj = _(signal->oom_score_adj); return 0; diff --git a/samples/bpf/test_overhead_tp_kern.c b/samples/bpf/test_overhead_tp_kern.c index eaa32693f8fc1a..80edadacb6925f 100644 --- a/samples/bpf/test_overhead_tp_kern.c +++ b/samples/bpf/test_overhead_tp_kern.c @@ -4,6 +4,7 @@ * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. */ +#include #include #include @@ -11,8 +12,8 @@ struct task_rename { __u64 pad; __u32 pid; - char oldcomm[16]; - char newcomm[16]; + char oldcomm[TASK_COMM_LEN]; + char newcomm[TASK_COMM_LEN]; __u16 oom_score_adj; }; SEC("tracepoint/task/task_rename") From 4cfb943537ed3716daf668ca5a33d3ce667f82a3 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:36 -0800 Subject: [PATCH 17/55] tools/bpf/bpftool/skeleton: replace bpf_probe_read_kernel with bpf_probe_read_kernel_str to get task comm bpf_probe_read_kernel_str() will add a nul terminator to the dst, then we don't care about if the dst size is big enough. 
Link: https://lkml.kernel.org/r/20211120112738.45980-7-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Andrii Nakryiko Reviewed-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/bpf/bpftool/skeleton/pid_iter.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c index d9b420972934f9..f70702fcb224c3 100644 --- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c +++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c @@ -71,8 +71,8 @@ int iter(struct bpf_iter__task_file *ctx) e.pid = task->tgid; e.id = get_obj_id(file->private_data, obj_type); - bpf_probe_read_kernel(&e.comm, sizeof(e.comm), - task->group_leader->comm); + bpf_probe_read_kernel_str(&e.comm, sizeof(e.comm), + task->group_leader->comm); bpf_seq_write(ctx->meta->seq, &e, sizeof(e)); return 0; From 3087c61ed2c48548b74dd343a5209b87082c682d Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:40 -0800 Subject: [PATCH 18/55] tools/testing/selftests/bpf: replace open-coded 16 with TASK_COMM_LEN As the sched:sched_switch tracepoint args are derived from the kernel, we'd better make it same with the kernel. So the macro TASK_COMM_LEN is converted to type enum, then all the BPF programs can get it through BTF. The BPF program which wants to use TASK_COMM_LEN should include the header vmlinux.h. Regarding the test_stacktrace_map and test_tracepoint, as the type defined in linux/bpf.h are also defined in vmlinux.h, so we don't need to include linux/bpf.h again. 
Link: https://lkml.kernel.org/r/20211120112738.45980-8-laoar.shao@gmail.com Signed-off-by: Yafang Shao Acked-by: Andrii Nakryiko Acked-by: David Hildenbrand Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: David Hildenbrand Cc: Al Viro Cc: Kees Cook Cc: Petr Mladek Cc: Alexei Starovoitov Cc: Dennis Dalessandro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++++-- tools/testing/selftests/bpf/progs/test_stacktrace_map.c | 6 +++--- tools/testing/selftests/bpf/progs/test_tracepoint.c | 6 +++--- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 78c351e35fec63..cecd4806edc667 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -274,8 +274,13 @@ struct task_group; #define get_current_state() READ_ONCE(current->__state) -/* Task command name length: */ -#define TASK_COMM_LEN 16 +/* + * Define the task command name length as enum, then it can be visible to + * BPF programs. 
+ */ +enum { + TASK_COMM_LEN = 16, +}; extern void scheduler_tick(void); diff --git a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c index a8233e7f173bcc..728dbd39eff0b9 100644 --- a/tools/testing/selftests/bpf/progs/test_stacktrace_map.c +++ b/tools/testing/selftests/bpf/progs/test_stacktrace_map.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2018 Facebook -#include +#include #include #ifndef PERF_MAX_STACK_DEPTH @@ -41,11 +41,11 @@ struct { /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ struct sched_switch_args { unsigned long long pad; - char prev_comm[16]; + char prev_comm[TASK_COMM_LEN]; int prev_pid; int prev_prio; long long prev_state; - char next_comm[16]; + char next_comm[TASK_COMM_LEN]; int next_pid; int next_prio; }; diff --git a/tools/testing/selftests/bpf/progs/test_tracepoint.c b/tools/testing/selftests/bpf/progs/test_tracepoint.c index ce6974016f53fa..43bd7a20cc5031 100644 --- a/tools/testing/selftests/bpf/progs/test_tracepoint.c +++ b/tools/testing/selftests/bpf/progs/test_tracepoint.c @@ -1,17 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 // Copyright (c) 2017 Facebook -#include +#include #include /* taken from /sys/kernel/debug/tracing/events/sched/sched_switch/format */ struct sched_switch_args { unsigned long long pad; - char prev_comm[16]; + char prev_comm[TASK_COMM_LEN]; int prev_pid; int prev_prio; long long prev_state; - char next_comm[16]; + char next_comm[TASK_COMM_LEN]; int next_pid; int next_prio; }; From d6986ce24fc00b0638bd29efe8fb7ba7619ed2aa Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 19 Jan 2022 18:08:43 -0800 Subject: [PATCH 19/55] kthread: dynamically allocate memory to store kthread's full name When I was implementing a new per-cpu kthread cfs_migration, I found the comm of it "cfs_migration/%u" is truncated due to the limitation of TASK_COMM_LEN. 
For example, the comm of the percpu thread on CPU10~19 all have the same name "cfs_migration/1", which will confuse the user. This issue is not critical, because we can get the corresponding CPU from the task's Cpus_allowed. But for kthreads corresponding to other hardware devices, it is not easy to get the detailed device info from task comm, for example, jbd2/nvme0n1p2- xfs-reclaim/sdf Currently there are so many truncated kthreads: rcu_tasks_kthre rcu_tasks_rude_ rcu_tasks_trace poll_mpt3sas0_s ext4-rsv-conver xfs-reclaim/sd{a, b, c, ...} xfs-blockgc/sd{a, b, c, ...} xfs-inodegc/sd{a, b, c, ...} audit_send_repl ecryptfs-kthrea vfio-irqfd-clea jbd2/nvme0n1p2- ... We can shorten these names to work around this problem, but it may not be applicable to all of the truncated kthreads. Take 'jbd2/nvme0n1p2-' for example, it is a nice name, and it is not a good idea to shorten it. One possible way to fix this issue is extending the task comm size, but as task->comm is used in lots of places, that may cause some potential buffer overflows. Another more conservative approach is introducing a new pointer to store kthread's full name if it is truncated, which won't introduce too much overhead as it is in the non-critical path. Finally we make a decision to use the second approach. 
See also the discussions in this thread: https://lore.kernel.org/lkml/20211101060419.4682-1-laoar.shao@gmail.com/ After this change, the full name of these truncated kthreads will be displayed via /proc/[pid]/comm: rcu_tasks_kthread rcu_tasks_rude_kthread rcu_tasks_trace_kthread poll_mpt3sas0_statu ext4-rsv-conversion xfs-reclaim/sdf1 xfs-blockgc/sdf1 xfs-inodegc/sdf1 audit_send_reply ecryptfs-kthread vfio-irqfd-cleanup jbd2/nvme0n1p2-8 Link: https://lkml.kernel.org/r/20211120112850.46047-1-laoar.shao@gmail.com Signed-off-by: Yafang Shao Reviewed-by: David Hildenbrand Reviewed-by: Petr Mladek Suggested-by: Petr Mladek Suggested-by: Steven Rostedt Cc: Mathieu Desnoyers Cc: Arnaldo Carvalho de Melo Cc: Alexei Starovoitov Cc: Andrii Nakryiko Cc: Michal Miroslaw Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Matthew Wilcox Cc: Al Viro Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 3 +++ include/linux/kthread.h | 1 + kernel/kthread.c | 32 ++++++++++++++++++++++++++++++-- 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index ff869a66b34e39..4321aa63835d4f 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -92,6 +92,7 @@ #include #include #include +#include #include #include "internal.h" @@ -102,6 +103,8 @@ void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape) if (p->flags & PF_WQ_WORKER) wq_worker_comm(tcomm, sizeof(tcomm), p); + else if (p->flags & PF_KTHREAD) + get_kthread_comm(tcomm, sizeof(tcomm), p); else __get_task_comm(tcomm, sizeof(tcomm), p); diff --git a/include/linux/kthread.h b/include/linux/kthread.h index 346b0f269161a3..2a5c04494663af 100644 --- a/include/linux/kthread.h +++ b/include/linux/kthread.h @@ -33,6 +33,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), unsigned int cpu, const char *namefmt); +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk); void set_kthread_struct(struct 
task_struct *p); void kthread_set_per_cpu(struct task_struct *k, int cpu); diff --git a/kernel/kthread.c b/kernel/kthread.c index 7113003fab63de..a70cd5dc94e348 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -60,6 +60,8 @@ struct kthread { #ifdef CONFIG_BLK_CGROUP struct cgroup_subsys_state *blkcg_css; #endif + /* To store the full name if task comm is truncated. */ + char *full_name; }; enum KTHREAD_BITS { @@ -93,6 +95,18 @@ static inline struct kthread *__to_kthread(struct task_struct *p) return kthread; } +void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk) +{ + struct kthread *kthread = to_kthread(tsk); + + if (!kthread || !kthread->full_name) { + __get_task_comm(buf, buf_size, tsk); + return; + } + + strscpy_pad(buf, kthread->full_name, buf_size); +} + void set_kthread_struct(struct task_struct *p) { struct kthread *kthread; @@ -118,9 +132,13 @@ void free_kthread_struct(struct task_struct *k) * or if kmalloc() in kthread() failed. */ kthread = to_kthread(k); + if (!kthread) + return; + #ifdef CONFIG_BLK_CGROUP - WARN_ON_ONCE(kthread && kthread->blkcg_css); + WARN_ON_ONCE(kthread->blkcg_css); #endif + kfree(kthread->full_name); kfree(kthread); } @@ -406,12 +424,22 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), task = create->result; if (!IS_ERR(task)) { char name[TASK_COMM_LEN]; + va_list aq; + int len; /* * task is already visible to other tasks, so updating * COMM must be protected. */ - vsnprintf(name, sizeof(name), namefmt, args); + va_copy(aq, args); + len = vsnprintf(name, sizeof(name), namefmt, aq); + va_end(aq); + if (len >= TASK_COMM_LEN) { + struct kthread *kthread = to_kthread(task); + + /* leave it truncated when out of memory. 
*/ + kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args); + } set_task_comm(task, name); } kfree(create); From 7f8ca0edfe07d271ba6bef3cef5ec7fc1bbe8a68 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 19 Jan 2022 18:08:47 -0800 Subject: [PATCH 20/55] kernel/sys.c: only take tasklist_lock for get/setpriority(PRIO_PGRP) PRIO_PGRP needs the tasklist_lock mainly to serialize vs setpgid(2), to protect against any concurrent change_pid(PIDTYPE_PGID) that can move the task from one hlist to another while iterating. However, the remaining cases can rely only on RCU: PRIO_PROCESS only does the task lookup and never iterates over tasklist and we already have an rcu-aware stable pointer. PRIO_USER is already racy vs setuid(2) so with creds being rcu protected, we can end up seeing stale data. When removing the tasklist_lock there can be a race with (i) fork but this is benign as the child's nice is inherited and the new task is not observable by the user yet either, hence the return semantics do not differ. And (ii) a race with exit, which is a small window and can cause us to miss a task which was removed from the list and it had the highest nice. Similarly change the buggy do_each_thread/while_each_thread combo in PRIO_USER for the rcu-safe for_each_process_thread flavor, which doesn't make use of next_thread/p->thread_group. 
[akpm@linux-foundation.org: coding style fixes] Link: https://lkml.kernel.org/r/20211210182250.43734-1-dave@stgolabs.net Signed-off-by: Davidlohr Bueso Acked-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 8fdac0d90504a4..34bbe8cd1f0404 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -220,7 +220,6 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) niceval = MAX_NICE; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -235,9 +234,11 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -249,16 +250,15 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* For find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); out: return error; @@ -283,7 +283,6 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) return -EINVAL; rcu_read_lock(); - read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: if (who) @@ -301,11 +300,13 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) pgrp = find_vpid(who); else pgrp = task_pgrp(current); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } 
while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); break; case PRIO_USER: uid = make_kuid(cred->user_ns, who); @@ -317,19 +318,18 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (!user) goto out_unlock; /* No processes for this user */ } - do_each_thread(g, p) { + for_each_process_thread(g, p) { if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; } - } while_each_thread(g, p); + } if (!uid_eq(uid, cred->uid)) free_uid(user); /* for find_user() */ break; } out_unlock: - read_unlock(&tasklist_lock); rcu_read_unlock(); return retval; From 26d98e9f78da8e49413b1cb6bcd0d63ac03b8c85 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 19 Jan 2022 18:08:50 -0800 Subject: [PATCH 21/55] get_maintainer: don't remind about no git repo when --nogit is used When --nogit is used with scripts/get_maintainer.pl, the script spews 4 lines of unnecessary information (noise). Do not print those lines when --nogit is specified. This change removes the printing of these 4 lines: ./scripts/get_maintainer.pl: No supported VCS found. Add --nogit to options? Using a git repository produces better results. Try Linus Torvalds' latest git repository using: git clone git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git Link: https://lkml.kernel.org/r/20220102031424.3328-1-rdunlap@infradead.org Signed-off-by: Randy Dunlap Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 2075db0c08b8e0..6bd5221d37b8f9 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -1718,7 +1718,7 @@ sub vcs_exists { %VCS_cmds = %VCS_cmds_hg; return 2 if eval $VCS_cmds{"available"}; %VCS_cmds = (); - if (!$printed_novcs) { + if (!$printed_novcs && $email_git) { warn("$P: No supported VCS found. 
Add --nogit to options?\n"); warn("Using a git repository produces better results.\n"); warn("Try Linus Torvalds' latest git repository using:\n"); From 70ac69928e9717a313a4c72647ebe80663e397a3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 19 Jan 2022 18:08:53 -0800 Subject: [PATCH 22/55] kstrtox: uninline everything I've made a mistake of looking into lib/kstrtox.o code generation. The only function remotely performance critical is _parse_integer() (via /proc/*/map_files/*), everything else is not. Uninline everything, shrink lib/kstrtox.o by ~20 % ! Space savings on x86_64: add/remove: 0/0 grow/shrink: 0/23 up/down: 0/-1269 (-1269 !!!) Function old new delta kstrtoull 16 13 -3 kstrtouint 59 48 -11 kstrtou8 60 49 -11 kstrtou16 61 50 -11 _kstrtoul 46 35 -11 kstrtoull_from_user 95 83 -12 kstrtoul_from_user 95 83 -12 kstrtoll 93 80 -13 kstrtouint_from_user 124 83 -41 kstrtou8_from_user 125 83 -42 kstrtou16_from_user 126 83 -43 kstrtos8 101 50 -51 kstrtos16 102 51 -51 kstrtoint 100 49 -51 _kstrtol 93 35 -58 kstrtobool_from_user 156 75 -81 kstrtoll_from_user 165 83 -82 kstrtol_from_user 165 83 -82 kstrtoint_from_user 172 83 -89 kstrtos8_from_user 173 83 -90 kstrtos16_from_user 174 83 -91 _parse_integer 136 10 -126 _kstrtoull 308 101 -207 Total: Before=3421236, After=3419967, chg -0.04% Link: https://lkml.kernel.org/r/YZDsFDhHst4m2Pnt@localhost.localdomain Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/kstrtox.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/lib/kstrtox.c b/lib/kstrtox.c index 059b8b00dc532b..886510d248e5d0 100644 --- a/lib/kstrtox.c +++ b/lib/kstrtox.c @@ -22,6 +22,7 @@ #include "kstrtox.h" +noinline const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) { if (*base == 0) { @@ -47,6 +48,7 @@ const char *_parse_integer_fixup_radix(const char *s, unsigned int *base) * * Don't you dare use this function. 
*/ +noinline unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned long long *p, size_t max_chars) { @@ -85,6 +87,7 @@ unsigned int _parse_integer_limit(const char *s, unsigned int base, unsigned lon return rv; } +noinline unsigned int _parse_integer(const char *s, unsigned int base, unsigned long long *p) { return _parse_integer_limit(s, base, p, INT_MAX); @@ -125,6 +128,7 @@ static int _kstrtoull(const char *s, unsigned int base, unsigned long long *res) * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoull(). Return code must be checked. */ +noinline int kstrtoull(const char *s, unsigned int base, unsigned long long *res) { if (s[0] == '+') @@ -148,6 +152,7 @@ EXPORT_SYMBOL(kstrtoull); * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoll(). Return code must be checked. */ +noinline int kstrtoll(const char *s, unsigned int base, long long *res) { unsigned long long tmp; @@ -219,6 +224,7 @@ EXPORT_SYMBOL(_kstrtol); * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtoul(). Return code must be checked. */ +noinline int kstrtouint(const char *s, unsigned int base, unsigned int *res) { unsigned long long tmp; @@ -249,6 +255,7 @@ EXPORT_SYMBOL(kstrtouint); * Returns 0 on success, -ERANGE on overflow and -EINVAL on parsing error. * Preferred over simple_strtol(). Return code must be checked. 
*/ +noinline int kstrtoint(const char *s, unsigned int base, int *res) { long long tmp; @@ -264,6 +271,7 @@ int kstrtoint(const char *s, unsigned int base, int *res) } EXPORT_SYMBOL(kstrtoint); +noinline int kstrtou16(const char *s, unsigned int base, u16 *res) { unsigned long long tmp; @@ -279,6 +287,7 @@ int kstrtou16(const char *s, unsigned int base, u16 *res) } EXPORT_SYMBOL(kstrtou16); +noinline int kstrtos16(const char *s, unsigned int base, s16 *res) { long long tmp; @@ -294,6 +303,7 @@ int kstrtos16(const char *s, unsigned int base, s16 *res) } EXPORT_SYMBOL(kstrtos16); +noinline int kstrtou8(const char *s, unsigned int base, u8 *res) { unsigned long long tmp; @@ -309,6 +319,7 @@ int kstrtou8(const char *s, unsigned int base, u8 *res) } EXPORT_SYMBOL(kstrtou8); +noinline int kstrtos8(const char *s, unsigned int base, s8 *res) { long long tmp; @@ -333,6 +344,7 @@ EXPORT_SYMBOL(kstrtos8); * [oO][NnFf] for "on" and "off". Otherwise it will return -EINVAL. Value * pointed to by res is updated upon finding a match. */ +noinline int kstrtobool(const char *s, bool *res) { if (!s) From 0425473037db40d9e322631f2d4dc6ef51f97e88 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:08:56 -0800 Subject: [PATCH 23/55] list: introduce list_is_head() helper and re-use it in list.h Introduce list_is_head() in the similar (*) way as it's done for list_entry_is_head(). Make use of it in the list.h. *) it's done as inliner and not a macro to be aligned with other list_is_*() APIs; while at it, make all three to have the same style. 
Link: https://lkml.kernel.org/r/20211201141824.81400-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Heikki Krogerus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/list.h | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/include/linux/list.h b/include/linux/list.h index 6636fc07f918f1..dd6c2041d09c1d 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -258,8 +258,7 @@ static inline void list_bulk_move_tail(struct list_head *head, * @list: the entry to test * @head: the head of the list */ -static inline int list_is_first(const struct list_head *list, - const struct list_head *head) +static inline int list_is_first(const struct list_head *list, const struct list_head *head) { return list->prev == head; } @@ -269,12 +268,21 @@ static inline int list_is_first(const struct list_head *list, * @list: the entry to test * @head: the head of the list */ -static inline int list_is_last(const struct list_head *list, - const struct list_head *head) +static inline int list_is_last(const struct list_head *list, const struct list_head *head) { return list->next == head; } +/** + * list_is_head - tests whether @list is the list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_head(const struct list_head *list, const struct list_head *head) +{ + return list == head; +} + /** * list_empty - tests whether a list is empty * @head: the list to test. 
@@ -318,7 +326,7 @@ static inline void list_del_init_careful(struct list_head *entry) static inline int list_empty_careful(const struct list_head *head) { struct list_head *next = smp_load_acquire(&head->next); - return (next == head) && (next == head->prev); + return list_is_head(next, head) && (next == head->prev); } /** @@ -393,10 +401,9 @@ static inline void list_cut_position(struct list_head *list, { if (list_empty(head)) return; - if (list_is_singular(head) && - (head->next != entry && head != entry)) + if (list_is_singular(head) && !list_is_head(entry, head) && (entry != head->next)) return; - if (entry == head) + if (list_is_head(entry, head)) INIT_LIST_HEAD(list); else __list_cut_position(list, head, entry); @@ -570,7 +577,7 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. */ #define list_for_each(pos, head) \ - for (pos = (head)->next; pos != (head); pos = pos->next) + for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_continue - continue iteration over a list @@ -580,7 +587,7 @@ static inline void list_splice_tail_init(struct list_head *list, * Continue to iterate over a list, continuing after the current position. */ #define list_for_each_continue(pos, head) \ - for (pos = pos->next; pos != (head); pos = pos->next) + for (pos = pos->next; !list_is_head(pos, (head)); pos = pos->next) /** * list_for_each_prev - iterate over a list backwards @@ -588,7 +595,7 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. */ #define list_for_each_prev(pos, head) \ - for (pos = (head)->prev; pos != (head); pos = pos->prev) + for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev) /** * list_for_each_safe - iterate over a list safe against removal of list entry @@ -597,8 +604,9 @@ static inline void list_splice_tail_init(struct list_head *list, * @head: the head for your list. 
*/ #define list_for_each_safe(pos, n, head) \ - for (pos = (head)->next, n = pos->next; pos != (head); \ - pos = n, n = pos->next) + for (pos = (head)->next, n = pos->next; \ + !list_is_head(pos, (head)); \ + pos = n, n = pos->next) /** * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry @@ -608,7 +616,7 @@ static inline void list_splice_tail_init(struct list_head *list, */ #define list_for_each_prev_safe(pos, n, head) \ for (pos = (head)->prev, n = pos->prev; \ - pos != (head); \ + !list_is_head(pos, (head)); \ pos = n, n = pos->prev) /** From a31f9336ed48317d61c2299d595ed14294ffe5f9 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Wed, 19 Jan 2022 18:08:59 -0800 Subject: [PATCH 24/55] lib/list_debug.c: print more list debugging context in __list_del_entry_valid() Currently, the entry->prev and entry->next are considered to be valid as long as they are not LIST_POISON{1|2}. However, the memory may be corrupted. The prev->next is invalid probably because 'prev' is invalid, not because prev->next's content is illegal. Unfortunately, the printk and its subfunctions will modify the registers that hold the 'prev' and 'next', and we don't see this valuable information in the BUG context. So print the contents of 'entry->prev' and 'entry->next'. Here's an example: list_del corruption. prev->next should be c0ecbf74, but was c08410dc kernel BUG at lib/list_debug.c:53! ... ... PC is at __list_del_entry_valid+0x58/0x98 LR is at __list_del_entry_valid+0x58/0x98 psr: 60000093 sp : c0ecbf30 ip : 00000000 fp : 00000001 r10: c08410d0 r9 : 00000001 r8 : c0825e0c r7 : 20000013 r6 : c08410d0 r5 : c0ecbf74 r4 : c0ecbf74 r3 : c0825d08 r2 : 00000000 r1 : df7ce6f4 r0 : 00000044 ... ... 
Stack: (0xc0ecbf30 to 0xc0ecc000) bf20: c0ecbf74 c0164fd0 c0ecbf70 c0165170 bf40: c0eca000 c0840c00 c0840c00 c0824500 c0825e0c c0189bbc c088f404 60000013 bf60: 60000013 c0e85100 000004ec 00000000 c0ebcdc0 c0ecbf74 c0ecbf74 c0825d08 bf80: c0e807c0 c018965c 00000000 c013f2a0 c0e807c0 c013f154 00000000 00000000 bfa0: 00000000 00000000 00000000 c01001b0 00000000 00000000 00000000 00000000 bfc0: 00000000 00000000 00000000 00000000 00000000 00000000 00000000 00000000 bfe0: 00000000 00000000 00000000 00000000 00000013 00000000 00000000 00000000 (__list_del_entry_valid) from (__list_del_entry+0xc/0x20) (__list_del_entry) from (finish_swait+0x60/0x7c) (finish_swait) from (rcu_gp_kthread+0x560/0xa20) (rcu_gp_kthread) from (kthread+0x14c/0x15c) (kthread) from (ret_from_fork+0x14/0x24) At first, I thought prev->next was overwritten. Later, I carefully analyzed the RCU code and the disassembly code. The error occurred when deleting a node from the list rcu_state.gp_wq. The System.map shows that the address of rcu_state is c0840c00. Then I use gdb to obtain the offset of rcu_state.gp_wq.task_list. (gdb) p &((struct rcu_state *)0)->gp_wq.task_list $1 = (struct list_head *) 0x4dc Again: list_del corruption. prev->next should be c0ecbf74, but was c08410dc c08410dc = c0840c00 + 0x4dc = &rcu_state.gp_wq.task_list Because rcu_state.gp_wq has at most one node, so I can guess that "prev = &rcu_state.gp_wq.task_list". But for other scenes, maybe I wasn't so lucky, I cannot figure out the value of 'prev'. Link: https://lkml.kernel.org/r/20211207025835.1909-1-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Cc: "Paul E . 
McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/list_debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/list_debug.c b/lib/list_debug.c index 5d5424b51b746f..9daa3fb9d1cd61 100644 --- a/lib/list_debug.c +++ b/lib/list_debug.c @@ -49,11 +49,11 @@ bool __list_del_entry_valid(struct list_head *entry) "list_del corruption, %px->prev is LIST_POISON2 (%px)\n", entry, LIST_POISON2) || CHECK_DATA_CORRUPTION(prev->next != entry, - "list_del corruption. prev->next should be %px, but was %px\n", - entry, prev->next) || + "list_del corruption. prev->next should be %px, but was %px. (prev=%px)\n", + entry, prev->next, prev) || CHECK_DATA_CORRUPTION(next->prev != entry, - "list_del corruption. next->prev should be %px, but was %px\n", - entry, next->prev)) + "list_del corruption. next->prev should be %px, but was %px. (next=%px)\n", + entry, next->prev, next)) return false; return true; From fd0a1462405b087377e59b84e119fe7e2d08499a Mon Sep 17 00:00:00 2001 From: Isabella Basso Date: Wed, 19 Jan 2022 18:09:02 -0800 Subject: [PATCH 25/55] hash.h: remove unused define directive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "test_hash.c: refactor into KUnit", v3. We refactored the lib/test_hash.c file into KUnit as part of the student group LKCAMP [1] introductory hackathon for kernel development. This test was pointed to our group by Daniel Latypov [2], so its full conversion into a pure KUnit test was our goal in this patch series, but we ran into many problems relating to it not being split as unit tests, which complicated matters a bit, as the reasoning behind the original tests is quite cryptic for those unfamiliar with hash implementations. Some interesting developments we'd like to highlight are: - In patch 1/5 we noticed that there was an unused define directive that could be removed. 
- In patch 4/5 we noticed how stringhash and hash tests are all under the lib/test_hash.c file, which might cause some confusion, and we also broke those kernel config entries up. Overall KUnit developments have been made in the other patches in this series: In patches 2/5, 3/5 and 5/5 we refactored the lib/test_hash.c file so as to make it more compatible with the KUnit style, whilst preserving the original idea of the maintainer who designed it (i.e. George Spelvin), which might be undesirable for unit tests, but we assume it is enough for a first patch. This patch (of 5): Currently, there exist hash_32() and __hash_32() functions, which were introduced in a patch [1] targeting architecture specific optimizations. These functions can be overridden on a per-architecture basis to achieve such optimizations. They must set their corresponding define directive (HAVE_ARCH_HASH_32 and HAVE_ARCH__HASH_32, respectively) so that header files can deal with these overrides properly. As the supported 32-bit architectures that have their own hash function implementation (i.e. m68k, Microblaze, H8/300, pa-risc) have only been making use of the (more general) __hash_32() function (which only lacks a right shift operation when compared to the hash_32() function), remove the define directive corresponding to the arch-specific hash_32() implementation. 
[1] https://lore.kernel.org/lkml/20160525073311.5600.qmail@ns.sciencehorizons.net/ [akpm@linux-foundation.org: hash_32_generic() becomes hash_32()] Link: https://lkml.kernel.org/r/20211208183711.390454-1-isabbasso@riseup.net Link: https://lkml.kernel.org/r/20211208183711.390454-2-isabbasso@riseup.net Reviewed-by: David Gow Tested-by: David Gow Co-developed-by: Augusto Durães Camargo Signed-off-by: Augusto Durães Camargo Co-developed-by: Enzo Ferreira Signed-off-by: Enzo Ferreira Signed-off-by: Isabella Basso Cc: Geert Uytterhoeven Cc: Brendan Higgins Cc: Daniel Latypov Cc: Shuah Khan Cc: Rodrigo Siqueira Cc: kernel test robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/sw/rxe/rxe_qp.c | 3 +-- include/linux/hash.h | 5 +---- lib/test_hash.c | 24 +----------------------- tools/include/linux/hash.h | 5 +---- 4 files changed, 4 insertions(+), 33 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c index 54b8711321c1e5..44c9ea601bff75 100644 --- a/drivers/infiniband/sw/rxe/rxe_qp.c +++ b/drivers/infiniband/sw/rxe/rxe_qp.c @@ -217,8 +217,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp, * the port number must be in the Dynamic Ports range * (0xc000 - 0xffff). 
*/ - qp->src_port = RXE_ROCE_V2_SPORT + - (hash_32_generic(qp_num(qp), 14) & 0x3fff); + qp->src_port = RXE_ROCE_V2_SPORT + (hash_32(qp_num(qp), 14) & 0x3fff); qp->sq.max_wr = init->cap.max_send_wr; /* These caps are limited by rxe_qp_chk_cap() done by the caller */ diff --git a/include/linux/hash.h b/include/linux/hash.h index ad6fa21d977b59..38edaa08f86293 100644 --- a/include/linux/hash.h +++ b/include/linux/hash.h @@ -62,10 +62,7 @@ static inline u32 __hash_32_generic(u32 val) return val * GOLDEN_RATIO_32; } -#ifndef HAVE_ARCH_HASH_32 -#define hash_32 hash_32_generic -#endif -static inline u32 hash_32_generic(u32 val, unsigned int bits) +static inline u32 hash_32(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); diff --git a/lib/test_hash.c b/lib/test_hash.c index 0ee40b4a56ddaf..d4b0cfdb0377f1 100644 --- a/lib/test_hash.c +++ b/lib/test_hash.c @@ -94,22 +94,7 @@ test_int_hash(unsigned long long h64, u32 hash_or[2][33]) pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m); return false; } -#ifdef HAVE_ARCH_HASH_32 - h2 = hash_32_generic(h0, k); -#if HAVE_ARCH_HASH_32 == 1 - if (h1 != h2) { - pr_err("hash_32(%#x, %d) = %#x != hash_32_generic() " - " = %#x", h0, k, h1, h2); - return false; - } -#else - if (h2 > m) { - pr_err("hash_32_generic(%#x, %d) = %#x > %#x", - h0, k, h1, m); - return false; - } -#endif -#endif + /* Test hash_64 */ hash_or[1][k] |= h1 = hash_64(h64, k); if (h1 > m) { @@ -227,13 +212,6 @@ test_hash_init(void) #else pr_info("__hash_32() has no arch implementation to test."); #endif -#ifdef HAVE_ARCH_HASH_32 -#if HAVE_ARCH_HASH_32 != 1 - pr_info("hash_32() is arch-specific; not compared to generic."); -#endif -#else - pr_info("hash_32() has no arch implementation to test."); -#endif #ifdef HAVE_ARCH_HASH_64 #if HAVE_ARCH_HASH_64 != 1 pr_info("hash_64() is arch-specific; not compared to generic."); diff --git a/tools/include/linux/hash.h b/tools/include/linux/hash.h index 
ad6fa21d977b59..38edaa08f86293 100644 --- a/tools/include/linux/hash.h +++ b/tools/include/linux/hash.h @@ -62,10 +62,7 @@ static inline u32 __hash_32_generic(u32 val) return val * GOLDEN_RATIO_32; } -#ifndef HAVE_ARCH_HASH_32 -#define hash_32 hash_32_generic -#endif -static inline u32 hash_32_generic(u32 val, unsigned int bits) +static inline u32 hash_32(u32 val, unsigned int bits) { /* High bits are more random, so use them. */ return __hash_32(val) >> (32 - bits); From ae7880676bc8019ff61e49126c558ad7c4b6fa21 Mon Sep 17 00:00:00 2001 From: Isabella Basso Date: Wed, 19 Jan 2022 18:09:05 -0800 Subject: [PATCH 26/55] test_hash.c: split test_int_hash into arch-specific functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split the test_int_hash function to keep its mainloop separate from arch-specific chunks, which are only compiled as needed. This aims at improving readability. Link: https://lkml.kernel.org/r/20211208183711.390454-3-isabbasso@riseup.net Reviewed-by: David Gow Tested-by: David Gow Signed-off-by: Isabella Basso Cc: Augusto Durães Camargo Cc: Brendan Higgins Cc: Daniel Latypov Cc: Enzo Ferreira Cc: Geert Uytterhoeven Cc: kernel test robot Cc: Rodrigo Siqueira Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_hash.c | 91 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 62 insertions(+), 29 deletions(-) diff --git a/lib/test_hash.c b/lib/test_hash.c index d4b0cfdb0377f1..2b4fe4976cc483 100644 --- a/lib/test_hash.c +++ b/lib/test_hash.c @@ -56,6 +56,58 @@ fill_buf(char *buf, size_t len, u32 seed) } } +/* Holds most testing variables for the int test. */ +struct test_hash_params { + /* Pointer to integer to be hashed. */ + unsigned long long *h64; + /* Low 32-bits of integer to be hashed. */ + u32 h0; + /* Arch-specific hash result. */ + u32 h1; + /* Generic hash result. */ + u32 h2; + /* ORed hashes of given size (in bits). 
*/ + u32 (*hash_or)[33]; +}; + +#ifdef HAVE_ARCH__HASH_32 +static bool __init +test_int__hash_32(struct test_hash_params *params) +{ + params->hash_or[1][0] |= params->h2 = __hash_32_generic(params->h0); +#if HAVE_ARCH__HASH_32 == 1 + if (params->h1 != params->h2) { + pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", + params->h0, params->h1, params->h2); + return false; + } +#endif + return true; +} +#endif + +#ifdef HAVE_ARCH_HASH_64 +static bool __init +test_int_hash_64(struct test_hash_params *params, u32 const *m, int *k) +{ + params->h2 = hash_64_generic(*params->h64, *k); +#if HAVE_ARCH_HASH_64 == 1 + if (params->h1 != params->h2) { + pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() = %#x", + *params->h64, *k, params->h1, params->h2); + return false; + } +#else + if (params->h2 > *m) { + pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", + *params->h64, *k, params->h1, *m); + return false; + } +#endif + return true; +} +#endif + /* * Test the various integer hash functions. h64 (or its low-order bits) * is the integer to hash. 
hash_or accumulates the OR of the hash values, @@ -69,19 +121,13 @@ static bool __init test_int_hash(unsigned long long h64, u32 hash_or[2][33]) { int k; - u32 h0 = (u32)h64, h1, h2; + struct test_hash_params params = { &h64, (u32)h64, 0, 0, hash_or }; /* Test __hash32 */ - hash_or[0][0] |= h1 = __hash_32(h0); + hash_or[0][0] |= params.h1 = __hash_32(params.h0); #ifdef HAVE_ARCH__HASH_32 - hash_or[1][0] |= h2 = __hash_32_generic(h0); -#if HAVE_ARCH__HASH_32 == 1 - if (h1 != h2) { - pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", - h0, h1, h2); + if (!test_int__hash_32(&params)) return false; - } -#endif #endif /* Test k = 1..32 bits */ @@ -89,37 +135,24 @@ test_int_hash(unsigned long long h64, u32 hash_or[2][33]) u32 const m = ((u32)2 << (k-1)) - 1; /* Low k bits set */ /* Test hash_32 */ - hash_or[0][k] |= h1 = hash_32(h0, k); - if (h1 > m) { - pr_err("hash_32(%#x, %d) = %#x > %#x", h0, k, h1, m); + hash_or[0][k] |= params.h1 = hash_32(params.h0, k); + if (params.h1 > m) { + pr_err("hash_32(%#x, %d) = %#x > %#x", params.h0, k, params.h1, m); return false; } /* Test hash_64 */ - hash_or[1][k] |= h1 = hash_64(h64, k); - if (h1 > m) { - pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, h1, m); + hash_or[1][k] |= params.h1 = hash_64(h64, k); + if (params.h1 > m) { + pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, params.h1, m); return false; } #ifdef HAVE_ARCH_HASH_64 - h2 = hash_64_generic(h64, k); -#if HAVE_ARCH_HASH_64 == 1 - if (h1 != h2) { - pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() " - "= %#x", h64, k, h1, h2); + if (!test_int_hash_64(&params, &m, &k)) return false; - } -#else - if (h2 > m) { - pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", - h64, k, h1, m); - return false; - } -#endif #endif } - (void)h2; /* Suppress unused variable warning */ return true; } From 5427d3d772a77a4d67fece057064832ec5cfa078 Mon Sep 17 00:00:00 2001 From: Isabella Basso Date: Wed, 19 Jan 2022 18:09:09 -0800 Subject: [PATCH 27/55] test_hash.c: split test_hash_init
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split up test_hash_init so that it calls each test more explicitly insofar it is possible without rewriting the entire file. This aims at improving readability. Split tests performed on string_or as they don't interfere with those performed in hash_or. Also separate pr_info calls about skipped tests as they're not part of the tests themselves, but only warn about (un)defined arch-specific hash functions. Link: https://lkml.kernel.org/r/20211208183711.390454-4-isabbasso@riseup.net Reviewed-by: David Gow Tested-by: David Gow Signed-off-by: Isabella Basso Cc: Augusto Durães Camargo Cc: Brendan Higgins Cc: Daniel Latypov Cc: Enzo Ferreira Cc: Geert Uytterhoeven Cc: kernel test robot Cc: Rodrigo Siqueira Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_hash.c | 66 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 54 insertions(+), 12 deletions(-) diff --git a/lib/test_hash.c b/lib/test_hash.c index 2b4fe4976cc483..032849a48da704 100644 --- a/lib/test_hash.c +++ b/lib/test_hash.c @@ -158,11 +158,39 @@ test_int_hash(unsigned long long h64, u32 hash_or[2][33]) #define SIZE 256 /* Run time is cubic in SIZE */ -static int __init -test_hash_init(void) +static int __init test_string_or(void) { char buf[SIZE+1]; - u32 string_or = 0, hash_or[2][33] = { { 0, } }; + u32 string_or = 0; + int i, j; + + fill_buf(buf, SIZE, 1); + + /* Test every possible non-empty substring in the buffer. 
*/ + for (j = SIZE; j > 0; --j) { + buf[j] = '\0'; + + for (i = 0; i <= j; i++) { + u32 h0 = full_name_hash(buf+i, buf+i, j-i); + + string_or |= h0; + } /* i */ + } /* j */ + + /* The OR of all the hash values should cover all the bits */ + if (~string_or) { + pr_err("OR of all string hash results = %#x != %#x", + string_or, -1u); + return -EINVAL; + } + + return 0; +} + +static int __init test_hash_or(void) +{ + char buf[SIZE+1]; + u32 hash_or[2][33] = { { 0, } }; unsigned tests = 0; unsigned long long h64 = 0; int i, j; @@ -192,7 +220,6 @@ test_hash_init(void) return -EINVAL; } - string_or |= h0; h64 = h64 << 32 | h0; /* For use with hash_64 */ if (!test_int_hash(h64, hash_or)) return -EINVAL; @@ -200,12 +227,6 @@ test_hash_init(void) } /* i */ } /* j */ - /* The OR of all the hash values should cover all the bits */ - if (~string_or) { - pr_err("OR of all string hash results = %#x != %#x", - string_or, -1u); - return -EINVAL; - } if (~hash_or[0][0]) { pr_err("OR of all __hash_32 results = %#x != %#x", hash_or[0][0], -1u); @@ -237,6 +258,13 @@ test_hash_init(void) } } + pr_notice("%u tests passed.", tests); + + return 0; +} + +static void __init notice_skipped_tests(void) +{ /* Issue notices about skipped tests. 
*/ #ifdef HAVE_ARCH__HASH_32 #if HAVE_ARCH__HASH_32 != 1 @@ -252,10 +280,24 @@ test_hash_init(void) #else pr_info("hash_64() has no arch implementation to test."); #endif +} - pr_notice("%u tests passed.", tests); +static int __init +test_hash_init(void) +{ + int ret; - return 0; + ret = test_string_or(); + if (ret < 0) + return ret; + + ret = test_hash_or(); + if (ret < 0) + return ret; + + notice_skipped_tests(); + + return ret; } static void __exit test_hash_exit(void) From 88168bf35c5260013daab4bddf944cd557cb6f08 Mon Sep 17 00:00:00 2001 From: Isabella Basso Date: Wed, 19 Jan 2022 18:09:12 -0800 Subject: [PATCH 28/55] lib/Kconfig.debug: properly split hash test kernel entries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split TEST_HASH so that each entry only has one file. Note that there's no stringhash test file, but actually tests are performed in lib/test_hash.c. Link: https://lkml.kernel.org/r/20211208183711.390454-5-isabbasso@riseup.net Reviewed-by: David Gow Tested-by: David Gow Signed-off-by: Isabella Basso Cc: Augusto Durães Camargo Cc: Brendan Higgins Cc: Daniel Latypov Cc: Enzo Ferreira Cc: Geert Uytterhoeven Cc: kernel test robot Cc: Rodrigo Siqueira Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 14 +++++++++++--- lib/Makefile | 3 ++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 5e14e32056add2..f27de2050ca00c 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2210,9 +2210,17 @@ config TEST_RHASHTABLE config TEST_HASH tristate "Perform selftest on hash functions" help - Enable this option to test the kernel's integer (<linux/hash.h>), - string (<linux/stringhash.h>), and siphash (<linux/siphash.h>) - hash functions on boot (or module load). + Enable this option to test the kernel's integer (<linux/hash.h>), and + string (<linux/stringhash.h>) hash functions on boot (or module load). + + This is intended to help people writing architecture-specific + optimized versions.
If unsure, say N. + +config TEST_SIPHASH + tristate "Perform selftest on siphash functions" + help + Enable this option to test the kernel's siphash (<linux/siphash.h>) hash + functions on boot (or module load). This is intended to help people writing architecture-specific optimized versions. If unsure, say N. diff --git a/lib/Makefile b/lib/Makefile index 364c23f1557816..f3a2a251471d07 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -61,7 +61,8 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_BITOPS) += test_bitops.o CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o -obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o +obj-$(CONFIG_TEST_SIPHASH) += test_siphash.o +obj-$(CONFIG_TEST_HASH) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o CFLAGS_test_kasan.o += -fno-builtin From 0acc968f352336a459f27ba1f23745a174933c9c Mon Sep 17 00:00:00 2001 From: Isabella Basso Date: Wed, 19 Jan 2022 18:09:15 -0800 Subject: [PATCH 29/55] test_hash.c: refactor into kunit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use KUnit framework to make tests more easily integrable with CIs. Even though these tests are not yet properly written as unit tests this change should help in debugging. Also remove kernel messages (i.e. through pr_info) as KUnit handles all debugging output and let it handle module init and exit details.
Link: https://lkml.kernel.org/r/20211208183711.390454-6-isabbasso@riseup.net Reviewed-by: David Gow Reported-by: kernel test robot Tested-by: David Gow Co-developed-by: Augusto Durães Camargo Signed-off-by: Augusto Durães Camargo Co-developed-by: Enzo Ferreira Signed-off-by: Enzo Ferreira Signed-off-by: Isabella Basso Cc: Brendan Higgins Cc: Daniel Latypov Cc: Geert Uytterhoeven Cc: Rodrigo Siqueira Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 28 ++++--- lib/Makefile | 2 +- lib/test_hash.c | 194 +++++++++++++++------------------------------- 3 files changed, 81 insertions(+), 143 deletions(-) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f27de2050ca00c..a789da4a19a17b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2207,15 +2207,6 @@ config TEST_RHASHTABLE If unsure, say N. -config TEST_HASH - tristate "Perform selftest on hash functions" - help - Enable this option to test the kernel's integer (<linux/hash.h>), and - string (<linux/stringhash.h>) hash functions on boot (or module load). - - This is intended to help people writing architecture-specific - optimized versions. If unsure, say N. - config TEST_SIPHASH tristate "Perform selftest on siphash functions" help @@ -2364,6 +2355,25 @@ config BITFIELD_KUNIT If unsure, say N. +config HASH_KUNIT_TEST + tristate "KUnit Test for integer hash functions" if !KUNIT_ALL_TESTS + depends on KUNIT + default KUNIT_ALL_TESTS + help + Enable this option to test the kernel's string (<linux/stringhash.h>), and + integer (<linux/hash.h>) hash functions on boot. + + KUnit tests run during boot and output the results to the debug log + in TAP format (https://testanything.org/). Only useful for kernel devs + running the KUnit test harness, and not intended for inclusion into a + production build. + + For more information on KUnit and unit tests in general please refer + to the KUnit documentation in Documentation/dev-tools/kunit/. + + This is intended to help people writing architecture-specific + optimized versions.
If unsure, say N. + config RESOURCE_KUNIT_TEST tristate "KUnit test for resource API" depends on KUNIT diff --git a/lib/Makefile b/lib/Makefile index f3a2a251471d07..511c2782770133 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -62,7 +62,7 @@ obj-$(CONFIG_TEST_BITOPS) += test_bitops.o CFLAGS_test_bitops.o += -Werror obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_SIPHASH) += test_siphash.o -obj-$(CONFIG_TEST_HASH) += test_hash.o +obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o CFLAGS_test_kasan.o += -fno-builtin diff --git a/lib/test_hash.c b/lib/test_hash.c index 032849a48da704..bb25fda34794b0 100644 --- a/lib/test_hash.c +++ b/lib/test_hash.c @@ -14,17 +14,15 @@ * and hash_64(). */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt "\n" - #include <linux/compiler.h> #include <linux/types.h> #include <linux/module.h> #include <linux/hash.h> #include <linux/stringhash.h> -#include <linux/printk.h> +#include <kunit/test.h> /* 32-bit XORSHIFT generator. Seed must not be zero. */ -static u32 __init __attribute_const__ +static u32 __attribute_const__ xorshift(u32 seed) { seed ^= seed << 13; @@ -34,7 +32,7 @@ xorshift(u32 seed) } /* Given a non-zero x, returns a non-zero byte. */ -static u8 __init __attribute_const__ +static u8 __attribute_const__ mod255(u32 x) { x = (x & 0xffff) + (x >> 16); /* 1 <= x <= 0x1fffe */ @@ -45,8 +43,7 @@ mod255(u32 x) } /* Fill the buffer with non-zero bytes.
*/ -static void __init -fill_buf(char *buf, size_t len, u32 seed) +static void fill_buf(char *buf, size_t len, u32 seed) { size_t i; @@ -71,40 +68,32 @@ struct test_hash_params { }; #ifdef HAVE_ARCH__HASH_32 -static bool __init -test_int__hash_32(struct test_hash_params *params) +static void +test_int__hash_32(struct kunit *test, struct test_hash_params *params) { params->hash_or[1][0] |= params->h2 = __hash_32_generic(params->h0); #if HAVE_ARCH__HASH_32 == 1 - if (params->h1 != params->h2) { - pr_err("__hash_32(%#x) = %#x != __hash_32_generic() = %#x", - params->h0, params->h1, params->h2); - return false; - } + KUNIT_EXPECT_EQ_MSG(test, params->h1, params->h2, + "__hash_32(%#x) = %#x != __hash_32_generic() = %#x", + params->h0, params->h1, params->h2); #endif - return true; } #endif #ifdef HAVE_ARCH_HASH_64 -static bool __init -test_int_hash_64(struct test_hash_params *params, u32 const *m, int *k) +static void +test_int_hash_64(struct kunit *test, struct test_hash_params *params, u32 const *m, int *k) { params->h2 = hash_64_generic(*params->h64, *k); #if HAVE_ARCH_HASH_64 == 1 - if (params->h1 != params->h2) { - pr_err("hash_64(%#llx, %d) = %#x != hash_64_generic() = %#x", - *params->h64, *k, params->h1, params->h2); - return false; - } + KUNIT_EXPECT_EQ_MSG(test, params->h1, params->h2, + "hash_64(%#llx, %d) = %#x != hash_64_generic() = %#x", + *params->h64, *k, params->h1, params->h2); #else - if (params->h2 > *m) { - pr_err("hash_64_generic(%#llx, %d) = %#x > %#x", - *params->h64, *k, params->h1, *m); - return false; - } + KUNIT_EXPECT_LE_MSG(test, params->h2, *m, + "hash_64_generic(%#llx, %d) = %#x > %#x", + *params->h64, *k, params->h2, *m); #endif - return true; } #endif @@ -117,8 +106,8 @@ test_int_hash_64(struct test_hash_params *params, u32 const *m, int *k) * inline, the code being tested is actually in the module, and you can * recompile and re-test the module without rebooting.
*/ -static bool __init -test_int_hash(unsigned long long h64, u32 hash_or[2][33]) +static void +test_int_hash(struct kunit *test, unsigned long long h64, u32 hash_or[2][33]) { int k; struct test_hash_params params = { &h64, (u32)h64, 0, 0, hash_or }; @@ -126,8 +115,7 @@ test_int_hash(unsigned long long h64, u32 hash_or[2][33]) /* Test __hash32 */ hash_or[0][0] |= params.h1 = __hash_32(params.h0); #ifdef HAVE_ARCH__HASH_32 - if (!test_int__hash_32(&params)) - return false; + test_int__hash_32(test, &params); #endif /* Test k = 1..32 bits */ @@ -136,29 +124,24 @@ test_int_hash(unsigned long long h64, u32 hash_or[2][33]) /* Test hash_32 */ hash_or[0][k] |= params.h1 = hash_32(params.h0, k); - if (params.h1 > m) { - pr_err("hash_32(%#x, %d) = %#x > %#x", params.h0, k, params.h1, m); - return false; - } + KUNIT_EXPECT_LE_MSG(test, params.h1, m, + "hash_32(%#x, %d) = %#x > %#x", + params.h0, k, params.h1, m); /* Test hash_64 */ hash_or[1][k] |= params.h1 = hash_64(h64, k); - if (params.h1 > m) { - pr_err("hash_64(%#llx, %d) = %#x > %#x", h64, k, params.h1, m); - return false; - } + KUNIT_EXPECT_LE_MSG(test, params.h1, m, + "hash_64(%#llx, %d) = %#x > %#x", + h64, k, params.h1, m); #ifdef HAVE_ARCH_HASH_64 - if (!test_int_hash_64(&params, &m, &k)) - return false; + test_int_hash_64(test, &params, &m, &k); #endif } - - return true; } #define SIZE 256 /* Run time is cubic in SIZE */ -static int __init test_string_or(void) +static void test_string_or(struct kunit *test) { char buf[SIZE+1]; u32 string_or = 0; @@ -178,20 +161,15 @@ static int __init test_string_or(void) } /* j */ /* The OR of all the hash values should cover all the bits */ - if (~string_or) { - pr_err("OR of all string hash results = %#x != %#x", - string_or, -1u); - return -EINVAL; - } - - return 0; + KUNIT_EXPECT_EQ_MSG(test, string_or, -1u, + "OR of all string hash results = %#x != %#x", + string_or, -1u); } -static int __init test_hash_or(void) +static void test_hash_or(struct kunit *test) { char buf[SIZE+1]; u32
hash_or[2][33] = { { 0, } }; - unsigned tests = 0; unsigned long long h64 = 0; int i, j; @@ -206,39 +184,27 @@ static int __init test_hash_or(void) u32 h0 = full_name_hash(buf+i, buf+i, j-i); /* Check that hashlen_string gets the length right */ - if (hashlen_len(hashlen) != j-i) { - pr_err("hashlen_string(%d..%d) returned length" - " %u, expected %d", - i, j, hashlen_len(hashlen), j-i); - return -EINVAL; - } + KUNIT_EXPECT_EQ_MSG(test, hashlen_len(hashlen), j-i, + "hashlen_string(%d..%d) returned length %u, expected %d", + i, j, hashlen_len(hashlen), j-i); /* Check that the hashes match */ - if (hashlen_hash(hashlen) != h0) { - pr_err("hashlen_string(%d..%d) = %08x != " - "full_name_hash() = %08x", - i, j, hashlen_hash(hashlen), h0); - return -EINVAL; - } + KUNIT_EXPECT_EQ_MSG(test, hashlen_hash(hashlen), h0, + "hashlen_string(%d..%d) = %08x != full_name_hash() = %08x", + i, j, hashlen_hash(hashlen), h0); h64 = h64 << 32 | h0; /* For use with hash_64 */ - if (!test_int_hash(h64, hash_or)) - return -EINVAL; - tests++; + test_int_hash(test, h64, hash_or); } /* i */ } /* j */ - if (~hash_or[0][0]) { - pr_err("OR of all __hash_32 results = %#x != %#x", - hash_or[0][0], -1u); - return -EINVAL; - } + KUNIT_EXPECT_EQ_MSG(test, hash_or[0][0], -1u, + "OR of all __hash_32 results = %#x != %#x", + hash_or[0][0], -1u); #ifdef HAVE_ARCH__HASH_32 #if HAVE_ARCH__HASH_32 != 1 /* Test is pointless if results match */ - if (~hash_or[1][0]) { - pr_err("OR of all __hash_32_generic results = %#x != %#x", - hash_or[1][0], -1u); - return -EINVAL; - } + KUNIT_EXPECT_EQ_MSG(test, hash_or[1][0], -1u, + "OR of all __hash_32_generic results = %#x != %#x", + hash_or[1][0], -1u); #endif #endif @@ -246,65 +212,27 @@ static int __init test_hash_or(void) for (i = 1; i <= 32; i++) { u32 const m = ((u32)2 << (i-1)) - 1; /* Low i bits set */ - if (hash_or[0][i] != m) { - pr_err("OR of all hash_32(%d) results = %#x " - "(%#x expected)", i, hash_or[0][i], m); - return -EINVAL; - } - if (hash_or[1][i] 
!= m) { - pr_err("OR of all hash_64(%d) results = %#x " - "(%#x expected)", i, hash_or[1][i], m); - return -EINVAL; - } + KUNIT_EXPECT_EQ_MSG(test, hash_or[0][i], m, + "OR of all hash_32(%d) results = %#x (%#x expected)", + i, hash_or[0][i], m); + KUNIT_EXPECT_EQ_MSG(test, hash_or[1][i], m, + "OR of all hash_64(%d) results = %#x (%#x expected)", + i, hash_or[1][i], m); } - - pr_notice("%u tests passed.", tests); - - return 0; } -static void __init notice_skipped_tests(void) -{ - /* Issue notices about skipped tests. */ -#ifdef HAVE_ARCH__HASH_32 -#if HAVE_ARCH__HASH_32 != 1 - pr_info("__hash_32() is arch-specific; not compared to generic."); -#endif -#else - pr_info("__hash_32() has no arch implementation to test."); -#endif -#ifdef HAVE_ARCH_HASH_64 -#if HAVE_ARCH_HASH_64 != 1 - pr_info("hash_64() is arch-specific; not compared to generic."); -#endif -#else - pr_info("hash_64() has no arch implementation to test."); -#endif -} - -static int __init -test_hash_init(void) -{ - int ret; - - ret = test_string_or(); - if (ret < 0) - return ret; - - ret = test_hash_or(); - if (ret < 0) - return ret; - - notice_skipped_tests(); +static struct kunit_case hash_test_cases[] __refdata = { + KUNIT_CASE(test_string_or), + KUNIT_CASE(test_hash_or), + {} +}; - return ret; -} +static struct kunit_suite hash_test_suite = { + .name = "hash", + .test_cases = hash_test_cases, +}; -static void __exit test_hash_exit(void) -{ -} -module_init(test_hash_init); /* Does everything */ -module_exit(test_hash_exit); /* Does nothing */ +kunit_test_suite(hash_test_suite); MODULE_LICENSE("GPL"); From 60c7801b121aa0e90d8aae7245859aec0ce2306f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:09:19 -0800 Subject: [PATCH 30/55] kunit: replace kernel.h with the necessary inclusions When kernel.h is used in the headers it adds a lot into dependency hell, especially when there are circular dependencies are involved. 
Replace kernel.h inclusion with the list of what is really being used. Link: https://lkml.kernel.org/r/20211213204441.56204-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Reviewed-by: Brendan Higgins Tested-by: Brendan Higgins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/kunit/assert.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/kunit/assert.h b/include/kunit/assert.h index ad889b539ab391..ccbc36c0b02f7a 100644 --- a/include/kunit/assert.h +++ b/include/kunit/assert.h @@ -10,7 +10,7 @@ #define _KUNIT_ASSERT_H #include -#include +#include struct kunit; struct string_stream; From 8e930a66993be0a5f9a97c7c1c76ef09db4ef8bb Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:09:22 -0800 Subject: [PATCH 31/55] uuid: discourage people from using UAPI header in new code Discourage people from using UAPI header in new code by adding a note. Link: https://lkml.kernel.org/r/20211216113552.81199-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/uuid.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index e5a7eecef7c339..32615dc5f0cfb9 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -1,4 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* DO NOT USE in new code! This is solely for MEI due to legacy reasons */ /* * UUID/GUID definition * From c7e4289cbe668c2743ac0fd623a2518dbc191dc0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 19 Jan 2022 18:09:25 -0800 Subject: [PATCH 32/55] uuid: remove licence boilerplate text from the header Remove licence boilerplate text from the UAPI header. 
Link: https://lkml.kernel.org/r/20211216113552.81199-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/uuid.h | 9 --------- 1 file changed, 9 deletions(-) diff --git a/include/uapi/linux/uuid.h b/include/uapi/linux/uuid.h index 32615dc5f0cfb9..c0f4bd9b040edf 100644 --- a/include/uapi/linux/uuid.h +++ b/include/uapi/linux/uuid.h @@ -5,15 +5,6 @@ * * Copyright (C) 2010, Intel Corp. * Huang Ying - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation; - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. */ #ifndef _UAPI_LINUX_UUID_H_ From e073e5ef90298d2d6e5e7f04b545a0815e92110c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Wed, 19 Jan 2022 18:09:28 -0800 Subject: [PATCH 33/55] lib/test_meminit: destroy cache in kmem_cache_alloc_bulk() test Make do_kmem_cache_size_bulk() destroy the cache it creates. 
Link: https://lkml.kernel.org/r/aced20a94bf04159a139f0846e41d38a1537debb.1640018297.git.andreyknvl@google.com Fixes: 03a9349ac0e0 ("lib/test_meminit: add a kmem_cache_alloc_bulk() test") Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_meminit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/test_meminit.c b/lib/test_meminit.c index e4f706a404b3a1..3ca717f1139774 100644 --- a/lib/test_meminit.c +++ b/lib/test_meminit.c @@ -337,6 +337,7 @@ static int __init do_kmem_cache_size_bulk(int size, int *total_failures) if (num) kmem_cache_free_bulk(c, num, objects); } + kmem_cache_destroy(c); *total_failures += fail; return 1; } From 36f8b348a94c12e30ca5c81eb31c9a445117ef7b Mon Sep 17 00:00:00 2001 From: Jerome Forissier Date: Wed, 19 Jan 2022 18:09:31 -0800 Subject: [PATCH 34/55] checkpatch: relax regexp for COMMIT_LOG_LONG_LINE One exception to the COMMIT_LOG_LONG_LINE rule is a file path followed by ':'. That is typically some sort of diagnostic message from a compiler or a build tool, in which case we don't want to wrap the lines but keep the message unmodified. The regular expression used to match this pattern currently doesn't accept absolute paths or + characters. This can result in false positives as in the following (out-of-tree) example: ... /home/jerome/work/optee_repo_qemu/build/../toolchains/aarch32/bin/arm-linux-gnueabihf-ld.bfd: /home/jerome/work/toolchains-gcc10.2/aarch32/bin/../lib/gcc/arm-none-linux-gnueabihf/10.2.1/../../../../arm-none-linux-gnueabihf/lib/libstdc++.a(eh_alloc.o): in function `__cxa_allocate_exception': /tmp/dgboter/bbs/build03--cen7x86_64/buildbot/cen7x86_64--arm-none-linux-gnueabihf/build/src/gcc/libstdc++-v3/libsupc++/eh_alloc.cc:284: undefined reference to `malloc' ... Update the regular expression to match the above paths.
Link: https://lkml.kernel.org/r/20210923143842.2837983-1-jerome@forissier.org Signed-off-by: Jerome Forissier Acked-by: Joe Perches Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 1784921c645dad..49d185a2698611 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3172,7 +3172,7 @@ sub process { length($line) > 75 && !($line =~ /^\s*[a-zA-Z0-9_\/\.]+\s+\|\s+\d+/ || # file delta changes - $line =~ /^\s*(?:[\w\.\-]+\/)++[\w\.\-]+:/ || + $line =~ /^\s*(?:[\w\.\-\+]*\/)++[\w\.\-\+]+:/ || # filename then : $line =~ /^\s*(?:Fixes:|Link:|$signature_tags)/i || # A Fixes: or Link: line or signature tag line From b8709bce9089996528f594cd1f71f1a085761aad Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 19 Jan 2022 18:09:34 -0800 Subject: [PATCH 35/55] checkpatch: improve Kconfig help test The Kconfig help test erroneously counts patch context lines as part of the help text. Fix that and improve the message block output. 
Link: https://lkml.kernel.org/r/06c0cdc157ae1502e8e9eb3624b9ea995cf11e7a.camel@perches.com Signed-off-by: Joe Perches Tested-by: Randy Dunlap Acked-by: Randy Dunlap Cc: Andy Whitcroft Cc: Dwaipayan Ray Cc: Lukas Bulwahn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 52 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 49d185a2698611..b01c36a15d9dd0 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3479,47 +3479,47 @@ sub process { # Kconfig supports named choices), so use a word boundary # (\b) rather than a whitespace character (\s) $line =~ /^\+\s*(?:config|menuconfig|choice)\b/) { - my $length = 0; - my $cnt = $realcnt; - my $ln = $linenr + 1; - my $f; - my $is_start = 0; - my $is_end = 0; - for (; $cnt > 0 && defined $lines[$ln - 1]; $ln++) { - $f = $lines[$ln - 1]; - $cnt-- if ($lines[$ln - 1] !~ /^-/); - $is_end = $lines[$ln - 1] =~ /^\+/; + my $ln = $linenr; + my $needs_help = 0; + my $has_help = 0; + my $help_length = 0; + while (defined $lines[$ln]) { + my $f = $lines[$ln++]; next if ($f =~ /^-/); - last if (!$file && $f =~ /^\@\@/); + last if ($f !~ /^[\+ ]/); # !patch context - if ($lines[$ln - 1] =~ /^\+\s*(?:bool|tristate|prompt)\s*["']/) { - $is_start = 1; - } elsif ($lines[$ln - 1] =~ /^\+\s*(?:---)?help(?:---)?$/) { - $length = -1; + if ($f =~ /^\+\s*(?:bool|tristate|prompt)\s*["']/) { + $needs_help = 1; + next; + } + if ($f =~ /^\+\s*help\s*$/) { + $has_help = 1; + next; } - $f =~ s/^.//; - $f =~ s/#.*//; - $f =~ s/^\s+//; - next if ($f =~ /^$/); + $f =~ s/^.//; # strip patch context [+ ] + $f =~ s/#.*//; # strip # directives + $f =~ s/^\s+//; # strip leading blanks + next if ($f =~ /^$/); # skip blank lines + # At the end of this Kconfig block: # This only checks context lines in the patch # and so hopefully shouldn't trigger false # positives, even though some of these are # common 
words in help texts - if ($f =~ /^\s*(?:config|menuconfig|choice|endchoice| - if|endif|menu|endmenu|source)\b/x) { - $is_end = 1; + if ($f =~ /^(?:config|menuconfig|choice|endchoice| + if|endif|menu|endmenu|source)\b/x) { last; } - $length++; + $help_length++ if ($has_help); } - if ($is_start && $is_end && $length < $min_conf_desc_length) { + if ($needs_help && + $help_length < $min_conf_desc_length) { + my $stat_real = get_stat_real($linenr, $ln - 1); WARN("CONFIG_DESCRIPTION", - "please write a paragraph that describes the config symbol fully\n" . $herecurr); + "please write a help paragraph that fully describes the config symbol\n" . "$here\n$stat_real\n"); } - #print "is_start<$is_start> is_end<$is_end> length<$length>\n"; } # check MAINTAINERS entries From c55cdc5cd6663ff616c94ecf7204e92c7049bb5e Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Wed, 19 Jan 2022 18:09:37 -0800 Subject: [PATCH 36/55] const_structs.checkpatch: add frequently used ops structs Add commonly used structs (>50 instances) which are always or almost always const. 
Link: https://lkml.kernel.org/r/20211127101134.33101-1-rikard.falkeborn@gmail.com Signed-off-by: Rikard Falkeborn Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/const_structs.checkpatch | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/scripts/const_structs.checkpatch b/scripts/const_structs.checkpatch index 3980985205a060..1eeb7b42c5b9fa 100644 --- a/scripts/const_structs.checkpatch +++ b/scripts/const_structs.checkpatch @@ -12,19 +12,27 @@ driver_info drm_connector_funcs drm_encoder_funcs drm_encoder_helper_funcs +dvb_frontend_ops +dvb_tuner_ops ethtool_ops extent_io_ops +fb_ops file_lock_operations file_operations hv_ops +hwmon_ops +ib_device_ops ide_dma_ops ide_port_ops +ieee80211_ops +iio_buffer_setup_ops inode_operations intel_dvo_dev_ops irq_domain_ops item_operations iwl_cfg iwl_ops +kernel_param_ops kgdb_arch kgdb_io kset_uevent_ops @@ -32,25 +40,33 @@ lock_manager_operations machine_desc microcode_ops mlxsw_reg_info +mtd_ooblayout_ops mtrr_ops +nand_controller_ops neigh_ops net_device_ops +nft_expr_ops nlmsvc_binding nvkm_device_chip of_device_id pci_raw_ops phy_ops +pinconf_ops pinctrl_ops pinmux_ops pipe_buf_operations platform_hibernation_ops platform_suspend_ops +proc_ops proto_ops +pwm_ops regmap_access_table regulator_ops +reset_control_ops rpc_pipe_ops rtc_class_ops sd_desc +sdhci_ops seq_operations sirfsoc_padmux snd_ac97_build_ops @@ -67,6 +83,13 @@ uart_ops usb_mon_operations v4l2_ctrl_ops v4l2_ioctl_ops +v4l2_subdev_core_ops +v4l2_subdev_internal_ops +v4l2_subdev_ops +v4l2_subdev_pad_ops +v4l2_subdev_video_ops +vb2_ops vm_operations_struct wacom_features +watchdog_ops wd_ops From 9630f0d60fec5fbcaa4435a66f75df1dc9704b66 Mon Sep 17 00:00:00 2001 From: "H.J. 
Lu" Date: Wed, 19 Jan 2022 18:09:40 -0800 Subject: [PATCH 37/55] fs/binfmt_elf: use PT_LOAD p_align values for static PIE Extend commit ce81bb256a22 ("fs/binfmt_elf: use PT_LOAD p_align values for suitable start address") which fixed PIE binaries built with -Wl,-z,max-page-size=0x200000, to cover static PIE binaries. This fixes: https://bugzilla.kernel.org/show_bug.cgi?id=215275 Tested by verifying static PIE binaries with -Wl,-z,max-page-size=0x200000 loading. Link: https://lkml.kernel.org/r/20211209174052.370537-1-hjl.tools@gmail.com Signed-off-by: H.J. Lu Cc: Chris Kennelly Cc: Al Viro Cc: Alexey Dobriyan Cc: Song Liu Cc: David Rientjes Cc: Ian Rogers Cc: Hugh Dickins Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Fangrui Song Cc: Nick Desaulniers Cc: Kirill A. Shutemov Cc: Mike Kravetz Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_elf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b9a33cc34d6bf6..605017eb9349e5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1116,11 +1116,11 @@ static int load_elf_binary(struct linux_binprm *bprm) * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ - if (interpreter) { + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + if (alignment > ELF_MIN_ALIGN) { load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; From e1ce8a97befa98566f49acb99c79cc233cf3a703 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 19 Jan 2022 18:09:44 -0800 Subject: [PATCH 38/55] nilfs2: remove redundant pointer sbufs Pointer sbufs is being assigned a value but it's not being used later on. The pointer is redundant and can be removed. 
Cleans up scan-build static analysis warning: fs/nilfs2/page.c:203:8: warning: Although the value stored to 'sbufs' is used in the enclosing expression, the value is never actually read from 'sbufs' [deadcode.DeadStores] sbh = sbufs = page_buffers(src); Link: https://lkml.kernel.org/r/20211211180955.550380-1-colin.i.king@gmail.com Link: https://lkml.kernel.org/r/1640712476-15136-1-git-send-email-konishi.ryusuke@gmail.com Signed-off-by: Colin Ian King Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/page.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index bc3e2cd4117ffa..063dd16d75b590 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -195,12 +195,12 @@ void nilfs_page_bug(struct page *page) */ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) { - struct buffer_head *dbh, *dbufs, *sbh, *sbufs; + struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; BUG_ON(PageWriteback(dst)); - sbh = sbufs = page_buffers(src); + sbh = page_buffers(src); if (!page_has_buffers(dst)) create_empty_buffers(dst, sbh->b_size, 0); From e35fa567a082a7547a4ec21e50a27eecf38961aa Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jan 2022 18:09:47 -0800 Subject: [PATCH 39/55] hfsplus: use struct_group_attr() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memset(), avoid intentionally writing across neighboring fields. Add struct_group() to mark the "info" region (containing struct DInfo and struct DXInfo structs) in struct hfsplus_cat_folder and struct hfsplus_cat_file that are written into directly, so the compiler can correctly reason about the expected size of the writes. "pahole" shows no size nor member offset changes to struct hfsplus_cat_folder nor struct hfsplus_cat_file. "objdump -d" shows no object code changes. 
Link: https://lkml.kernel.org/r/20211119192851.1046717-1-keescook@chromium.org Signed-off-by: Kees Cook Acked-by: Christian Brauner Cc: Zhen Lei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hfsplus/hfsplus_raw.h | 12 ++++++++---- fs/hfsplus/xattr.c | 4 ++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h index 456e87aec7fd7e..68b4240c61916d 100644 --- a/fs/hfsplus/hfsplus_raw.h +++ b/fs/hfsplus/hfsplus_raw.h @@ -260,8 +260,10 @@ struct hfsplus_cat_folder { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct DInfo user_info; - struct DXInfo finder_info; + struct_group_attr(info, __packed, + struct DInfo user_info; + struct DXInfo finder_info; + ); __be32 text_encoding; __be32 subfolders; /* Subfolder count in HFSX. Reserved in HFS+. */ } __packed; @@ -294,8 +296,10 @@ struct hfsplus_cat_file { __be32 access_date; __be32 backup_date; struct hfsplus_perm permissions; - struct FInfo user_info; - struct FXInfo finder_info; + struct_group_attr(info, __packed, + struct FInfo user_info; + struct FXInfo finder_info; + ); __be32 text_encoding; u32 reserved2; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e2855ceefd3943..49891b12c41566 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -296,7 +296,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, sizeof(hfsplus_cat_entry)); if (be16_to_cpu(entry.type) == HFSPLUS_FOLDER) { if (size == folder_finderinfo_len) { - memcpy(&entry.folder.user_info, value, + memcpy(&entry.folder.info, value, folder_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, cat_fd.entryoffset, @@ -309,7 +309,7 @@ int __hfsplus_setxattr(struct inode *inode, const char *name, } } else if (be16_to_cpu(entry.type) == HFSPLUS_FILE) { if (size == file_finderinfo_len) { - memcpy(&entry.file.user_info, value, + memcpy(&entry.file.info, value, file_finderinfo_len); hfs_bnode_write(cat_fd.bnode, &entry, 
cat_fd.entryoffset, From 9bb56d592532526f638468ed0781b7ab7925a1ec Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 19 Jan 2022 18:09:50 -0800 Subject: [PATCH 40/55] FAT: use io_schedule_timeout() instead of congestion_wait() congestion_wait() in this context is just a sleep - block devices do not support congestion signalling any more. The goal for this wait, which was introduced in commit ae78bf9c4f5f ("[PATCH] add -o flush for fat") is to wait for any recently written data to get to storage. We currently have no direct mechanism to do this, so a simple wait that behaves identically to the current congestion_wait() is the best we can do. This is a step towards removing congestion_wait() Link: https://lkml.kernel.org/r/163936544519.22433.13400436295732112065@noble.neil.brown.name Signed-off-by: NeilBrown Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fat/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/fat/file.c b/fs/fat/file.c index 13855ba49cd976..a5a309fcc7faf6 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -175,9 +175,10 @@ long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static int fat_file_release(struct inode *inode, struct file *filp) { if ((filp->f_mode & FMODE_WRITE) && - MSDOS_SB(inode->i_sb)->options.flush) { + MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); - congestion_wait(BLK_RW_ASYNC, HZ/10); + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(HZ/10); } return 0; } From 25d2e88632c9069cb21e23340e14cd19b8a0b1df Mon Sep 17 00:00:00 2001 From: Minghao Chi Date: Wed, 19 Jan 2022 18:09:53 -0800 Subject: [PATCH 41/55] fs/adfs: remove unneeded variable make code cleaner Return value directly instead of taking this in a variable. 
Link: https://lkml.kernel.org/r/20211210023211.424609-1-chi.minghao@zte.com.cn Signed-off-by: Minghao Chi Reported-by: Zeal Robot Cc: Christian Brauner Cc: Jan Kara Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/adfs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index adbb3a1edcbf56..5156821bfe6a56 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -355,7 +355,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; - int ret; obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; @@ -365,6 +364,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); - return ret; + return adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); } From 23b36fec7e14f8cf1c17e832e53dd4761e0dfe83 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 19 Jan 2022 18:09:56 -0800 Subject: [PATCH 42/55] panic: use error_report_end tracepoint on warnings Introduce the error detector "warning" to the error_report event and use the error_report_end tracepoint at the end of a warning report. This allows in-kernel tests but also userspace to more easily determine if a warning occurred without polling kernel logs. 
[akpm@linux-foundation.org: add comma to enum list, per Andy] Link: https://lkml.kernel.org/r/20211115085630.1756817-1-elver@google.com Signed-off-by: Marco Elver Cc: Steven Rostedt Cc: Ingo Molnar Cc: Alexander Potapenko Cc: Petr Mladek Cc: Luis Chamberlain Cc: Wei Liu Cc: Mike Rapoport Cc: Arnd Bergmann Cc: John Ogness Cc: Andy Shevchenko Cc: Alexander Popov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/error_report.h | 8 +++++--- kernel/panic.c | 2 ++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/trace/events/error_report.h b/include/trace/events/error_report.h index 96f64bf218b256..a1922a800e6fa9 100644 --- a/include/trace/events/error_report.h +++ b/include/trace/events/error_report.h @@ -17,14 +17,16 @@ enum error_detector { ERROR_DETECTOR_KFENCE, - ERROR_DETECTOR_KASAN + ERROR_DETECTOR_KASAN, + ERROR_DETECTOR_WARN, }; #endif /* __ERROR_REPORT_DECLARE_TRACE_ENUMS_ONCE_ONLY */ -#define error_detector_list \ +#define error_detector_list \ EM(ERROR_DETECTOR_KFENCE, "kfence") \ - EMe(ERROR_DETECTOR_KASAN, "kasan") + EM(ERROR_DETECTOR_KASAN, "kasan") \ + EMe(ERROR_DETECTOR_WARN, "warning") /* Always end the list with an EMe. */ #undef EM diff --git a/kernel/panic.c b/kernel/panic.c index cefd7d82366fbc..8e299cae1615ed 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #define PANIC_TIMER_STEP 100 @@ -609,6 +610,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, print_irqtrace_events(current); print_oops_end_marker(); + trace_error_report_end(ERROR_DETECTOR_WARN, (unsigned long)caller); /* Just a warning, don't kill lockdep. 
*/ add_taint(taint, LOCKDEP_STILL_OK); From e83a4472bf9f556d01984048e398e64246c4dd6f Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Wed, 19 Jan 2022 18:09:59 -0800 Subject: [PATCH 43/55] panic: remove oops_id The oops id has been added as part of the end of trace marker for the kerneloops.org project. The id is used to automatically identify duplicate submissions of the same report. Identical looking reports with a different id can be considered as the same oops having occurred again. The early initialisation of the oops_id can create a warning if the random core is not yet fully initialized. On PREEMPT_RT it is problematic if the id is initialized on demand from non preemptible context. The kernel oops project has not been available since 2017. Remove the oops_id and use 0 in the output in case parsers rely on it. Link: https://bugs.debian.org/953172 Link: https://lkml.kernel.org/r/Ybdi16aP2NEugWHq@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Cc: Arjan van de Ven Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/kernel/panic.c b/kernel/panic.c index 8e299cae1615ed..55b50e052ec3a3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -534,26 +534,9 @@ void oops_enter(void) trigger_all_cpu_backtrace(); } -/* - * 64-bit random ID for oopses: - */ -static u64 oops_id; - -static int init_oops_id(void) -{ - if (!oops_id) - get_random_bytes(&oops_id, sizeof(oops_id)); - else - oops_id++; - - return 0; -} -late_initcall(init_oops_id); - static void print_oops_end_marker(void) { - init_oops_id(); - pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_warn("---[ end trace %016llx ]---\n", 0ULL); } /* From a3d5dc908a5f572ce3e31fe83fd2459a1c3c5422 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:02 -0800 Subject: [PATCH 44/55] delayacct: support swapin
delay accounting for swapping without blkio Currently delayacct accounts swapin delay only for swapping that cause blkio. If we use zram for swapping, tools/accounting/getdelays can't get any SWAP delay. It's useful to get zram swapin delay information, for example to adjust compress algorithm or /proc/sys/vm/swappiness. Reference to PSI, it accounts any kind of swapping by doing its work in swap_readpage(), no matter whether swapping causes blkio. Let delayacct do the similar work. Link: https://lkml.kernel.org/r/20211112083813.8559-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 44 +++++++++++++++++++-------------------- kernel/delayacct.c | 33 ++++++++++++++++------------- mm/memory.c | 4 ---- mm/page_io.c | 3 +++ 4 files changed, 43 insertions(+), 41 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index af7e6eb5028373..b96d68f310a2b7 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -9,14 +9,6 @@ #include -/* - * Per-task flags relevant to delay accounting - * maintained privately to avoid exhausting similar flags in sched.h:PF_* - * Used to set current->delays->flags - */ -#define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ -#define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ - #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; @@ -37,13 +29,13 @@ struct task_delay_info { * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. 
*/ - u64 blkio_start; /* Shared by blkio, swapin */ + u64 blkio_start; u64 blkio_delay; /* wait for sync block io completion */ - u64 swapin_delay; /* wait for swapin block io completion */ + u64 swapin_start; + u64 swapin_delay; /* wait for swapin */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ - u32 swapin_count; /* total count of the number of swapin block */ - /* io operations performed */ + u32 swapin_count; /* total count of swapin */ u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ @@ -79,14 +71,8 @@ extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); - -static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) -{ - if (p->delays) - return (p->delays->flags & DELAYACCT_PF_BLKIO); - else - return 0; -} +extern void __delayacct_swapin_start(void); +extern void __delayacct_swapin_end(void); static inline void delayacct_set_flag(struct task_struct *p, int flag) { @@ -123,7 +109,6 @@ static inline void delayacct_blkio_start(void) if (!static_branch_unlikely(&delayacct_key)) return; - delayacct_set_flag(current, DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } @@ -135,7 +120,6 @@ static inline void delayacct_blkio_end(struct task_struct *p) if (p->delays) __delayacct_blkio_end(p); - delayacct_clear_flag(p, DELAYACCT_PF_BLKIO); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) @@ -169,6 +153,18 @@ static inline void delayacct_thrashing_end(void) __delayacct_thrashing_end(); } +static inline void delayacct_swapin_start(void) +{ + if (current->delays) + __delayacct_swapin_start(); +} + +static inline void delayacct_swapin_end(void) +{ + if (current->delays) + __delayacct_swapin_end(); +} + #else static inline void delayacct_set_flag(struct task_struct *p, int flag) {} @@ -199,6 +195,10 @@ static inline void 
delayacct_thrashing_start(void) {} static inline void delayacct_thrashing_end(void) {} +static inline void delayacct_swapin_start(void) +{} +static inline void delayacct_swapin_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 51530d5b15a8a7..97699848c1f0c4 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -100,19 +100,10 @@ void __delayacct_blkio_start(void) */ void __delayacct_blkio_end(struct task_struct *p) { - struct task_delay_info *delays = p->delays; - u64 *total; - u32 *count; - - if (p->delays->flags & DELAYACCT_PF_SWAPIN) { - total = &delays->swapin_delay; - count = &delays->swapin_count; - } else { - total = &delays->blkio_delay; - count = &delays->blkio_count; - } - - delayacct_end(&delays->lock, &delays->blkio_start, total, count); + delayacct_end(&p->delays->lock, + &p->delays->blkio_start, + &p->delays->blkio_delay, + &p->delays->blkio_count); } int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) @@ -179,8 +170,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk) unsigned long flags; raw_spin_lock_irqsave(&tsk->delays->lock, flags); - ret = nsec_to_clock_t(tsk->delays->blkio_delay + - tsk->delays->swapin_delay); + ret = nsec_to_clock_t(tsk->delays->blkio_delay); raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return ret; } @@ -210,3 +200,16 @@ void __delayacct_thrashing_end(void) &current->delays->thrashing_delay, &current->delays->thrashing_count); } + +void __delayacct_swapin_start(void) +{ + current->delays->swapin_start = local_clock(); +} + +void __delayacct_swapin_end(void) +{ + delayacct_end(&current->delays->lock, + &current->delays->swapin_start, + &current->delays->swapin_delay, + &current->delays->swapin_count); +} diff --git a/mm/memory.c b/mm/memory.c index 8f1de811a1dcb5..ced3274c3deb99 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3507,7 +3507,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - delayacct_set_flag(current, 
DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; @@ -3555,7 +3554,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto unlock; } @@ -3569,13 +3567,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); - delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; diff --git a/mm/page_io.c b/mm/page_io.c index 9725c7e1eeea13..0bf8e40f4e573b 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -25,6 +25,7 @@ #include #include #include +#include void end_swap_bio_write(struct bio *bio) { @@ -370,6 +371,7 @@ int swap_readpage(struct page *page, bool synchronous) * significant part of overall IO time. */ psi_memstall_enter(&pflags); + delayacct_swapin_start(); if (frontswap_load(page) == 0) { SetPageUptodate(page); @@ -432,6 +434,7 @@ int swap_readpage(struct page *page, bool synchronous) out: psi_memstall_leave(&pflags); + delayacct_swapin_end(); return ret; } From 82065b7266899fbdce4c7394d7dd02688161f0cf Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:06 -0800 Subject: [PATCH 45/55] delayacct: fix incomplete disable operation when switch enable to disable When a task is created after delayacct is enabled, the kernel performs all kinds of delay accounting for that task. The problem is that if the user disables delayacct by setting /proc/sys/kernel/task_delayacct to zero, only blkio delay accounting is disabled. Now disable all kinds of delay accounting when /proc/sys/kernel/task_delayacct is set to zero. 
Link: https://lkml.kernel.org/r/20211123140342.32962-1-ran.xiaokai@zte.com.cn Signed-off-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index b96d68f310a2b7..c675cfb6437e29 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -131,36 +131,54 @@ static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) static inline void delayacct_freepages_start(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_freepages_start(); } static inline void delayacct_freepages_end(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_freepages_end(); } static inline void delayacct_thrashing_start(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_thrashing_start(); } static inline void delayacct_thrashing_end(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_thrashing_end(); } static inline void delayacct_swapin_start(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_swapin_start(); } static inline void delayacct_swapin_end(void) { + if (!static_branch_unlikely(&delayacct_key)) + return; + if (current->delays) __delayacct_swapin_end(); } From 1193829da1a6728249cd02577a020bd64fd9c160 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 19 Jan 2022 18:10:09 -0800 Subject: [PATCH 46/55] delayacct: cleanup flags in struct task_delay_info and functions use it The flags field in struct task_delay_info is used to distinguish between swapin and blkio delay accounting. 
But after patch "delayacct: support swapin delay accounting for swapping without blkio", there is no need to do that since swapin and blkio delay accounting use their own functions. Link: https://lkml.kernel.org/r/20211124065958.36703-1-yang.yang29@zte.com.cn Signed-off-by: Yang Yang Cc: Balbir Singh Cc: Ingo Molnar Cc: Johannes Weiner Cc: Peter Zijlstra Cc: Zeal Robot Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index c675cfb6437e29..435c3654a0ff07 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -12,7 +12,6 @@ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; - unsigned int flags; /* Private per-task flags */ /* For each stat XXX, add following, aligned appropriately * @@ -74,18 +73,6 @@ extern void __delayacct_thrashing_end(void); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); -static inline void delayacct_set_flag(struct task_struct *p, int flag) -{ - if (p->delays) - p->delays->flags |= flag; -} - -static inline void delayacct_clear_flag(struct task_struct *p, int flag) -{ - if (p->delays) - p->delays->flags &= ~flag; -} - static inline void delayacct_tsk_init(struct task_struct *tsk) { /* reinitialize in case parent's non-null pointer was dup'ed*/ @@ -184,10 +171,6 @@ static inline void delayacct_swapin_end(void) } #else -static inline void delayacct_set_flag(struct task_struct *p, int flag) -{} -static inline void delayacct_clear_flag(struct task_struct *p, int flag) -{} static inline void delayacct_init(void) {} static inline void delayacct_tsk_init(struct task_struct *tsk) From ec710aa8b2385e6a2239f79120fbf9b78400865b Mon Sep 17 00:00:00 2001 From: wangyong Date: Wed, 19 Jan 2022 18:10:12 -0800 Subject: [PATCH 47/55] Documentation/accounting/delay-accounting.rst: add thrashing page cache and direct 
compact Add thrashing page cache and direct compact related descriptions and update the usage of getdelays userspace utility. The documentation is updated to reflect the changes made by the following patches: https://lore.kernel.org/all/20190312102002.31737-4-jinpuwang@gmail.com/ https://lore.kernel.org/all/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn/ Link: https://lkml.kernel.org/r/1639583021-92977-1-git-send-email-wang.yong12@zte.com.cn Signed-off-by: wangyong Reviewed-by: Yang Yang Reported-by: Zeal Robot Cc: Balbir Singh Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/accounting/delay-accounting.rst | 55 +++++++++---------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/Documentation/accounting/delay-accounting.rst b/Documentation/accounting/delay-accounting.rst index 1b8b46deeb299a..197fe319cbec38 100644 --- a/Documentation/accounting/delay-accounting.rst +++ b/Documentation/accounting/delay-accounting.rst @@ -13,6 +13,8 @@ a) waiting for a CPU (while being runnable) b) completion of synchronous block I/O initiated by the task c) swapping in pages d) memory reclaim +e) thrashing page cache +f) direct compact and makes these statistics available to userspace through the taskstats interface. @@ -41,11 +43,12 @@ generic data structure to userspace corresponding to per-pid and per-tgid statistics. The delay accounting functionality populates specific fields of this structure. See - include/linux/taskstats.h + include/uapi/linux/taskstats.h for a description of the fields pertaining to delay accounting. It will generally be in the form of counters returning the cumulative -delay seen for cpu, sync block I/O, swapin, memory reclaim etc. +delay seen for cpu, sync block I/O, swapin, memory reclaim, thrash page +cache, direct compact etc. Taking the difference of two successive readings of a given counter (say cpu_delay_total) for a task will give the delay @@ -88,41 +91,37 @@ seen. 
General format of the getdelays command:: - getdelays [-t tgid] [-p pid] [-c cmd...] - + getdelays [-dilv] [-t tgid] [-p pid] Get delays, since system boot, for pid 10:: - # ./getdelays -p 10 + # ./getdelays -d -p 10 (output similar to next case) Get sum of delays, since system boot, for all pids with tgid 5:: - # ./getdelays -t 5 - - - CPU count real total virtual total delay total - 7876 92005750 100000000 24001500 - IO count delay total - 0 0 - SWAP count delay total - 0 0 - RECLAIM count delay total - 0 0 + # ./getdelays -d -t 5 + print delayacct stats ON + TGID 5 -Get delays seen in executing a given simple command:: - # ./getdelays -c ls / + CPU count real total virtual total delay total delay average + 8 7000000 6872122 3382277 0.423ms + IO count delay total delay average + 0 0 0ms + SWAP count delay total delay average + 0 0 0ms + RECLAIM count delay total delay average + 0 0 0ms + THRASHING count delay total delay average + 0 0 0ms + COMPACT count delay total delay average + 0 0 0ms - bin data1 data3 data5 dev home media opt root srv sys usr - boot data2 data4 data6 etc lib mnt proc sbin subdomain tmp var +Get IO accounting for pid 1, it works only with -p:: + # ./getdelays -i -p 1 + printing IO accounting + linuxrc: read=65536, write=0, cancelled_write=0 - CPU count real total virtual total delay total - 6 4000250 4000000 0 - IO count delay total - 0 0 - SWAP count delay total - 0 0 - RECLAIM count delay total - 0 0 +The above command can be used with -v to get more debug information. From 5bf18281534451bf1ad56a45a3085cd7ad46860d Mon Sep 17 00:00:00 2001 From: wangyong Date: Wed, 19 Jan 2022 18:10:15 -0800 Subject: [PATCH 48/55] delayacct: track delays from memory compact Delay accounting does not track the delay of memory compact. When there is not enough free memory, tasks can spend a significant amount of their time waiting for compact. To get the impact of tasks in direct memory compact, measure the delay when allocating memory through memory compact. 
Also update tools/accounting/getdelays.c: / # ./getdelays_next -di -p 304 print delayacct stats ON printing IO accounting PID 304 CPU count real total virtual total delay total delay average 277 780000000 849039485 18877296 0.068ms IO count delay total delay average 0 0 0ms SWAP count delay total delay average 0 0 0ms RECLAIM count delay total delay average 5 11088812685 2217ms THRASHING count delay total delay average 0 0 0ms COMPACT count delay total delay average 3 72758 0ms watch: read=0, write=0, cancelled_write=0 Link: https://lkml.kernel.org/r/1638619795-71451-1-git-send-email-wang.yong12@zte.com.cn Signed-off-by: wangyong Reviewed-by: Jiang Xuexin Reviewed-by: Zhang Wenya Reviewed-by: Yang Yang Reviewed-by: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/delayacct.h | 28 ++++++++++++++++++++++++++++ include/uapi/linux/taskstats.h | 6 +++++- kernel/delayacct.c | 16 ++++++++++++++++ mm/page_alloc.c | 3 +++ tools/accounting/getdelays.c | 8 +++++++- 5 files changed, 59 insertions(+), 2 deletions(-) diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h index 435c3654a0ff07..3e03d010bd2e3e 100644 --- a/include/linux/delayacct.h +++ b/include/linux/delayacct.h @@ -42,8 +42,12 @@ struct task_delay_info { u64 thrashing_start; u64 thrashing_delay; /* wait for thrashing page */ + u64 compact_start; + u64 compact_delay; /* wait for memory compact */ + u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ + u32 compact_count; /* total count of memory compact */ }; #endif @@ -72,6 +76,8 @@ extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); extern void __delayacct_swapin_start(void); extern void __delayacct_swapin_end(void); +extern void __delayacct_compact_start(void); +extern void __delayacct_compact_end(void); static inline void delayacct_tsk_init(struct task_struct *tsk) { @@ -170,6 +176,24 @@ static inline void 
delayacct_swapin_end(void) __delayacct_swapin_end(); } +static inline void delayacct_compact_start(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_start(); +} + +static inline void delayacct_compact_end(void) +{ + if (!static_branch_unlikely(&delayacct_key)) + return; + + if (current->delays) + __delayacct_compact_end(); +} + #else static inline void delayacct_init(void) {} @@ -200,6 +224,10 @@ static inline void delayacct_swapin_start(void) {} static inline void delayacct_swapin_end(void) {} +static inline void delayacct_compact_start(void) +{} +static inline void delayacct_compact_end(void) +{} #endif /* CONFIG_TASK_DELAY_ACCT */ diff --git a/include/uapi/linux/taskstats.h b/include/uapi/linux/taskstats.h index ccbd0870932173..12327d32378f5f 100644 --- a/include/uapi/linux/taskstats.h +++ b/include/uapi/linux/taskstats.h @@ -34,7 +34,7 @@ */ -#define TASKSTATS_VERSION 10 +#define TASKSTATS_VERSION 11 #define TS_COMM_LEN 32 /* should be >= TASK_COMM_LEN * in linux/sched.h */ @@ -172,6 +172,10 @@ struct taskstats { /* v10: 64-bit btime to avoid overflow */ __u64 ac_btime64; /* 64-bit begin time */ + + /* Delay waiting for memory compact */ + __u64 compact_count; + __u64 compact_delay_total; }; diff --git a/kernel/delayacct.c b/kernel/delayacct.c index 97699848c1f0c4..c5e8cea9e05ff2 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -155,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp; tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay; d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp; + tmp = d->compact_delay_total + tsk->delays->compact_delay; + d->compact_delay_total = (tmp < d->compact_delay_total) ? 
0 : tmp; d->blkio_count += tsk->delays->blkio_count; d->swapin_count += tsk->delays->swapin_count; d->freepages_count += tsk->delays->freepages_count; d->thrashing_count += tsk->delays->thrashing_count; + d->compact_count += tsk->delays->compact_count; raw_spin_unlock_irqrestore(&tsk->delays->lock, flags); return 0; @@ -213,3 +216,16 @@ void __delayacct_swapin_end(void) &current->delays->swapin_delay, &current->delays->swapin_count); } + +void __delayacct_compact_start(void) +{ + current->delays->compact_start = local_clock(); +} + +void __delayacct_compact_end(void) +{ + delayacct_end(&current->delays->lock, + &current->delays->compact_start, + &current->delays->compact_delay, + &current->delays->compact_count); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c5952749ad40bd..635063f4967106 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -4348,6 +4349,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, return NULL; psi_memstall_enter(&pflags); + delayacct_compact_start(); noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, @@ -4355,6 +4357,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, memalloc_noreclaim_restore(noreclaim_flag); psi_memstall_leave(&pflags); + delayacct_compact_end(); if (*compact_result == COMPACT_SKIPPED) return NULL; diff --git a/tools/accounting/getdelays.c b/tools/accounting/getdelays.c index 5ef1c15e88ad2e..11e86739456d82 100644 --- a/tools/accounting/getdelays.c +++ b/tools/accounting/getdelays.c @@ -205,6 +205,8 @@ static void print_delayacct(struct taskstats *t) "RECLAIM %12s%15s%15s\n" " %15llu%15llu%15llums\n" "THRASHING%12s%15s%15s\n" + " %15llu%15llu%15llums\n" + "COMPACT %12s%15s%15s\n" " %15llu%15llu%15llums\n", "count", "real total", "virtual total", "delay total", "delay average", @@ -228,7 +230,11 @@ static void print_delayacct(struct taskstats *t) "count", "delay 
total", "delay average", (unsigned long long)t->thrashing_count, (unsigned long long)t->thrashing_delay_total, - average_ms(t->thrashing_delay_total, t->thrashing_count)); + average_ms(t->thrashing_delay_total, t->thrashing_count), + "count", "delay total", "delay average", + (unsigned long long)t->compact_count, + (unsigned long long)t->compact_delay_total, + average_ms(t->compact_delay_total, t->compact_count)); } static void task_context_switch_counts(struct taskstats *t) From 0aaa8977acbf3996d351f51b3b15295943092f63 Mon Sep 17 00:00:00 2001 From: Qian Cai Date: Wed, 19 Jan 2022 18:10:18 -0800 Subject: [PATCH 49/55] configs: introduce debug.config for CI-like setup Some general debugging features like kmemleak, KASAN, lockdep, UBSAN etc help fix many viruses like a microscope. On the other hand, those features are scattered around and mixed up with more situational debugging options making them difficult to consume properly. This could help amplify the general debugging/testing efforts and help establish sensible default values for those options across the board. This could also help different distros to collaborate on maintaining debug-flavored kernels. The config is based on years' experiences running daily CI inside the largest enterprise Linux distro company to seek regressions on linux-next builds on different bare-metal and virtual platforms. It can be used for example, $ make ARCH=arm64 defconfig debug.config Since KASAN and KCSAN can't be enabled together, we will need to create a separate one for KCSAN later as well. Link: https://lkml.kernel.org/r/20211115134754.7334-1-quic_qiancai@quicinc.com Signed-off-by: Qian Cai Acked-by: Paul E. 
McKenney Cc: Marco Elver Cc: Dmitry Vyukov Cc: Daniel Thompson Cc: Masahiro Yamada Cc: Naresh Kamboju Cc: "Stephen Rothwell" Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/configs/debug.config | 105 ++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) create mode 100644 kernel/configs/debug.config diff --git a/kernel/configs/debug.config b/kernel/configs/debug.config new file mode 100644 index 00000000000000..e9ffb0cc1eecf5 --- /dev/null +++ b/kernel/configs/debug.config @@ -0,0 +1,105 @@ +# The config is based on running daily CI for enterprise Linux distros to +# seek regressions on linux-next builds on different bare-metal and virtual +# platforms. It can be used for example, +# +# $ make ARCH=arm64 defconfig debug.config +# +# Keep alphabetically sorted inside each section. +# +# printk and dmesg options +# +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DYNAMIC_DEBUG=y +CONFIG_PRINTK_CALLER=y +CONFIG_PRINTK_TIME=y +CONFIG_SYMBOLIC_ERRNAME=y +# +# Compile-time checks and compiler options +# +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_SECTION_MISMATCH=y +CONFIG_FRAME_WARN=2048 +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +# +# Generic Kernel Debugging Instruments +# +# CONFIG_UBSAN_ALIGNMENT is not set +# CONFIG_UBSAN_DIV_ZERO is not set +# CONFIG_UBSAN_TRAP is not set +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +CONFIG_DEBUG_FS=y +CONFIG_DEBUG_FS_ALLOW_ALL=y +CONFIG_DEBUG_IRQFLAGS=y +CONFIG_UBSAN=y +CONFIG_UBSAN_BOOL=y +CONFIG_UBSAN_BOUNDS=y +CONFIG_UBSAN_ENUM=y +CONFIG_UBSAN_SHIFT=y +CONFIG_UBSAN_UNREACHABLE=y +# +# Memory Debugging +# +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_WX is not set +# CONFIG_KFENCE is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_SLUB_STATS is not set +CONFIG_PAGE_EXTENSION=y +CONFIG_PAGE_OWNER=y +CONFIG_DEBUG_KMEMLEAK=y +CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN=y +CONFIG_DEBUG_OBJECTS=y 
+CONFIG_DEBUG_OBJECTS_ENABLE_DEFAULT=1 +CONFIG_DEBUG_OBJECTS_FREE=y +CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER=y +CONFIG_DEBUG_OBJECTS_RCU_HEAD=y +CONFIG_DEBUG_OBJECTS_TIMERS=y +CONFIG_DEBUG_OBJECTS_WORK=y +CONFIG_DEBUG_PER_CPU_MAPS=y +CONFIG_DEBUG_STACK_USAGE=y +CONFIG_DEBUG_VIRTUAL=y +CONFIG_DEBUG_VM=y +CONFIG_DEBUG_VM_PGFLAGS=y +CONFIG_DEBUG_VM_RB=y +CONFIG_DEBUG_VM_VMACACHE=y +CONFIG_GENERIC_PTDUMP=y +CONFIG_KASAN=y +CONFIG_KASAN_GENERIC=y +CONFIG_KASAN_INLINE=y +CONFIG_KASAN_VMALLOC=y +CONFIG_PTDUMP_DEBUGFS=y +CONFIG_SCHED_STACK_END_CHECK=y +CONFIG_SLUB_DEBUG_ON=y +# +# Debug Oops, Lockups and Hangs +# +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +# CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set +CONFIG_DEBUG_ATOMIC_SLEEP=y +CONFIG_DETECT_HUNG_TASK=y +CONFIG_PANIC_ON_OOPS=y +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SOFTLOCKUP_DETECTOR=y +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +# CONFIG_PROVE_RAW_LOCK_NESTING is not set +CONFIG_PROVE_LOCKING=y +# +# Debug kernel data structures +# +CONFIG_BUG_ON_DATA_CORRUPTION=y +# +# RCU Debugging +# +CONFIG_PROVE_RCU=y +CONFIG_PROVE_RCU_LIST=y +# +# Tracers +# +CONFIG_BRANCH_PROFILE_NONE=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y From e4bbd20d8c2b9fb5a937bf132775f5257ccb0412 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Jan 2022 18:10:22 -0800 Subject: [PATCH 50/55] arch/Kconfig: split PAGE_SIZE_LESS_THAN_256KB from PAGE_SIZE_LESS_THAN_64KB Patch series "Fix CONFIG_TEST_KMOD with 256kB page size". The kernel test robot reported a build error [1] from a failed assertion in fs/btrfs/inode.c with a hexagon randconfig that includes CONFIG_PAGE_SIZE_256KB. This error is the same one that was addressed by commit b05fbcc36be1 ("btrfs: disable build on platforms having page size 256K") but CONFIG_TEST_KMOD selects CONFIG_BTRFS without having the "page size less than 256kB dependency", which results in the error reappearing. 
The first patch introduces CONFIG_PAGE_SIZE_LESS_THAN_256KB by splitting it off from CONFIG_PAGE_SIZE_LESS_THAN_64KB, which was introduced in commit 1f0e290cc5fd ("arch: Add generic Kconfig option indicating page size smaller than 64k") for a similar reason in 5.16-rc3. The second patch uses that configuration option for CONFIG_BTRFS to reduce duplication. The third patch resolves the build error by adding CONFIG_PAGE_SIZE_LESS_THAN_256KB as a dependency to CONFIG_TEST_KMOD so that CONFIG_BTRFS does not get enabled under that invalid configuration. [1]: https://lore.kernel.org/r/202111270255.UYOoN5VN-lkp@intel.com/ This patch (of 3): btrfs requires a page size smaller than 256kB. To use that dependency in other places, introduce CONFIG_PAGE_SIZE_LESS_THAN_256KB and reuse that dependency in CONFIG_PAGE_SIZE_LESS_THAN_64KB. Link: https://lkml.kernel.org/r/20211129230141.228085-1-nathan@kernel.org Link: https://lkml.kernel.org/r/20211129230141.228085-2-nathan@kernel.org Signed-off-by: Nathan Chancellor Cc: Chris Mason Cc: Josef Bacik Cc: David Sterba Cc: Luis Chamberlain Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/Kconfig b/arch/Kconfig index d3c4ab249e9c27..c1936e154e66be 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -998,6 +998,10 @@ config PAGE_SIZE_LESS_THAN_64KB depends on !PAGE_SIZE_64KB depends on !PARISC_PAGE_SIZE_64KB depends on !PPC_64K_PAGES + depends on PAGE_SIZE_LESS_THAN_256KB + +config PAGE_SIZE_LESS_THAN_256KB + def_bool y depends on !PPC_256K_PAGES depends on !PAGE_SIZE_256KB From e9009095998a8de4491692e89ca303fb74047c9e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Jan 2022 18:10:25 -0800 Subject: [PATCH 51/55] btrfs: use generic Kconfig option for 256kB page size limit Use the newly introduced CONFIG_PAGE_SIZE_LESS_THAN_256KB to describe the dependency introduced by commit b05fbcc36be1 ("btrfs: disable build on platforms 
having page size 256K"). Link: https://lkml.kernel.org/r/20211129230141.228085-3-nathan@kernel.org Signed-off-by: Nathan Chancellor Acked-by: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: kernel test robot Cc: Luis Chamberlain Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 520a0f6a7d9e97..183e5c4aed348e 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,8 +18,7 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU - depends on !PPC_256K_PAGES # powerpc - depends on !PAGE_SIZE_256KB # hexagon + depends on PAGE_SIZE_LESS_THAN_256KB help Btrfs is a general purpose copy-on-write filesystem with extents, From bbd2e05fad3e692ff2495895975bd0fce02bdbae Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Jan 2022 18:10:28 -0800 Subject: [PATCH 52/55] lib/Kconfig.debug: make TEST_KMOD depend on PAGE_SIZE_LESS_THAN_256KB Commit b05fbcc36be1 ("btrfs: disable build on platforms having page size 256K") disabled btrfs for configurations that used a 256kB page size. However, it did not fully solve the problem because CONFIG_TEST_KMOD selects CONFIG_BTRFS, which does not account for the dependency. This results in a Kconfig warning and the failed BUILD_BUG_ON error returning. WARNING: unmet direct dependencies detected for BTRFS_FS Depends on [n]: BLOCK [=y] && !PPC_256K_PAGES && !PAGE_SIZE_256KB [=y] Selected by [m]: - TEST_KMOD [=m] && RUNTIME_TESTING_MENU [=y] && m && MODULES [=y] && NETDEVICES [=y] && NET_CORE [=y] && INET [=y] && BLOCK [=y] To resolve this, add CONFIG_PAGE_SIZE_LESS_THAN_256KB as a dependency of CONFIG_TEST_KMOD so there is no more invalid configuration or build errors. 
Link: https://lkml.kernel.org/r/20211129230141.228085-4-nathan@kernel.org Fixes: b05fbcc36be1 ("btrfs: disable build on platforms having page size 256K") Signed-off-by: Nathan Chancellor Reported-by: kernel test robot Cc: Chris Mason Cc: David Sterba Cc: Josef Bacik Cc: Luis Chamberlain Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a789da4a19a17b..666e070feeb6b1 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -2505,6 +2505,7 @@ config TEST_KMOD depends on m depends on NETDEVICES && NET_CORE && INET # for TUN depends on BLOCK + depends on PAGE_SIZE_LESS_THAN_256KB # for BTRFS select TEST_LKM select XFS_FS select TUN From bece04b5b41dd7730dd06aec0d6b15c53d1fbb5a Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Wed, 19 Jan 2022 18:10:31 -0800 Subject: [PATCH 53/55] kcov: fix generic Kconfig dependencies if ARCH_WANTS_NO_INSTR Until recent versions of GCC and Clang, it was not possible to disable KCOV instrumentation via a function attribute. The relevant function attribute was introduced in 540540d06e9d9 ("kcov: add __no_sanitize_coverage to fix noinstr for all architectures"). x86 was the first architecture to want a working noinstr, and at the time no compiler support for the attribute existed yet. Therefore, commit 0f1441b44e823 ("objtool: Fix noinstr vs KCOV") introduced the ability to NOP __sanitizer_cov_*() calls in .noinstr.text. However, this doesn't work for other architectures like arm64 and s390 that want a working noinstr per ARCH_WANTS_NO_INSTR. At the time of 0f1441b44e823, we didn't yet have ARCH_WANTS_NO_INSTR, but now we can move the Kconfig dependency checks to the generic KCOV option. KCOV will be available if: - architecture does not care about noinstr, OR - we have objtool support (like on x86), OR - GCC is 12.0 or newer, OR - Clang is 13.0 or newer. 
Link: https://lkml.kernel.org/r/20211201152604.3984495-1-elver@google.com Signed-off-by: Marco Elver Reviewed-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Cc: Mark Rutland Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Nick Desaulniers Cc: Dmitry Vyukov Cc: Andrey Konovalov Cc: Catalin Marinas Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- lib/Kconfig.debug | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1275bab8be2cba..f79a063d8ea387 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -78,7 +78,7 @@ config X86 select ARCH_HAS_FILTER_PGPROT select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_KCOV if X86_64 && STACK_VALIDATION + select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEM_ENCRYPT select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 666e070feeb6b1..1a5d168c388b1f 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1979,6 +1979,8 @@ config KCOV bool "Code coverage for fuzzing" depends on ARCH_HAS_KCOV depends on CC_HAS_SANCOV_TRACE_PC || GCC_PLUGINS + depends on !ARCH_WANTS_NO_INSTR || STACK_VALIDATION || \ + GCC_VERSION >= 120000 || CLANG_VERSION >= 130000 select DEBUG_FS select GCC_PLUGIN_SANCOV if !CC_HAS_SANCOV_TRACE_PC help From 69d0db01e210e07fe915e5da91b54a867cda040f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 19 Jan 2022 18:10:35 -0800 Subject: [PATCH 54/55] ubsan: remove CONFIG_UBSAN_OBJECT_SIZE The object-size sanitizer is redundant to -Warray-bounds, and inappropriately performs its checks at run-time when all information needed for the evaluation is available at compile-time, making it quite difficult to use: https://bugzilla.kernel.org/show_bug.cgi?id=214861 With -Warray-bounds almost enabled globally, it doesn't make sense to keep this 
around. Link: https://lkml.kernel.org/r/20211203235346.110809-1-keescook@chromium.org Signed-off-by: Kees Cook Reviewed-by: Marco Elver Cc: Masahiro Yamada Cc: Michal Marek Cc: Nick Desaulniers Cc: Nathan Chancellor Cc: Andrey Ryabinin Cc: "Peter Zijlstra (Intel)" Cc: Stephen Rothwell Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/Kconfig.ubsan | 13 ------------- lib/test_ubsan.c | 22 ---------------------- scripts/Makefile.ubsan | 1 - 3 files changed, 36 deletions(-) diff --git a/lib/Kconfig.ubsan b/lib/Kconfig.ubsan index e5372a13511df1..236c5cefc4cc50 100644 --- a/lib/Kconfig.ubsan +++ b/lib/Kconfig.ubsan @@ -112,19 +112,6 @@ config UBSAN_UNREACHABLE This option enables -fsanitize=unreachable which checks for control flow reaching an expected-to-be-unreachable position. -config UBSAN_OBJECT_SIZE - bool "Perform checking for accesses beyond the end of objects" - default UBSAN - # gcc hugely expands stack usage with -fsanitize=object-size - # https://lore.kernel.org/lkml/CAHk-=wjPasyJrDuwDnpHJS2TuQfExwe=px-SzLeN8GFMAQJPmQ@mail.gmail.com/ - depends on !CC_IS_GCC - depends on $(cc-option,-fsanitize=object-size) - help - This option enables -fsanitize=object-size which checks for accesses - beyond the end of objects where the optimizer can determine both the - object being operated on and its size, usually seen with bad downcasts, - or access to struct members from NULL pointers. 
- config UBSAN_BOOL bool "Perform checking for non-boolean values used as boolean" default UBSAN diff --git a/lib/test_ubsan.c b/lib/test_ubsan.c index 7e7bbd0f3fd27d..2062be1f2e80f6 100644 --- a/lib/test_ubsan.c +++ b/lib/test_ubsan.c @@ -79,15 +79,6 @@ static void test_ubsan_load_invalid_value(void) eval2 = eval; } -static void test_ubsan_null_ptr_deref(void) -{ - volatile int *ptr = NULL; - int val; - - UBSAN_TEST(CONFIG_UBSAN_OBJECT_SIZE); - val = *ptr; -} - static void test_ubsan_misaligned_access(void) { volatile char arr[5] __aligned(4) = {1, 2, 3, 4, 5}; @@ -98,29 +89,16 @@ static void test_ubsan_misaligned_access(void) *ptr = val; } -static void test_ubsan_object_size_mismatch(void) -{ - /* "((aligned(8)))" helps this not into be misaligned for ptr-access. */ - volatile int val __aligned(8) = 4; - volatile long long *ptr, val2; - - UBSAN_TEST(CONFIG_UBSAN_OBJECT_SIZE); - ptr = (long long *)&val; - val2 = *ptr; -} - static const test_ubsan_fp test_ubsan_array[] = { test_ubsan_shift_out_of_bounds, test_ubsan_out_of_bounds, test_ubsan_load_invalid_value, test_ubsan_misaligned_access, - test_ubsan_object_size_mismatch, }; /* Excluded because they Oops the module. 
*/ static const test_ubsan_fp skip_ubsan_array[] = { test_ubsan_divrem_overflow, - test_ubsan_null_ptr_deref, }; static int __init test_ubsan_init(void) diff --git a/scripts/Makefile.ubsan b/scripts/Makefile.ubsan index 9e2092fd5206c7..7099c603ff0ad3 100644 --- a/scripts/Makefile.ubsan +++ b/scripts/Makefile.ubsan @@ -8,7 +8,6 @@ ubsan-cflags-$(CONFIG_UBSAN_LOCAL_BOUNDS) += -fsanitize=local-bounds ubsan-cflags-$(CONFIG_UBSAN_SHIFT) += -fsanitize=shift ubsan-cflags-$(CONFIG_UBSAN_DIV_ZERO) += -fsanitize=integer-divide-by-zero ubsan-cflags-$(CONFIG_UBSAN_UNREACHABLE) += -fsanitize=unreachable -ubsan-cflags-$(CONFIG_UBSAN_OBJECT_SIZE) += -fsanitize=object-size ubsan-cflags-$(CONFIG_UBSAN_BOOL) += -fsanitize=bool ubsan-cflags-$(CONFIG_UBSAN_ENUM) += -fsanitize=enum ubsan-cflags-$(CONFIG_UBSAN_TRAP) += -fsanitize-undefined-trap-on-error From b1e78ef3be2533973953a35a56739fda7325875c Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 19 Jan 2022 18:10:38 -0800 Subject: [PATCH 55/55] lib: remove redundant assignment to variable ret The variable ret is being assigned a value that is never read. If the for-loop is entered then ret is immediately re-assigned a new value. If the for-loop is not executed ret is never read. The assignment is redundant and can be removed. Link: https://lkml.kernel.org/r/20211230134557.83633-1-colin.i.king@gmail.com Signed-off-by: Colin Ian King Reviewed-by: Jarkko Sakkinen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/asn1_encoder.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/asn1_encoder.c b/lib/asn1_encoder.c index 27bbe891714f91..0fd3c454a4689b 100644 --- a/lib/asn1_encoder.c +++ b/lib/asn1_encoder.c @@ -164,8 +164,6 @@ asn1_encode_oid(unsigned char *data, const unsigned char *end_data, data_len -= 3; - ret = 0; - for (i = 2; i < oid_len; i++) { ret = asn1_encode_oid_digit(&d, &data_len, oid[i]); if (ret < 0)