diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey.inc b/meta-digi-arm/recipes-kernel/linux/linux-dey.inc index d9c4603c1..87f6ec74c 100644 --- a/meta-digi-arm/recipes-kernel/linux/linux-dey.inc +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey.inc @@ -71,6 +71,20 @@ do_configure:append() { if [ -n "${@' '.join(find_cfgs(d))}" ]; then ${S}/scripts/kconfig/merge_config.sh -m -O ${B} ${B}/.config ${@" ".join(find_cfgs(d))} fi + # Apply ST-specific config fragments (ending in .config and stored in a different folder) + if [ ! -z "${KERNEL_CONFIG_FRAGMENTS}" ]; then + for f in ${KERNEL_CONFIG_FRAGMENTS} + do + # Check that every config fragment was copied into the WORKDIR from + # the OE meta data; log a BitBake error and abort the task otherwise + if [ ! -e "$f" ]; then + bberror "Could not find kernel config fragment $f" + exit 1 + fi + done + # Now that all the fragments are located merge them. + (${S}/scripts/kconfig/merge_config.sh -m -r -O ${B} ${B}/.config ${KERNEL_CONFIG_FRAGMENTS} 1>&2 ) + fi } # Don't create custom folder for kernel artifacts diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch new file mode 100644 index 000000000..41180995b --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch @@ -0,0 +1,26 @@ +From 63e709173a20b85b473bbf4832f4e909692fd361 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:54:24 +0100 +Subject: [PATCH 23/28] 5.15-stm32mp-rt-49-r1 CLOCK + +Signed-off-by: Lionel VITTE +--- + drivers/clk/stm32/clk-stm32mp13.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/clk/stm32/clk-stm32mp13.c b/drivers/clk/stm32/clk-stm32mp13.c +index 15ee05df8..2f7a823bf 100644 +--- a/drivers/clk/stm32/clk-stm32mp13.c ++++ b/drivers/clk/stm32/clk-stm32mp13.c +@@ -840,7 +840,7 @@ static CLK_STM32_GATE(sai1, "pclk2", 0, GATE_SAI1); + static 
CLK_STM32_GATE(sai2, "pclk2", 0, GATE_SAI2); + static CLK_STM32_GATE(spi1, "pclk2", 0, GATE_SPI1); + +-static CLK_STM32_GATE(syscfg, "pclk3", 0, GATE_SYSCFG); ++static CLK_STM32_GATE(syscfg, "pclk3", CLK_IS_CRITICAL, GATE_SYSCFG); + static CLK_STM32_GATE(vref, "pclk3", 0, GATE_VREF); + static CLK_STM32_GATE(dts, "pclk3", 0, GATE_DTS); + static CLK_STM32_GATE(pmbctrl, "pclk3", 0, GATE_PMBCTRL); +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch new file mode 100644 index 000000000..af368b2d0 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch @@ -0,0 +1,131 @@ +From 5a55de398d12848f13f7df59fb2f1853b7dd9ee8 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:56:07 +0100 +Subject: [PATCH 24/28] 5.15-stm32mp-rt-49-r1 DMA + +Signed-off-by: Lionel VITTE +--- + drivers/dma/stm32-dma.c | 35 +++++++++++++++++++++++++---------- + drivers/dma/stm32-mdma.c | 4 ++++ + 2 files changed, 29 insertions(+), 10 deletions(-) + +diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c +index 7c6078c6c..128edfb4f 100644 +--- a/drivers/dma/stm32-dma.c ++++ b/drivers/dma/stm32-dma.c +@@ -238,6 +238,7 @@ struct stm32_dma_chan { + u32 residue_after_drain; + struct workqueue_struct *mdma_wq; + struct work_struct mdma_work; ++ struct completion mdma_drain_completion; + }; + + struct stm32_dma_device { +@@ -570,8 +571,9 @@ static u32 stm32_dma_get_remaining_bytes(struct stm32_dma_chan *chan) + return ndtr << width; + } + +-static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan) ++static void stm32_dma_mdma_drain_worker(struct work_struct *work) + { ++ struct stm32_dma_chan *chan = container_of(work, struct stm32_dma_chan, mdma_work); + struct stm32_dma_mdma *mchan = &chan->mchan; + struct stm32_dma_sg_req *sg_req; + struct dma_device *ddev = 
mchan->chan->device; +@@ -583,14 +585,12 @@ static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan) + int ret; + unsigned long flags; + +- flush_workqueue(chan->mdma_wq); +- + /* DMA/MDMA chain: drain remaining data in SRAM */ + + /* Get the residue on MDMA side */ + status = dmaengine_tx_status(mchan->chan, mchan->chan->cookie, &state); + if (status == DMA_COMPLETE) +- return status; ++ goto mdma_complete; + + mdma_residue = state.residue; + sg_req = &chan->desc->sg_req[chan->next_sg - 1]; +@@ -623,24 +623,25 @@ static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan) + desc = ddev->device_prep_dma_memcpy(mchan->chan, dst_buf, src_buf, dma_to_write, + DMA_PREP_INTERRUPT); + if (!desc) +- return -EINVAL; ++ return; + + ret = dma_submit_error(dmaengine_submit(desc)); + if (ret < 0) +- return ret; ++ return; + + status = dma_wait_for_async_tx(desc); + if (status != DMA_COMPLETE) { + dev_err(chan2dev(chan), "%s dma_wait_for_async_tx error\n", __func__); + dmaengine_terminate_async(mchan->chan); +- return -EBUSY; ++ return; + } + + /* We need to store residue for tx_status() */ + chan->residue_after_drain = len - (mdma_wrote + dma_to_write); + } + +- return 0; ++mdma_complete: ++ complete(&chan->mdma_drain_completion); + } + + static void stm32_dma_synchronize(struct dma_chan *c) +@@ -648,9 +649,22 @@ static void stm32_dma_synchronize(struct dma_chan *c) + struct stm32_dma_chan *chan = to_stm32_dma_chan(c); + struct stm32_dma_mdma *mchan = &chan->mchan; + +- if (chan->desc && chan->use_mdma && mchan->dir == DMA_DEV_TO_MEM) +- if (stm32_dma_mdma_drain(chan)) ++ if (chan->desc && chan->use_mdma && mchan->dir == DMA_DEV_TO_MEM) { ++ unsigned long ms = 5000 + 100; /* dma_sync_wait_timeout + extra 100ms */ ++ ++ reinit_completion(&chan->mdma_drain_completion); ++ ++ flush_workqueue(chan->mdma_wq); ++ INIT_WORK(&chan->mdma_work, stm32_dma_mdma_drain_worker); ++ ++ if (!queue_work(chan->mdma_wq, &chan->mdma_work)) ++ dev_warn(chan2dev(chan), "Work already queued\n"); 
++ ++ ms = wait_for_completion_timeout(&chan->mdma_drain_completion, ++ msecs_to_jiffies(ms)); ++ if (ms == 0) + dev_err(chan2dev(chan), "%s: can't drain DMA\n", __func__); ++ } + + if (chan->use_mdma) + dmaengine_synchronize(mchan->chan); +@@ -2338,6 +2352,7 @@ static int stm32_dma_probe(struct platform_device *pdev) + dev_warn(&pdev->dev, + "can't alloc MDMA workqueue for %s\n", name); + } ++ init_completion(&chan->mdma_drain_completion); + } + } + } +diff --git a/drivers/dma/stm32-mdma.c b/drivers/dma/stm32-mdma.c +index 133534663..a08c94638 100644 +--- a/drivers/dma/stm32-mdma.c ++++ b/drivers/dma/stm32-mdma.c +@@ -1270,6 +1270,10 @@ static int stm32_mdma_resume(struct dma_chan *c) + unsigned long flags; + u32 status, reg; + ++ /* Transfer can be terminated */ ++ if (!chan->desc || (stm32_mdma_read(dmadev, STM32_MDMA_CCR(chan->id)) & STM32_MDMA_CCR_EN)) ++ return -EPERM; ++ + hwdesc = chan->desc->node[chan->curr_hwdesc].hwdesc; + + spin_lock_irqsave(&chan->vchan.lock, flags); +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch new file mode 100644 index 000000000..80d4f531e --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch @@ -0,0 +1,27 @@ +From be5ec688053e6d136bc8ea54ed1e93d523b24580 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:56:45 +0100 +Subject: [PATCH 25/28] 5.15-stm32mp-rt-49-r1 MFD + +Signed-off-by: Lionel VITTE +--- + drivers/mfd/syscon.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c +index 191fdb87c..24530dfe5 100644 +--- a/drivers/mfd/syscon.c ++++ b/drivers/mfd/syscon.c +@@ -38,6 +38,9 @@ static const struct regmap_config syscon_regmap_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, ++#ifdef CONFIG_PREEMPT_RT ++ .use_raw_spinlock = true, 
++#endif + }; + + static struct syscon *of_syscon_register(struct device_node *np, bool check_clk) +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch new file mode 100644 index 000000000..4bcaaebfb --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch @@ -0,0 +1,64 @@ +From 1f4b70cda804c4f3771902254a2614d87a1d366c Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:57:06 +0100 +Subject: [PATCH 26/28] 5.15-stm32mp-rt-49-r1 NET-TTY + +Signed-off-by: Lionel VITTE +--- + drivers/tty/serial/stm32-usart.c | 32 +++++++++++--------------------- + 1 file changed, 11 insertions(+), 21 deletions(-) + +diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c +index 4d7a31664..0cd8e9672 100644 +--- a/drivers/tty/serial/stm32-usart.c ++++ b/drivers/tty/serial/stm32-usart.c +@@ -772,26 +772,16 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) + } + + if ((sr & USART_SR_RTOF) && !(stm32_port->throttled) && +- stm32_usart_rx_dma_started(stm32_port)) +- return IRQ_WAKE_THREAD; +- else +- return IRQ_HANDLED; +-} +- +-static irqreturn_t stm32_usart_threaded_interrupt(int irq, void *ptr) +-{ +- struct uart_port *port = ptr; +- struct tty_port *tport = &port->state->port; +- unsigned int size; +- unsigned long flags; +- +- /* Receiver timeout irq for DMA RX */ +- spin_lock_irqsave(&port->lock, flags); +- size = stm32_usart_receive_chars(port, false); +- uart_unlock_and_check_sysrq_irqrestore(port, flags); +- if (size) +- tty_flip_buffer_push(tport); ++ stm32_usart_rx_dma_started(stm32_port)) { ++ unsigned long flags; + ++ spin_lock_irqsave(&port->lock, flags); ++ /* Receiver timeout irq for DMA RX */ ++ size = stm32_usart_receive_chars(port, false); ++ uart_unlock_and_check_sysrq_irqrestore(port, flags); ++ if 
(size) ++ tty_flip_buffer_push(tport); ++ } + return IRQ_HANDLED; + } + +@@ -980,8 +970,8 @@ static int stm32_usart_startup(struct uart_port *port) + u32 val; + int ret; + +- ret = request_threaded_irq(port->irq, stm32_usart_interrupt, +- stm32_usart_threaded_interrupt, ++ ret = request_threaded_irq(port->irq, NULL, ++ stm32_usart_interrupt, + IRQF_ONESHOT | IRQF_NO_SUSPEND, + name, port); + if (ret) +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch new file mode 100644 index 000000000..dac422be8 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch @@ -0,0 +1,25 @@ +From e1bd8bc5502e661be4feaadfca1889da1d48cd73 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:57:43 +0100 +Subject: [PATCH 27/28] 5.15-stm32mp-rt-49-r1 DEVICETREE + +Signed-off-by: Lionel VITTE +--- + arch/arm/boot/dts/stm32mp131.dtsi | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/arch/arm/boot/dts/stm32mp131.dtsi b/arch/arm/boot/dts/stm32mp131.dtsi +index 8121ddc97..3fc06961a 100644 +--- a/arch/arm/boot/dts/stm32mp131.dtsi ++++ b/arch/arm/boot/dts/stm32mp131.dtsi +@@ -1241,7 +1241,6 @@ exti-interrupt-map { + syscfg: syscon@50020000 { + compatible = "st,stm32mp157-syscfg", "syscon"; + reg = <0x50020000 0x400>; +- clocks = <&rcc SYSCFG>; + }; + + lptimer2: timer@50021000 { +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch new file mode 100644 index 000000000..e697ee7e5 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch @@ -0,0 +1,82 @@ +From 05ea3c26ccad3359d94dbe3c7ba758c2ba2f7dd9 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE 
+Date: Wed, 8 Feb 2023 09:59:08 +0100 +Subject: [PATCH 28/28] 5.15-stm32mp-rt-49-r1 CONFIG + +Signed-off-by: Lionel VITTE +--- + .../configs/fragment-07-rt-sysvinit.config | 12 +++++++ + arch/arm/configs/fragment-07-rt.config | 32 +++++++++++++++++++ + arch/arm/configs/fragment-08-rt-mp13.config | 2 ++ + 3 files changed, 46 insertions(+) + create mode 100644 arch/arm/configs/fragment-07-rt-sysvinit.config + create mode 100644 arch/arm/configs/fragment-07-rt.config + create mode 100644 arch/arm/configs/fragment-08-rt-mp13.config + +diff --git a/arch/arm/configs/fragment-07-rt-sysvinit.config b/arch/arm/configs/fragment-07-rt-sysvinit.config +new file mode 100644 +index 000000000..49a4baf60 +--- /dev/null ++++ b/arch/arm/configs/fragment-07-rt-sysvinit.config +@@ -0,0 +1,12 @@ ++CONFIG_CGROUPS=y ++# CONFIG_CGROUP_SCHED is not set ++# CONFIG_CGROUP_PIDS is not set ++# CONFIG_CGROUP_RDMA is not set ++# CONFIG_CGROUP_FREEZER is not set ++# CONFIG_CGROUP_DEVICE is not set ++# CONFIG_CGROUP_CPUACCT is not set ++# CONFIG_CGROUP_PERF is not set ++# CONFIG_CGROUP_DEBUG is not set ++# CONFIG_CGROUP_NET_PRIO is not set ++# CONFIG_CGROUP_NET_CLASSID is not set ++ +diff --git a/arch/arm/configs/fragment-07-rt.config b/arch/arm/configs/fragment-07-rt.config +new file mode 100644 +index 000000000..98bb8735f +--- /dev/null ++++ b/arch/arm/configs/fragment-07-rt.config +@@ -0,0 +1,32 @@ ++CONFIG_PREEMPT_RT=y ++ ++# disable SCHED_MC ++# CONFIG_MCPM is not set ++ ++# Disable CPUFREQ and CPUIDLE ++# CONFIG_CPU_FREQ is not set ++# CONFIG_CPU_IDLE is not set ++ ++# Force to have HIGH_RES_TIMERS ++CONFIG_HIGH_RES_TIMERS=y ++ ++# force do not go to sleep ++# For multiple core, you should set the specific boot options ++# for isolate the core and render it tickless: "isolcpus=2,3 nohz_full=2,3" ++# Warning: to active only if SMP are present ++# CONFIG_HZ_PERIODIC=y ++ ++# to Enable ftrace, you need to enable the following configuraiton: ++# CONFIG_FTRACE=y ++# CONFIG_IRQSOFF_TRACER=y ++# 
CONFIG_PREEMPT_TRACER=y ++# CONFIG_SCHED_TRACER=y ++# CONFIG_FUNCTION_TRACER=y ++# By default, the ftrace for RT kernel are disabled ++# CONFIG_FTRACE is not set ++# CONFIG_IRQSOFF_TRACER is not set ++# CONFIG_PREEMPT_TRACER is not set ++# CONFIG_SCHED_TRACER is not set ++# CONFIG_FUNCTION_TRACER is not set ++ ++ +diff --git a/arch/arm/configs/fragment-08-rt-mp13.config b/arch/arm/configs/fragment-08-rt-mp13.config +new file mode 100644 +index 000000000..c70d7adc6 +--- /dev/null ++++ b/arch/arm/configs/fragment-08-rt-mp13.config +@@ -0,0 +1,2 @@ ++# Disable SMP on MP13 ++# CONFIG_SMP is not set +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch new file mode 100644 index 000000000..b2cf5ed68 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch @@ -0,0 +1,10968 @@ +diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst +index dd913eefbf31..33d3c988b951 100644 +--- a/Documentation/admin-guide/cgroup-v1/memory.rst ++++ b/Documentation/admin-guide/cgroup-v1/memory.rst +@@ -64,6 +64,7 @@ Brief summary of control files. + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() ++ This knob is not available on CONFIG_PREEMPT_RT systems. + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap +@@ -75,6 +76,7 @@ Brief summary of control files. + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage ++ This knob is not available on CONFIG_PREEMPT_RT systems. 
+ memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + This knob is deprecated and shouldn't be +diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst +index d2c4c27e1702..d83c9ab49427 100644 +--- a/Documentation/dev-tools/kcov.rst ++++ b/Documentation/dev-tools/kcov.rst +@@ -50,6 +50,7 @@ program using kcov: + #include + #include + #include ++ #include + + #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) + #define KCOV_ENABLE _IO('c', 100) +@@ -177,6 +178,8 @@ Comparison operands collection is similar to coverage collection: + /* Read number of comparisons collected. */ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) { ++ uint64_t ip; ++ + type = cover[i * KCOV_WORDS_PER_CMP + 1]; + /* arg1 and arg2 - operands of the comparison. */ + arg1 = cover[i * KCOV_WORDS_PER_CMP + 2]; +@@ -251,6 +254,8 @@ selectively from different subsystems. + + .. code-block:: c + ++ /* Same includes and defines as above. 
*/ ++ + struct kcov_remote_arg { + __u32 trace_mode; + __u32 area_size; +diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h +index 1d5716bc060b..2526fd3be5fd 100644 +--- a/arch/alpha/include/asm/spinlock_types.h ++++ b/arch/alpha/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ALPHA_SPINLOCK_TYPES_H + #define _ALPHA_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index a8ae17f5740d..0e8631b96e0f 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -32,6 +32,7 @@ config ARM + select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_MEMTEST +@@ -68,7 +69,7 @@ config ARM + select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT + select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 +- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU ++ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT + select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL + select HAVE_ARCH_MMAP_RND_BITS if MMU +@@ -109,6 +110,7 @@ config ARM + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ +@@ -124,6 +126,7 @@ config ARM + select OLD_SIGSUSPEND3 + select PCI_SYSCALL if PCI + select PERF_USE_VMALLOC ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select RTC_LIB + select SYS_SUPPORTS_APM_EMULATION + select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M +diff --git 
a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h +index 5976958647fe..0c14b36ef101 100644 +--- a/arch/arm/include/asm/spinlock_types.h ++++ b/arch/arm/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h +index b682189a2b5d..e5e2ceb59544 100644 +--- a/arch/arm/include/asm/thread_info.h ++++ b/arch/arm/include/asm/thread_info.h +@@ -52,6 +52,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + struct task_struct *task; /* main task structure */ + __u32 cpu; /* cpu */ + __u32 cpu_domain; /* cpu domain */ +@@ -130,6 +131,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ + #define TIF_UPROBE 3 /* breakpointed or singlestepping */ + #define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 + + #define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -149,6 +151,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -158,7 +161,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + /* + * Change these and you break ASM code in entry-common.S + */ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | 
\ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) + +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index a646a3f6440f..beb09d74684f 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -43,6 +43,7 @@ int main(void) + BLANK(); + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); ++ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); + DEFINE(TI_TASK, offsetof(struct thread_info, task)); + DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); +diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S +index 68261a83b7ad..fa7d110ce555 100644 +--- a/arch/arm/kernel/entry-armv.S ++++ b/arch/arm/kernel/entry-armv.S +@@ -206,11 +206,18 @@ __irq_svc: + + #ifdef CONFIG_PREEMPTION + ldr r8, [tsk, #TI_PREEMPT] @ get preempt count +- ldr r0, [tsk, #TI_FLAGS] @ get flags + teq r8, #0 @ if preempt count != 0 ++ bne 1f @ return from exeption ++ ldr r0, [tsk, #TI_FLAGS] @ get flags ++ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set ++ blne svc_preempt @ preempt! 
++ ++ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r8, #0 @ if preempt lazy count != 0 + movne r0, #0 @ force flags to 0 +- tst r0, #_TIF_NEED_RESCHED ++ tst r0, #_TIF_NEED_RESCHED_LAZY + blne svc_preempt ++1: + #endif + + svc_exit r5, irq = 1 @ return from exception +@@ -225,8 +232,14 @@ svc_preempt: + 1: bl preempt_schedule_irq @ irq en/disable is done inside + ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS + tst r0, #_TIF_NEED_RESCHED ++ bne 1b ++ tst r0, #_TIF_NEED_RESCHED_LAZY + reteq r8 @ go again +- b 1b ++ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r0, #0 @ if preempt lazy count != 0 ++ beq 1b ++ ret r8 @ go again ++ + #endif + + __und_fault: +diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c +index 539897ac2828..4655f04ccdcd 100644 +--- a/arch/arm/kernel/signal.c ++++ b/arch/arm/kernel/signal.c +@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) + */ + trace_hardirqs_off(); + do { +- if (likely(thread_flags & _TIF_NEED_RESCHED)) { ++ if (likely(thread_flags & (_TIF_NEED_RESCHED | ++ _TIF_NEED_RESCHED_LAZY))) { + schedule(); + } else { + if (unlikely(!user_mode(regs))) +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index af5177801fb1..1de016008e2e 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + if (addr < TASK_SIZE) + return do_page_fault(addr, fsr, regs); + ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + if (user_mode(regs)) + goto bad_area; + +@@ -470,6 +473,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + static int + do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + do_bad_area(addr, fsr, regs); + return 0; + } +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 9d3cbe786f8d..c86b845d0d79 100644 +--- a/arch/arm64/Kconfig 
++++ b/arch/arm64/Kconfig +@@ -88,6 +88,7 @@ config ARM64 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -191,6 +192,7 @@ config ARM64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP + select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY + select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_FUTEX_CMPXCHG if FUTEX + select MMU_GATHER_RCU_TABLE_FREE +@@ -212,6 +214,7 @@ config ARM64 + select PCI_DOMAINS_GENERIC if PCI + select PCI_ECAM if (ACPI && PCI) + select PCI_SYSCALL if PCI ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select POWER_RESET + select POWER_SUPPLY + select SPARSE_IRQ +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index ed57717cd004..63b39229890b 100644 +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -1001,7 +1001,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, + */ + static inline bool arch_faults_on_old_pte(void) + { +- WARN_ON(preemptible()); ++ WARN_ON(is_migratable()); + + return !cpu_has_hw_af(); + } +diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h +index e83f0982b99c..2545c17281e1 100644 +--- a/arch/arm64/include/asm/preempt.h ++++ b/arch/arm64/include/asm/preempt.h +@@ -70,13 +70,36 @@ static inline bool __preempt_count_dec_and_test(void) + * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE + * pair. 
+ */ +- return !pc || !READ_ONCE(ti->preempt_count); ++ if (!pc || !READ_ONCE(ti->preempt_count)) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if ((pc & ~PREEMPT_NEED_RESCHED)) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif + } + + static inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u64 pc = READ_ONCE(current_thread_info()->preempt_count); ++ if (pc == preempt_offset) ++ return true; ++ ++ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) ++ return false; ++ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + return pc == preempt_offset; ++#endif + } + + #ifdef CONFIG_PREEMPTION +diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h +index ef449f5f4ba8..5e535c3e4926 100644 +--- a/arch/arm64/include/asm/signal.h ++++ b/arch/arm64/include/asm/signal.h +@@ -22,4 +22,8 @@ static inline void __user *arch_untagged_si_addr(void __user *addr, + } + #define arch_untagged_si_addr arch_untagged_si_addr + ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ + #endif +diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h +index 18782f0c4721..11ab1c077697 100644 +--- a/arch/arm64/include/asm/spinlock_types.h ++++ b/arch/arm64/include/asm/spinlock_types.h +@@ -5,7 +5,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) ++#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) + # error "please don't include this file directly" + #endif + +diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h +index 6623c99f0984..c55ccec33a5a 100644 +--- 
a/arch/arm64/include/asm/thread_info.h ++++ b/arch/arm64/include/asm/thread_info.h +@@ -26,6 +26,7 @@ struct thread_info { + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ + #endif ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + union { + u64 preempt_count; /* 0 => preemptible, <0 => bug */ + struct { +@@ -67,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst, + #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ + #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ + #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 7 + #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ + #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ + #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ +@@ -97,8 +99,10 @@ int arch_dup_task_struct(struct task_struct *dst, + #define _TIF_SVE (1 << TIF_SVE) + #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ + _TIF_NOTIFY_SIGNAL) +@@ -107,6 +111,8 @@ int arch_dup_task_struct(struct task_struct *dst, + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ + _TIF_SYSCALL_EMU) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++ + #ifdef CONFIG_SHADOW_CALL_STACK + #define INIT_SCS \ + .scs_base = init_shadow_call_stack, \ +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 551427ae8cc5..96a4f6c9eb78 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -31,6 +31,7 @@ int main(void) + BLANK(); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, 
thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); ++ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); + #endif +diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c +index 7a3fcf21b18a..5689d2b0c0b6 100644 +--- a/arch/arm64/kernel/fpsimd.c ++++ b/arch/arm64/kernel/fpsimd.c +@@ -179,10 +179,19 @@ static void __get_cpu_fpsimd_context(void) + * + * The double-underscore version must only be called if you know the task + * can't be preempted. ++ * ++ * On RT kernels local_bh_disable() is not sufficient because it only ++ * serializes soft interrupt related sections via a local lock, but stays ++ * preemptible. Disabling preemption is the right choice here as bottom ++ * half processing is always in thread context on RT kernels so it ++ * implicitly prevents bottom half processing as well. 
+ */ + static void get_cpu_fpsimd_context(void) + { +- local_bh_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_disable(); ++ else ++ preempt_disable(); + __get_cpu_fpsimd_context(); + } + +@@ -203,7 +212,10 @@ static void __put_cpu_fpsimd_context(void) + static void put_cpu_fpsimd_context(void) + { + __put_cpu_fpsimd_context(); +- local_bh_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_enable(); ++ else ++ preempt_enable(); + } + + static bool have_cpu_fpsimd_context(void) +@@ -1033,6 +1045,7 @@ void fpsimd_thread_switch(struct task_struct *next) + void fpsimd_flush_thread(void) + { + int vl, supported_vl; ++ void *sve_state = NULL; + + if (!system_supports_fpsimd()) + return; +@@ -1045,7 +1058,10 @@ void fpsimd_flush_thread(void) + + if (system_supports_sve()) { + clear_thread_flag(TIF_SVE); +- sve_free(current); ++ ++ /* Defer kfree() while in atomic context */ ++ sve_state = current->thread.sve_state; ++ current->thread.sve_state = NULL; + + /* + * Reset the task vector length as required. 
+@@ -1079,6 +1095,7 @@ void fpsimd_flush_thread(void) + } + + put_cpu_fpsimd_context(); ++ kfree(sve_state); + } + + /* +diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c +index b3e1beccf458..03183563feb8 100644 +--- a/arch/arm64/kernel/signal.c ++++ b/arch/arm64/kernel/signal.c +@@ -922,7 +922,7 @@ static void do_signal(struct pt_regs *regs) + void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) + { + do { +- if (thread_flags & _TIF_NEED_RESCHED) { ++ if (thread_flags & _TIF_NEED_RESCHED_MASK) { + /* Unmask Debug and SError for the next task */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + +@@ -930,6 +930,14 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) + } else { + local_daif_restore(DAIF_PROCCTX); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 3fe816c244ce..ba8c69cda361 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -828,7 +828,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + * involves poking the GIC, which must be done in a + * non-preemptible context. 
+ */ +- preempt_disable(); ++ migrate_disable(); + + kvm_pmu_flush_hwstate(vcpu); + +@@ -852,7 +852,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + kvm_timer_sync_user(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); +- preempt_enable(); ++ migrate_enable(); + continue; + } + +@@ -921,7 +921,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, ret); + +- preempt_enable(); ++ migrate_enable(); + + /* + * The ARMv8 architecture doesn't give the hypervisor +diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h +index 8ff0f6ff3a00..db87a12c3827 100644 +--- a/arch/csky/include/asm/spinlock_types.h ++++ b/arch/csky/include/asm/spinlock_types.h +@@ -3,7 +3,7 @@ + #ifndef __ASM_CSKY_SPINLOCK_TYPES_H + #define __ASM_CSKY_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h +index 19d233497ba5..d5f66495b670 100644 +--- a/arch/hexagon/include/asm/spinlock_types.h ++++ b/arch/hexagon/include/asm/spinlock_types.h +@@ -8,7 +8,7 @@ + #ifndef _ASM_SPINLOCK_TYPES_H + #define _ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h +index 6e345fefcdca..14b8a161c165 100644 +--- a/arch/ia64/include/asm/spinlock_types.h ++++ b/arch/ia64/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_IA64_SPINLOCK_TYPES_H + #define _ASM_IA64_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/Kconfig 
b/arch/powerpc/Kconfig +index 27222b75d2a4..5495225807eb 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -151,6 +151,7 @@ config PPC + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST +@@ -218,6 +219,7 @@ config PPC + select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) + select HAVE_IOREMAP_PROT + select HAVE_IRQ_TIME_ACCOUNTING ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE + select HAVE_KERNEL_LZO if DEFAULT_UIMAGE +@@ -234,6 +236,7 @@ config PPC + select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE + select HAVE_RSEQ +diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h +index 0f3cdd8faa95..08243338069d 100644 +--- a/arch/powerpc/include/asm/simple_spinlock_types.h ++++ b/arch/powerpc/include/asm/simple_spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h +index 7ef1cd8168a0..f9e63cacd220 100644 +--- a/arch/powerpc/include/asm/smp.h ++++ b/arch/powerpc/include/asm/smp.h +@@ -62,6 +62,7 @@ struct smp_ops_t { + + extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); + extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); ++extern void smp_send_debugger_break_cpu(unsigned 
int cpu); + extern void smp_send_debugger_break(void); + extern void start_secondary_resume(void); + extern void smp_generic_give_timebase(void); +diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h +index c5d742f18021..d5f8a74ed2e8 100644 +--- a/arch/powerpc/include/asm/spinlock_types.h ++++ b/arch/powerpc/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h +index 1c8460e23583..b1653c160bab 100644 +--- a/arch/powerpc/include/asm/stackprotector.h ++++ b/arch/powerpc/include/asm/stackprotector.h +@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) + unsigned long canary; + + /* Try to get a semi random initial value. 
*/ ++#ifdef CONFIG_PREEMPT_RT ++ canary = (unsigned long)&canary; ++#else + canary = get_random_canary(); ++#endif + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; +diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h +index 87013ac2a640..2920ed371188 100644 +--- a/arch/powerpc/include/asm/thread_info.h ++++ b/arch/powerpc/include/asm/thread_info.h +@@ -53,6 +53,8 @@ + struct thread_info { + int preempt_count; /* 0 => preemptable, + <0 => BUG */ ++ int preempt_lazy_count; /* 0 => preemptable, ++ <0 => BUG */ + unsigned long local_flags; /* private flags for thread */ + #ifdef CONFIG_LIVEPATCH + unsigned long *livepatch_sp; +@@ -99,6 +101,7 @@ void arch_setup_new_exec(void); + #define TIF_PATCH_PENDING 6 /* pending live patching update */ + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_SINGLESTEP 8 /* singlestepping active */ ++#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ + #define TIF_SECCOMP 10 /* secure computing */ + #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ + #define TIF_NOERROR 12 /* Force successful syscall return */ +@@ -114,6 +117,7 @@ void arch_setup_new_exec(void); + #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + #define TIF_32BIT 20 /* 32 bit binary */ + ++ + /* as above, but as bit values */ + #define _TIF_SYSCALL_TRACE (1<flags); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); +- if (ti_flags & _TIF_NEED_RESCHED) { ++ if (ti_flags & _TIF_NEED_RESCHED_MASK) { + schedule(); + } else { + /* +@@ -554,11 +554,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) + /* Returning to a kernel context with local irqs enabled. 
*/ + WARN_ON_ONCE(!(regs->msr & MSR_EE)); + again: +- if (IS_ENABLED(CONFIG_PREEMPT)) { ++ if (IS_ENABLED(CONFIG_PREEMPTION)) { + /* Return to preemptible kernel context */ + if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) { + if (preempt_count() == 0) + preempt_schedule_irq(); ++ } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) { ++ if ((preempt_count() == 0) && ++ (current_thread_info()->preempt_lazy_count == 0)) ++ preempt_schedule_irq(); + } + } + +diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c +index c4f1d6b7d992..02e17a57da83 100644 +--- a/arch/powerpc/kernel/irq.c ++++ b/arch/powerpc/kernel/irq.c +@@ -690,6 +690,7 @@ static inline void check_stack_overflow(void) + } + } + ++#ifndef CONFIG_PREEMPT_RT + static __always_inline void call_do_softirq(const void *sp) + { + /* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */ +@@ -708,6 +709,7 @@ static __always_inline void call_do_softirq(const void *sp) + "r11", "r12" + ); + } ++#endif + + static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) + { +@@ -820,10 +822,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; + void *softirq_ctx[NR_CPUS] __read_mostly; + void *hardirq_ctx[NR_CPUS] __read_mostly; + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + call_do_softirq(softirq_ctx[smp_processor_id()]); + } ++#endif + + irq_hw_number_t virq_to_hw(unsigned int virq) + { +diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c +index bdee7262c080..d57d37497862 100644 +--- a/arch/powerpc/kernel/kgdb.c ++++ b/arch/powerpc/kernel/kgdb.c +@@ -120,11 +120,19 @@ int kgdb_skipexception(int exception, struct pt_regs *regs) + + static int kgdb_debugger_ipi(struct pt_regs *regs) + { +- kgdb_nmicallback(raw_smp_processor_id(), regs); ++ int cpu = raw_smp_processor_id(); ++ ++ if (!kgdb_roundup_delay(cpu)) ++ kgdb_nmicallback(cpu, regs); + return 0; + } + + #ifdef CONFIG_SMP ++void 
kgdb_roundup_cpu(unsigned int cpu) ++{ ++ smp_send_debugger_break_cpu(cpu); ++} ++ + void kgdb_roundup_cpus(void) + { + smp_send_debugger_break(); +diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c +index fb95f92dcfac..308765f2e7a0 100644 +--- a/arch/powerpc/kernel/smp.c ++++ b/arch/powerpc/kernel/smp.c +@@ -590,6 +590,11 @@ static void debugger_ipi_callback(struct pt_regs *regs) + debugger_ipi(regs); + } + ++void smp_send_debugger_break_cpu(unsigned int cpu) ++{ ++ smp_send_nmi_ipi(cpu, debugger_ipi_callback, 1000000); ++} ++ + void smp_send_debugger_break(void) + { + smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000); +diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c +index a08bb7cefdc5..ae34f68eedc1 100644 +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -260,12 +260,17 @@ static char *get_mmu_str(void) + + static int __die(const char *str, struct pt_regs *regs, long err) + { ++ const char *pr = ""; ++ + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; ++ + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, get_mmu_str(), +- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", ++ pr, + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? 
" DEBUG_PAGEALLOC" : "", +diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig +index ff581d70f20c..e5c84d55bdfb 100644 +--- a/arch/powerpc/kvm/Kconfig ++++ b/arch/powerpc/kvm/Kconfig +@@ -178,6 +178,7 @@ config KVM_E500MC + config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 ++ depends on !PREEMPT_RT + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING +diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c +index ec5d84b4958c..62a80ecc6735 100644 +--- a/arch/powerpc/platforms/pseries/iommu.c ++++ b/arch/powerpc/platforms/pseries/iommu.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -200,7 +201,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + return ret; + } + +-static DEFINE_PER_CPU(__be64 *, tce_page); ++struct tce_page { ++ __be64 * page; ++ local_lock_t lock; ++}; ++static DEFINE_PER_CPU(struct tce_page, tce_page) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, +@@ -223,9 +230,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + direction, attrs); + } + +- local_irq_save(flags); /* to protect tcep and the page behind it */ ++ /* to protect tcep and the page behind it */ ++ local_lock_irqsave(&tce_page.lock, flags); + +- tcep = __this_cpu_read(tce_page); ++ tcep = __this_cpu_read(tce_page.page); + + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() +@@ -234,12 +242,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + /* If allocation fails, fall back to the loop implementation */ + if (!tcep) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + return 
tce_build_pSeriesLP(tbl->it_index, tcenum, + tceshift, + npages, uaddr, direction, attrs); + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + rpn = __pa(uaddr) >> tceshift; +@@ -269,7 +277,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcenum += limit; + } while (npages > 0 && !rc); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; +@@ -454,16 +462,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + DMA_BIDIRECTIONAL, 0); + } + +- local_irq_disable(); /* to protect tcep and the page behind it */ +- tcep = __this_cpu_read(tce_page); ++ /* to protect tcep and the page behind it */ ++ local_lock_irq(&tce_page.lock); ++ tcep = __this_cpu_read(tce_page.page); + + if (!tcep) { + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return -ENOMEM; + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; +@@ -506,7 +515,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + + /* error cleanup: caller will clear whole range */ + +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return rc; + } + +diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h +index f398e7638dd6..5a35a49505da 100644 +--- a/arch/riscv/include/asm/spinlock_types.h ++++ b/arch/riscv/include/asm/spinlock_types.h +@@ -6,7 +6,7 @@ + #ifndef _ASM_RISCV_SPINLOCK_TYPES_H + #define _ASM_RISCV_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h +index a2bbfd7df85f..b69695e39957 100644 +--- 
a/arch/s390/include/asm/spinlock_types.h ++++ b/arch/s390/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h +index e82369f286a2..907bda4b1619 100644 +--- a/arch/sh/include/asm/spinlock_types.h ++++ b/arch/sh/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SH_SPINLOCK_TYPES_H + #define __ASM_SH_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c +index ef0f0827cf57..2d3eca8fee01 100644 +--- a/arch/sh/kernel/irq.c ++++ b/arch/sh/kernel/irq.c +@@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu) + hardirq_ctx[cpu] = NULL; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct thread_info *curctx; +@@ -176,6 +177,7 @@ void do_softirq_own_stack(void) + "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" + ); + } ++#endif + #else + static inline void handle_one_irq(unsigned int irq) + { +diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c +index c8848bb681a1..41fa1be980a3 100644 +--- a/arch/sparc/kernel/irq_64.c ++++ b/arch/sparc/kernel/irq_64.c +@@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) + set_irq_regs(old_regs); + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + void *orig_sp, *sp = softirq_stack[smp_processor_id()]; +@@ -869,6 +870,7 @@ void do_softirq_own_stack(void) + __asm__ __volatile__("mov %0, %%sp" + : : "r" (orig_sp)); + } ++#endif + + #ifdef CONFIG_HOTPLUG_CPU + void fixup_irqs(void) +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a08ce6360382..4a4498670861 100644 +--- a/arch/x86/Kconfig ++++ 
b/arch/x86/Kconfig +@@ -107,6 +107,7 @@ config X86 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST + select ARCH_USE_QUEUED_RWLOCKS +@@ -230,6 +231,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h +index e087cd7837c3..96cc92f63b06 100644 +--- a/arch/x86/include/asm/irq_stack.h ++++ b/arch/x86/include/asm/irq_stack.h +@@ -202,6 +202,7 @@ + IRQ_CONSTRAINTS, regs, vector); \ + } + ++#ifndef CONFIG_PREEMPT_RT + /* + * Macro to invoke __do_softirq on the irq stack. This is only called from + * task context when bottom halves are about to be reenabled and soft +@@ -215,6 +216,8 @@ + __this_cpu_write(hardirq_stack_inuse, false); \ + } + ++#endif ++ + #else /* CONFIG_X86_64 */ + /* System vector handlers always run on the stack they interrupted. */ + #define run_sysvec_on_irqstack_cond(func, regs) \ +diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h +index fe5efbcba824..ab8cb5fc2329 100644 +--- a/arch/x86/include/asm/preempt.h ++++ b/arch/x86/include/asm/preempt.h +@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) + * a decrement which hits zero means we have no preempt_count and should + * reschedule. 
+ */ +-static __always_inline bool __preempt_count_dec_and_test(void) ++static __always_inline bool ____preempt_count_dec_and_test(void) + { + return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + } + ++static __always_inline bool __preempt_count_dec_and_test(void) ++{ ++ if (____preempt_count_dec_and_test()) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if (preempt_count()) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif ++} ++ + /* + * Returns true when we need to resched and can (barring IRQ state). + */ + static __always_inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u32 tmp; ++ tmp = raw_cpu_read_4(__preempt_count); ++ if (tmp == preempt_offset) ++ return true; ++ ++ /* preempt count == 0 ? */ ++ tmp &= ~PREEMPT_NEED_RESCHED; ++ if (tmp != preempt_offset) ++ return false; ++ /* XXX PREEMPT_LOCK_OFFSET */ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); ++#endif + } + + #ifdef CONFIG_PREEMPTION +diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h +index 2dfb5fea13af..fc03f4f7ed84 100644 +--- a/arch/x86/include/asm/signal.h ++++ b/arch/x86/include/asm/signal.h +@@ -28,6 +28,19 @@ typedef struct { + #define SA_IA32_ABI 0x02000000u + #define SA_X32_ABI 0x01000000u + ++/* ++ * Because some traps use the IST stack, we must keep preemption ++ * disabled while calling do_trap(), but do_trap() may call ++ * force_sig_info() which will grab the signal spin_locks for the ++ * task, which in PREEMPT_RT are mutexes. By defining ++ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set ++ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the ++ * trap. 
++ */ ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ + #ifndef CONFIG_COMPAT + #define compat_sigset_t compat_sigset_t + typedef sigset_t compat_sigset_t; +diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h +index 24a8d6c4fb18..2fc22c27df18 100644 +--- a/arch/x86/include/asm/stackprotector.h ++++ b/arch/x86/include/asm/stackprotector.h +@@ -50,7 +50,7 @@ + */ + static __always_inline void boot_init_stack_canary(void) + { +- u64 canary; ++ u64 canary = 0; + u64 tsc; + + #ifdef CONFIG_X86_64 +@@ -61,8 +61,14 @@ static __always_inline void boot_init_stack_canary(void) + * of randomness. The TSC only matters for very early init, + * there it already has some randomness on most systems. Later + * on during the bootup the random pool has true entropy too. ++ * For preempt-rt we need to weaken the randomness a bit, as ++ * we can't call into the random generator from atomic context ++ * due to locking constraints. We just leave canary ++ * uninitialized and use the TSC based randomness on top of it. 
+ */ ++#ifndef CONFIG_PREEMPT_RT + get_random_bytes(&canary, sizeof(canary)); ++#endif + tsc = rdtsc(); + canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index cf132663c219..75dc786e6365 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -57,11 +57,14 @@ struct thread_info { + unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ + u32 status; /* thread synchronous flags */ ++ int preempt_lazy_count; /* 0 => lazy preemptable ++ <0 => BUG */ + }; + + #define INIT_THREAD_INFO(tsk) \ + { \ + .flags = 0, \ ++ .preempt_lazy_count = 0, \ + } + + #else /* !__ASSEMBLY__ */ +@@ -90,6 +93,7 @@ struct thread_info { + #define TIF_NOTSC 16 /* TSC is not accessible in userland */ + #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ + #define TIF_SLD 18 /* Restore split lock detection on context switch */ ++#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ + #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ + #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ + #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +@@ -114,6 +118,7 @@ struct thread_info { + #define _TIF_NOTSC (1 << TIF_NOTSC) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) + #define _TIF_SLD (1 << TIF_SLD) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) + #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) + #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) +diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c +index 044902d5a3c4..e5dd6da78713 100644 +--- a/arch/x86/kernel/irq_32.c ++++ b/arch/x86/kernel/irq_32.c +@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) + return 0; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct irq_stack *irqstk; 
+@@ -148,6 +149,7 @@ void do_softirq_own_stack(void) + + call_on_stack(__do_softirq, isp); + } ++#endif + + void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) + { +diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c +index 3a43a2dee658..37bd37cdf2b6 100644 +--- a/arch/x86/kernel/kgdb.c ++++ b/arch/x86/kernel/kgdb.c +@@ -502,9 +502,12 @@ static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) + if (atomic_read(&kgdb_active) != -1) { + /* KGDB CPU roundup */ + cpu = raw_smp_processor_id(); +- kgdb_nmicallback(cpu, regs); +- set_bit(cpu, was_in_debug_nmi); +- touch_nmi_watchdog(); ++ ++ if (!kgdb_roundup_delay(cpu)) { ++ kgdb_nmicallback(cpu, regs); ++ set_bit(cpu, was_in_debug_nmi); ++ touch_nmi_watchdog(); ++ } + + return NMI_HANDLED; + } +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7e1e3bc74562..38639c57b462 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8686,6 +8686,14 @@ int kvm_arch_init(void *opaque) + goto out; + } + ++#ifdef CONFIG_PREEMPT_RT ++ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { ++ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); ++ r = -EOPNOTSUPP; ++ goto out; ++ } ++#endif ++ + r = -ENOMEM; + x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), + __alignof__(struct fpu), SLAB_ACCOUNT, +diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h +index 64c9389254f1..797aed7df3dd 100644 +--- a/arch/xtensa/include/asm/spinlock_types.h ++++ b/arch/xtensa/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) ++#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) + # error "please don't include this file directly" + #endif + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index bbbbcd2c1941..0fc928de505d 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -1567,14 +1567,14 @@ static 
void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + return; + + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { +- int cpu = get_cpu(); ++ int cpu = get_cpu_light(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); +- put_cpu(); ++ put_cpu_light(); + return; + } + +- put_cpu(); ++ put_cpu_light(); + } + + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, +diff --git a/crypto/testmgr.c b/crypto/testmgr.c +index 163a1283a866..444183fe847d 100644 +--- a/crypto/testmgr.c ++++ b/crypto/testmgr.c +@@ -1061,14 +1061,14 @@ static void generate_random_testvec_config(struct testvec_config *cfg, + + static void crypto_disable_simd_for_test(void) + { +- preempt_disable(); ++ migrate_disable(); + __this_cpu_write(crypto_simd_disabled_for_test, true); + } + + static void crypto_reenable_simd_for_test(void) + { + __this_cpu_write(crypto_simd_disabled_for_test, false); +- preempt_enable(); ++ migrate_enable(); + } + + /* +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index 6383c81ac5b3..abb695f5f5e4 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); + static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + ++#ifdef CONFIG_PREEMPT_RT ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) ++{ ++ size_t index; ++ ++ for (index = 0; index < num_pages; index++) ++ spin_lock_init(&zram->table[index].lock); ++} ++ ++static int zram_slot_trylock(struct zram *zram, u32 index) ++{ ++ int ret; ++ ++ ret = spin_trylock(&zram->table[index].lock); ++ if (ret) ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++ return ret; ++} ++ ++static void zram_slot_lock(struct zram *zram, u32 index) ++{ ++ spin_lock(&zram->table[index].lock); ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++} ++ ++static 
void zram_slot_unlock(struct zram *zram, u32 index) ++{ ++ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); ++ spin_unlock(&zram->table[index].lock); ++} ++ ++#else ++ ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } + + static int zram_slot_trylock(struct zram *zram, u32 index) + { +@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) + { + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + } ++#endif + + static inline bool init_done(struct zram *zram) + { +@@ -1169,6 +1204,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) + + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); ++ zram_meta_init_table_locks(zram, num_pages); + return true; + } + +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index 80c3b43b4828..d8f6d880f915 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -63,6 +63,7 @@ struct zram_table_entry { + unsigned long element; + }; + unsigned long flags; ++ spinlock_t lock; + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; + #endif +diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c +index dfb463ee7ca1..b19c4f745ee3 100644 +--- a/drivers/char/tpm/tpm_tis.c ++++ b/drivers/char/tpm/tpm_tis.c +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da + return container_of(data, struct tpm_tis_tcg_phy, priv); + } + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Flushes previous write operations to chip so that a subsequent ++ * ioread*()s won't stall a cpu. 
++ */ ++static inline void tpm_tis_flush(void __iomem *iobase) ++{ ++ ioread8(iobase + TPM_ACCESS(0)); ++} ++#else ++#define tpm_tis_flush(iobase) do { } while (0) ++#endif ++ ++static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite8(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ ++static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite32(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ + static int interrupts = -1; + module_param(interrupts, int, 0444); + MODULE_PARM_DESC(interrupts, "Enable interrupts"); +@@ -186,7 +211,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + + while (len--) +- iowrite8(*value++, phy->iobase + addr); ++ tpm_tis_iowrite8(*value++, phy->iobase, addr); + + return 0; + } +@@ -213,7 +238,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) + { + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + +- iowrite32(value, phy->iobase + addr); ++ tpm_tis_iowrite32(value, phy->iobase, addr); + + return 0; + } +diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c +index 332739f3eded..8589df0e8c1f 100644 +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -66,7 +66,7 @@ struct mm_struct efi_mm = { + + struct workqueue_struct *efi_rts_wq; + +-static bool disable_runtime; ++static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); + static int __init setup_noefi(char *arg) + { + disable_runtime = true; +@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) + if (parse_option_str(str, "noruntime")) + disable_runtime = true; + ++ if (parse_option_str(str, "runtime")) ++ disable_runtime = false; ++ + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + +diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c +index 
254e67141a77..7a39029b083f 100644 +--- a/drivers/gpu/drm/i915/display/intel_crtc.c ++++ b/drivers/gpu/drm/i915/display/intel_crtc.c +@@ -425,7 +425,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + */ + intel_psr_wait_for_idle(new_crtc_state); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + + crtc->debug.min_vbl = min; + crtc->debug.max_vbl = max; +@@ -450,11 +451,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + break; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + timeout = schedule_timeout(timeout); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + finish_wait(wq, &wait); +@@ -487,7 +490,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + return; + + irq_disable: +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) +@@ -566,7 +570,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) + new_crtc_state->uapi.event = NULL; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + /* Send VRR Push to terminate Vblank */ + intel_vrr_send_push(new_crtc_state); +diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +index 209cf265bf74..6e1b9068d944 100644 +--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c ++++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +@@ -311,10 +311,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) + /* Kick the work once more to drain the signalers, and disarm the irq */ + irq_work_sync(&b->irq_work); + while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { +- local_irq_disable(); +- signal_irq_work(&b->irq_work); +- local_irq_enable(); ++ irq_work_queue(&b->irq_work); + cond_resched(); ++ 
irq_work_sync(&b->irq_work); + } + } + +diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h +index c41098950746..601274ba86e4 100644 +--- a/drivers/gpu/drm/i915/gt/intel_context.h ++++ b/drivers/gpu/drm/i915/gt/intel_context.h +@@ -163,7 +163,8 @@ static inline void intel_context_enter(struct intel_context *ce) + + static inline void intel_context_mark_active(struct intel_context *ce) + { +- lockdep_assert_held(&ce->timeline->mutex); ++ lockdep_assert(lockdep_is_held(&ce->timeline->mutex) || ++ test_bit(CONTEXT_IS_PARKED, &ce->flags)); + ++ce->active_count; + } + +diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h +index a63631ea0ec4..314457fb9db5 100644 +--- a/drivers/gpu/drm/i915/gt/intel_context_types.h ++++ b/drivers/gpu/drm/i915/gt/intel_context_types.h +@@ -112,6 +112,7 @@ struct intel_context { + #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 + #define CONTEXT_NOPREEMPT 8 + #define CONTEXT_LRCA_DIRTY 9 ++#define CONTEXT_IS_PARKED 10 + + struct { + u64 timeout_us; +diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +index dacd62773735..73e96ca024df 100644 +--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c ++++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +@@ -80,39 +80,6 @@ static int __engine_unpark(struct intel_wakeref *wf) + return 0; + } + +-#if IS_ENABLED(CONFIG_LOCKDEP) +- +-static unsigned long __timeline_mark_lock(struct intel_context *ce) +-{ +- unsigned long flags; +- +- local_irq_save(flags); +- mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); +- +- return flags; +-} +- +-static void __timeline_mark_unlock(struct intel_context *ce, +- unsigned long flags) +-{ +- mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); +- local_irq_restore(flags); +-} +- +-#else +- +-static unsigned long __timeline_mark_lock(struct intel_context *ce) +-{ +- return 0; +-} +- +-static void __timeline_mark_unlock(struct 
intel_context *ce, +- unsigned long flags) +-{ +-} +- +-#endif /* !IS_ENABLED(CONFIG_LOCKDEP) */ +- + static void duration(struct dma_fence *fence, struct dma_fence_cb *cb) + { + struct i915_request *rq = to_request(fence); +@@ -159,7 +126,6 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + { + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; +- unsigned long flags; + bool result = true; + + /* GPU is pointing to the void, as good as in the kernel context. */ +@@ -201,7 +167,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + * engine->wakeref.count, we may see the request completion and retire + * it causing an underflow of the engine->wakeref. + */ +- flags = __timeline_mark_lock(ce); ++ set_bit(CONTEXT_IS_PARKED, &ce->flags); + GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); + + rq = __i915_request_create(ce, GFP_NOWAIT); +@@ -233,7 +199,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + + result = false; + out_unlock: +- __timeline_mark_unlock(ce, flags); ++ clear_bit(CONTEXT_IS_PARKED, &ce->flags); + return result; + } + +diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +index 773ff5121833..f330457209d5 100644 +--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c ++++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +@@ -1286,7 +1286,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + * and context switches) submission. + */ + +- spin_lock(&sched_engine->lock); ++ spin_lock_irq(&sched_engine->lock); + + /* + * If the queue is higher priority than the last +@@ -1386,7 +1386,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + * Even if ELSP[1] is occupied and not worthy + * of timeslices, our queue might be. 
+ */ +- spin_unlock(&sched_engine->lock); ++ spin_unlock_irq(&sched_engine->lock); + return; + } + } +@@ -1412,7 +1412,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + + if (last && !can_merge_rq(last, rq)) { + spin_unlock(&ve->base.sched_engine->lock); +- spin_unlock(&engine->sched_engine->lock); ++ spin_unlock_irq(&engine->sched_engine->lock); + return; /* leave this for another sibling */ + } + +@@ -1574,7 +1574,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + */ + sched_engine->queue_priority_hint = queue_prio(sched_engine); + i915_sched_engine_reset_on_empty(sched_engine); +- spin_unlock(&sched_engine->lock); ++ spin_unlock_irq(&sched_engine->lock); + + /* + * We can skip poking the HW if we ended up with exactly the same set +@@ -1600,13 +1600,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + } + } + +-static void execlists_dequeue_irq(struct intel_engine_cs *engine) +-{ +- local_irq_disable(); /* Suspend interrupts across request submission */ +- execlists_dequeue(engine); +- local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */ +-} +- + static void clear_ports(struct i915_request **ports, int count) + { + memset_p((void **)ports, NULL, count); +@@ -2442,7 +2435,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) + } + + if (!engine->execlists.pending[0]) { +- execlists_dequeue_irq(engine); ++ execlists_dequeue(engine); + start_timeslice(engine); + } + +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index 9bc4f4a8e12e..547347241a47 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -886,7 +886,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + */ + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + +- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. 
*/ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + + /* Get optional system timestamp before query. */ + if (stime) +@@ -950,7 +951,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + if (etime) + *etime = ktime_get(); + +- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); + +diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c +index 79da5eca60af..b9dd6100c6d1 100644 +--- a/drivers/gpu/drm/i915/i915_request.c ++++ b/drivers/gpu/drm/i915/i915_request.c +@@ -559,7 +559,6 @@ bool __i915_request_submit(struct i915_request *request) + + RQ_TRACE(request, "\n"); + +- GEM_BUG_ON(!irqs_disabled()); + lockdep_assert_held(&engine->sched_engine->lock); + + /* +@@ -668,7 +667,6 @@ void __i915_request_unsubmit(struct i915_request *request) + */ + RQ_TRACE(request, "\n"); + +- GEM_BUG_ON(!irqs_disabled()); + lockdep_assert_held(&engine->sched_engine->lock); + + /* +diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h +index 1bc1349ba3c2..a2f713b4ac2f 100644 +--- a/drivers/gpu/drm/i915/i915_request.h ++++ b/drivers/gpu/drm/i915/i915_request.h +@@ -609,7 +609,8 @@ i915_request_timeline(const struct i915_request *rq) + { + /* Valid only while the request is being constructed (or retired). 
*/ + return rcu_dereference_protected(rq->timeline, +- lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex)); ++ lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex) || ++ test_bit(CONTEXT_IS_PARKED, &rq->context->flags)); + } + + static inline struct i915_gem_context * +diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h +index 63fec1c3c132..f345a0f12bf6 100644 +--- a/drivers/gpu/drm/i915/i915_trace.h ++++ b/drivers/gpu/drm/i915/i915_trace.h +@@ -2,6 +2,10 @@ + #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) + #define _I915_TRACE_H_ + ++#ifdef CONFIG_PREEMPT_RT ++#define NOTRACE ++#endif ++ + #include + #include + #include +@@ -819,7 +823,7 @@ DEFINE_EVENT(i915_request, i915_request_add, + TP_ARGS(rq) + ); + +-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) ++#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) + DEFINE_EVENT(i915_request, i915_request_guc_submit, + TP_PROTO(struct i915_request *rq), + TP_ARGS(rq) +diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h +index 5259edacde38..b36b27c09049 100644 +--- a/drivers/gpu/drm/i915/i915_utils.h ++++ b/drivers/gpu/drm/i915/i915_utils.h +@@ -343,7 +343,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) + #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) + + /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ +-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) ++#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) + # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) + #else + # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) +diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c +index 1cf68f85b2e1..8ccf0c928bb4 100644 +--- a/drivers/i2c/busses/i2c-cht-wc.c ++++ b/drivers/i2c/busses/i2c-cht-wc.c +@@ -99,15 +99,8 @@ static irqreturn_t cht_wc_i2c_adap_thread_handler(int id, void *data) + * interrupt handler as well, so running the client irq handler from + * this thread will cause things to lock up. + */ +- if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) { +- /* +- * generic_handle_irq expects local IRQs to be disabled +- * as normally it is called from interrupt context. +- */ +- local_irq_disable(); +- generic_handle_irq(adap->client_irq); +- local_irq_enable(); +- } ++ if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) ++ generic_handle_irq_safe(adap->client_irq); + + return IRQ_HANDLED; + } +diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c +index 8fb065caf30b..c232535ca8f4 100644 +--- a/drivers/i2c/i2c-core-base.c ++++ b/drivers/i2c/i2c-core-base.c +@@ -1422,7 +1422,7 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) + if (irq <= 0) + return -ENXIO; + +- generic_handle_irq(irq); ++ generic_handle_irq_safe(irq); + + return 0; + } +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index 1f1d57288085..dc6816d36d06 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT + + config LEDS_TRIGGER_CPU + bool "LED CPU Trigger" ++ depends on !PREEMPT_RT + help + This allows LEDs to be controlled by active CPUs. 
This shows + the active CPUs across an array of LEDs so you can see which +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index c2a42486f985..451a22641b5a 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -2218,8 +2218,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + struct raid5_percpu *percpu; + unsigned long cpu; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + percpu = per_cpu_ptr(conf->percpu, cpu); ++ spin_lock(&percpu->lock); + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + ops_run_biofill(sh); + overlap_clear++; +@@ -2278,7 +2279,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } +- put_cpu(); ++ spin_unlock(&percpu->lock); ++ put_cpu_light(); + } + + static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +@@ -7110,6 +7112,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) + __func__, cpu); + return -ENOMEM; + } ++ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); + return 0; + } + +diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h +index 5c05acf20e1f..665fe138ab4f 100644 +--- a/drivers/md/raid5.h ++++ b/drivers/md/raid5.h +@@ -635,6 +635,7 @@ struct r5conf { + int recovery_disabled; + /* per cpu variables */ + struct raid5_percpu { ++ spinlock_t lock; /* Protection for -RT */ + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address +diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c +index 70fa18b04ad2..b14d3f98e1eb 100644 +--- a/drivers/mfd/ezx-pcap.c ++++ b/drivers/mfd/ezx-pcap.c +@@ -193,13 +193,11 @@ static void pcap_isr_work(struct work_struct *work) + ezx_pcap_write(pcap, PCAP_REG_MSR, isr | msr); + ezx_pcap_write(pcap, PCAP_REG_ISR, isr); + +- local_irq_disable(); + service = isr & ~msr; + for (irq = 
pcap->irq_base; service; service >>= 1, irq++) { + if (service & 1) +- generic_handle_irq(irq); ++ generic_handle_irq_safe(irq); + } +- local_irq_enable(); + ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); + } while (gpio_get_value(pdata->gpio)); + } +diff --git a/drivers/misc/hi6421v600-irq.c b/drivers/misc/hi6421v600-irq.c +index 08535e97ff43..0585a5821d05 100644 +--- a/drivers/misc/hi6421v600-irq.c ++++ b/drivers/misc/hi6421v600-irq.c +@@ -118,8 +118,8 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) + * If both powerkey down and up IRQs are received, + * handle them at the right order + */ +- generic_handle_irq(priv->irqs[POWERKEY_DOWN]); +- generic_handle_irq(priv->irqs[POWERKEY_UP]); ++ generic_handle_irq_safe(priv->irqs[POWERKEY_DOWN]); ++ generic_handle_irq_safe(priv->irqs[POWERKEY_UP]); + pending &= ~HISI_IRQ_POWERKEY_UP_DOWN; + } + +@@ -127,7 +127,7 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) + continue; + + for_each_set_bit(offset, &pending, BITS_PER_BYTE) { +- generic_handle_irq(priv->irqs[offset + i * BITS_PER_BYTE]); ++ generic_handle_irq_safe(priv->irqs[offset + i * BITS_PER_BYTE]); + } + } + +diff --git a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c +index 2473fb5f75e5..2a5cc64227e9 100644 +--- a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c ++++ b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c +@@ -458,7 +458,7 @@ nfp_abm_qdisc_graft(struct nfp_abm_link *alink, u32 handle, u32 child_handle, + static void + nfp_abm_stats_calculate(struct nfp_alink_stats *new, + struct nfp_alink_stats *old, +- struct gnet_stats_basic_packed *bstats, ++ struct gnet_stats_basic_sync *bstats, + struct gnet_stats_queue *qstats) + { + _bstats_update(bstats, new->tx_bytes - old->tx_bytes, +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index 5700c9d20a3e..be3330a1c922 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -1367,11 +1367,8 
@@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb) + netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); + lan78xx_defer_kevent(dev, EVENT_LINK_RESET); + +- if (dev->domain_data.phyirq > 0) { +- local_irq_disable(); +- generic_handle_irq(dev->domain_data.phyirq); +- local_irq_enable(); +- } ++ if (dev->domain_data.phyirq > 0) ++ generic_handle_irq_safe(dev->domain_data.phyirq); + } else { + netdev_warn(dev->net, + "unexpected interrupt: 0x%08x\n", intdata); +diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c +index 76dbdae0e987..967431858dcd 100644 +--- a/drivers/scsi/fcoe/fcoe.c ++++ b/drivers/scsi/fcoe/fcoe.c +@@ -1450,11 +1450,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, + static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) + { + struct fcoe_percpu_s *fps; +- int rc; ++ int rc, cpu = get_cpu_light(); + +- fps = &get_cpu_var(fcoe_percpu); ++ fps = &per_cpu(fcoe_percpu, cpu); + rc = fcoe_get_paged_crc_eof(skb, tlen, fps); +- put_cpu_var(fcoe_percpu); ++ put_cpu_light(); + + return rc; + } +@@ -1639,11 +1639,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, + return 0; + } + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + stats->InvalidCRCCount++; + if (stats->InvalidCRCCount < 5) + printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); +- put_cpu(); ++ put_cpu_light(); + return -EINVAL; + } + +@@ -1684,7 +1684,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) + */ + hp = (struct fcoe_hdr *) skb_network_header(skb); + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { + if (stats->ErrorFrames < 5) + printk(KERN_WARNING "fcoe: FCoE version " +@@ -1716,13 +1716,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) + goto drop; + + if (!fcoe_filter_frames(lport, fp)) { +- put_cpu(); ++ 
put_cpu_light(); + fc_exch_recv(lport, fp); + return; + } + drop: + stats->ErrorFrames++; +- put_cpu(); ++ put_cpu_light(); + kfree_skb(skb); + } + +diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c +index 558f3f4e1859..f08feaa4f398 100644 +--- a/drivers/scsi/fcoe/fcoe_ctlr.c ++++ b/drivers/scsi/fcoe/fcoe_ctlr.c +@@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + + INIT_LIST_HEAD(&del_list); + +- stats = per_cpu_ptr(fip->lp->stats, get_cpu()); ++ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); + + list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { + deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; +@@ -864,7 +864,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + sel_time = fcf->time; + } + } +- put_cpu(); ++ put_cpu_light(); + + list_for_each_entry_safe(fcf, next, &del_list, list) { + /* Removes fcf from current list */ +diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c +index aa223db4cf53..0ceb93800704 100644 +--- a/drivers/scsi/libfc/fc_exch.c ++++ b/drivers/scsi/libfc/fc_exch.c +@@ -825,10 +825,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, + } + memset(ep, 0, sizeof(*ep)); + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = per_cpu_ptr(mp->pool, cpu); + spin_lock_bh(&pool->lock); +- put_cpu(); ++ put_cpu_light(); + + /* peek cache of free slot */ + if (pool->left != FC_XID_UNKNOWN) { +diff --git a/drivers/staging/greybus/gpio.c b/drivers/staging/greybus/gpio.c +index 7e6347fe93f9..8a7cf1d0e968 100644 +--- a/drivers/staging/greybus/gpio.c ++++ b/drivers/staging/greybus/gpio.c +@@ -391,10 +391,7 @@ static int gb_gpio_request_handler(struct gb_operation *op) + return -EINVAL; + } + +- local_irq_disable(); +- ret = generic_handle_irq(irq); +- local_irq_enable(); +- ++ ret = generic_handle_irq_safe(irq); + if (ret) + dev_err(dev, "failed to invoke irq handler\n"); + +diff --git a/drivers/tty/serial/8250/8250.h 
b/drivers/tty/serial/8250/8250.h +index bb1a98c97adf..8639210a89c7 100644 +--- a/drivers/tty/serial/8250/8250.h ++++ b/drivers/tty/serial/8250/8250.h +@@ -156,12 +156,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) + up->dl_write(up, value); + } + ++static inline void serial8250_set_IER(struct uart_8250_port *up, ++ unsigned char ier) ++{ ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(flags); ++ ++ serial_out(up, UART_IER, ier); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++} ++ ++static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int clearval = 0; ++ unsigned long flags; ++ unsigned int prior; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ clearval = UART_IER_UUE; ++ ++ if (is_console) ++ console_atomic_lock(flags); ++ ++ prior = serial_port_in(port, UART_IER); ++ serial_port_out(port, UART_IER, clearval); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++ ++ return prior; ++} ++ + static inline bool serial8250_set_THRI(struct uart_8250_port *up) + { + if (up->ier & UART_IER_THRI) + return false; + up->ier |= UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +@@ -170,7 +213,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) + if (!(up->ier & UART_IER_THRI)) + return false; + up->ier &= ~UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c +index 1890f342f090..16d5d450b32f 100644 +--- a/drivers/tty/serial/8250/8250_core.c ++++ b/drivers/tty/serial/8250/8250_core.c +@@ -265,10 +265,8 @@ static void serial8250_backup_timeout(struct timer_list 
*t) + * Must disable interrupts or else we risk racing with the interrupt + * based handler. + */ +- if (up->port.irq) { +- ier = serial_in(up, UART_IER); +- serial_out(up, UART_IER, 0); +- } ++ if (up->port.irq) ++ ier = serial8250_clear_IER(up); + + iir = serial_in(up, UART_IIR); + +@@ -291,7 +289,7 @@ static void serial8250_backup_timeout(struct timer_list *t) + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -671,6 +677,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c +index 6a22f3a970f3..a6c02140eff0 100644 +--- a/drivers/tty/serial/8250/8250_fsl.c ++++ b/drivers/tty/serial/8250/8250_fsl.c +@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) + + /* Stop processing interrupts on input overrun */ + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { ++ unsigned long flags; + unsigned long delay; ++ bool is_console; + ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(flags); + up->ier = port->serial_in(port, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); ++ + if (up->ier & (UART_IER_RLSI | 
UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c +index 65402d05eff9..8122645ab05c 100644 +--- a/drivers/tty/serial/8250/8250_ingenic.c ++++ b/drivers/tty/serial/8250/8250_ingenic.c +@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", + + static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + { ++ unsigned long flags; ++ bool is_console; + int ier; + + switch (offset) { +@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + * If we have enabled modem status IRQs we should enable + * modem mode. + */ ++ is_console = uart_console(p); ++ if (is_console) ++ console_atomic_lock(flags); + ier = p->serial_in(p, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); + + if (ier & UART_IER_MSI) + value |= UART_MCR_MDCE | UART_MCR_FCM; +diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c +index de48a58460f4..364ee950f21a 100644 +--- a/drivers/tty/serial/8250/8250_mtk.c ++++ b/drivers/tty/serial/8250/8250_mtk.c +@@ -222,12 +222,37 @@ static void mtk8250_shutdown(struct uart_port *port) + + static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier & (~mask)); ++ ++ if (is_console) ++ console_atomic_unlock(flags); + } + + static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier; ++ ++ if (uart_console(port)) ++ console_atomic_lock(flags); ++ ++ 
ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier | mask); ++ ++ if (uart_console(port)) ++ console_atomic_unlock(flags); + } + + static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) +diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c +index bfdd9ecc2baf..479b94b3238a 100644 +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -752,7 +752,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) + serial_out(p, UART_EFR, UART_EFR_ECB); + serial_out(p, UART_LCR, 0); + } +- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); + if (p->capabilities & UART_CAP_EFR) { + serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); + serial_out(p, UART_EFR, efr); +@@ -1427,7 +1427,7 @@ static void serial8250_stop_rx(struct uart_port *port) + + up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); + up->port.read_status_mask &= ~UART_LSR_DR; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + serial8250_rpm_put(up); + } +@@ -1457,7 +1457,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) + serial8250_clear_and_reinit_fifos(p); + + p->ier |= UART_IER_RLSI | UART_IER_RDI; +- serial_port_out(&p->port, UART_IER, p->ier); ++ serial8250_set_IER(p, p->ier); + } + } + EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); +@@ -1693,7 +1693,7 @@ static void serial8250_disable_ms(struct uart_port *port) + mctrl_gpio_disable_ms(up->gpios); + + up->ier &= ~UART_IER_MSI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + + static void serial8250_enable_ms(struct uart_port *port) +@@ -1709,7 +1709,7 @@ static void serial8250_enable_ms(struct uart_port *port) + up->ier |= UART_IER_MSI; + + serial8250_rpm_get(up); +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + serial8250_rpm_put(up); + } + +@@ -2143,14 +2143,7 @@ static void 
serial8250_put_poll_char(struct uart_port *port, + struct uart_8250_port *up = up_to_u8250p(port); + + serial8250_rpm_get(up); +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + wait_for_xmitr(up, BOTH_EMPTY); + /* +@@ -2163,7 +2156,7 @@ static void serial8250_put_poll_char(struct uart_port *port, + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + serial8250_rpm_put(up); + } + +@@ -2468,7 +2461,7 @@ void serial8250_do_shutdown(struct uart_port *port) + */ + spin_lock_irqsave(&port->lock, flags); + up->ier = 0; +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + spin_unlock_irqrestore(&port->lock, flags); + + synchronize_irq(port->irq); +@@ -2850,7 +2843,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, + if (up->capabilities & UART_CAP_RTOIE) + up->ier |= UART_IER_RTOIE; + +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + if (up->capabilities & UART_CAP_EFR) { + unsigned char efr = 0; +@@ -3315,7 +3308,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); + + #ifdef CONFIG_SERIAL_8250_CONSOLE + +-static void serial8250_console_putchar(struct uart_port *port, int ch) ++static void serial8250_console_putchar_locked(struct uart_port *port, int ch) + { + struct uart_8250_port *up = up_to_u8250p(port); + +@@ -3323,6 +3316,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) + serial_port_out(port, UART_TX, ch); + } + ++static void serial8250_console_putchar(struct uart_port *port, int ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ ++ console_atomic_lock(flags); ++ 
serial8250_console_putchar_locked(port, ch); ++ console_atomic_unlock(flags); ++} ++ + /* + * Restore serial console when h/w power-off detected + */ +@@ -3349,6 +3354,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) + serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); + } + ++void serial8250_console_write_atomic(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier; ++ ++ console_atomic_lock(flags); ++ ++ touch_nmi_watchdog(); ++ ++ ier = serial8250_clear_IER(up); ++ ++ if (atomic_fetch_inc(&up->console_printing)) { ++ uart_console_write(port, "\n", 1, ++ serial8250_console_putchar_locked); ++ } ++ uart_console_write(port, s, count, serial8250_console_putchar_locked); ++ atomic_dec(&up->console_printing); ++ ++ wait_for_xmitr(up, BOTH_EMPTY); ++ serial8250_set_IER(up, ier); ++ ++ console_atomic_unlock(flags); ++} ++ + /* + * Print a string to the serial port trying not to disturb + * any possible real use of the port... 
+@@ -3365,24 +3396,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; +- int locked = 1; + + touch_nmi_watchdog(); + +- if (oops_in_progress) +- locked = spin_trylock_irqsave(&port->lock, flags); +- else +- spin_lock_irqsave(&port->lock, flags); +- +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); ++ spin_lock_irqsave(&port->lock, flags); + +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { +@@ -3396,7 +3415,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + mdelay(port->rs485.delay_rts_before_send); + } + ++ atomic_inc(&up->console_printing); + uart_console_write(port, s, count, serial8250_console_putchar); ++ atomic_dec(&up->console_printing); + + /* + * Finally, wait for transmitter to become empty +@@ -3409,8 +3430,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (em485->tx_stopped) + up->rs485_stop_tx(up); + } +- +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + /* + * The receive handling will happen properly because the +@@ -3422,8 +3442,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (up->msr_saved_flags) + serial8250_modem_status(up); + +- if (locked) +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock_irqrestore(&port->lock, flags); + } + + static unsigned int probe_baud(struct uart_port *port) +@@ -3443,6 +3462,7 @@ static unsigned int probe_baud(struct uart_port *port) + + int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + { ++ struct uart_8250_port *up = 
up_to_u8250p(port); + int baud = 9600; + int bits = 8; + int parity = 'n'; +@@ -3452,6 +3472,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + if (!port->iobase && !port->membase) + return -ENODEV; + ++ atomic_set(&up->console_printing, 0); ++ + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else if (probe) +diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c +index b91fe25a64a1..5986658e130b 100644 +--- a/drivers/tty/serial/amba-pl011.c ++++ b/drivers/tty/serial/amba-pl011.c +@@ -2340,18 +2340,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + { + struct uart_amba_port *uap = amba_ports[co->index]; + unsigned int old_cr = 0, new_cr; +- unsigned long flags; ++ unsigned long flags = 0; + int locked = 1; + + clk_enable(uap->clk); + +- local_irq_save(flags); ++ /* ++ * local_irq_save(flags); ++ * ++ * This local_irq_save() is nonsense. If we come in via sysrq ++ * handling then interrupts are already disabled. Aside of ++ * that the port.sysrq check is racy on SMP regardless. 
++ */ + if (uap->port.sysrq) + locked = 0; + else if (oops_in_progress) +- locked = spin_trylock(&uap->port.lock); ++ locked = spin_trylock_irqsave(&uap->port.lock, flags); + else +- spin_lock(&uap->port.lock); ++ spin_lock_irqsave(&uap->port.lock, flags); + + /* + * First save the CR then disable the interrupts +@@ -2377,8 +2383,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + pl011_write(old_cr, uap, REG_CR); + + if (locked) +- spin_unlock(&uap->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&uap->port.lock, flags); + + clk_disable(uap->clk); + } +diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c +index 0862941862c8..10970632f0e4 100644 +--- a/drivers/tty/serial/omap-serial.c ++++ b/drivers/tty/serial/omap-serial.c +@@ -1255,13 +1255,10 @@ serial_omap_console_write(struct console *co, const char *s, + unsigned int ier; + int locked = 1; + +- local_irq_save(flags); +- if (up->port.sysrq) +- locked = 0; +- else if (oops_in_progress) +- locked = spin_trylock(&up->port.lock); ++ if (up->port.sysrq || oops_in_progress) ++ locked = spin_trylock_irqsave(&up->port.lock, flags); + else +- spin_lock(&up->port.lock); ++ spin_lock_irqsave(&up->port.lock, flags); + + /* + * First save the IER then disable the interrupts +@@ -1288,8 +1285,7 @@ serial_omap_console_write(struct console *co, const char *s, + check_modem_status(up); + + if (locked) +- spin_unlock(&up->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&up->port.lock, flags); + } + + static int __init +diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c +index df5184979b28..d4ad211dce7a 100644 +--- a/drivers/virt/acrn/irqfd.c ++++ b/drivers/virt/acrn/irqfd.c +@@ -17,7 +17,6 @@ + #include "acrn_drv.h" + + static LIST_HEAD(acrn_irqfd_clients); +-static DEFINE_MUTEX(acrn_irqfds_mutex); + + /** + * struct hsm_irqfd - Properties of HSM irqfd +diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c +index 
45cfd50a9521..502b56597f10 100644 +--- a/fs/afs/dir_silly.c ++++ b/fs/afs/dir_silly.c +@@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode) + struct dentry *alias; + int ret; + +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); + +diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c +index 1929e80c09ee..48eb8c30c6db 100644 +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -69,7 +69,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, + struct inode *inode; + struct super_block *sb = parent->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); + +diff --git a/fs/dcache.c b/fs/dcache.c +index cf871a81f4fd..02db80f2817f 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -2537,7 +2537,13 @@ EXPORT_SYMBOL(d_rehash); + + static inline unsigned start_dir_add(struct inode *dir) + { +- ++ /* ++ * The caller has a spinlock_t (dentry::d_lock) acquired which disables ++ * preemption on !PREEMPT_RT. On PREEMPT_RT the lock does not disable ++ * preemption and it has be done explicitly. 
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + for (;;) { + unsigned n = dir->i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) +@@ -2549,25 +2555,30 @@ static inline unsigned start_dir_add(struct inode *dir) + static inline void end_dir_add(struct inode *dir, unsigned n) + { + smp_store_release(&dir->i_dir_seq, n + 2); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + } + + static void d_wait_lookup(struct dentry *dentry) + { +- if (d_in_lookup(dentry)) { +- DECLARE_WAITQUEUE(wait, current); +- add_wait_queue(dentry->d_wait, &wait); +- do { +- set_current_state(TASK_UNINTERRUPTIBLE); +- spin_unlock(&dentry->d_lock); +- schedule(); +- spin_lock(&dentry->d_lock); +- } while (d_in_lookup(dentry)); +- } ++ struct swait_queue __wait; ++ ++ if (!d_in_lookup(dentry)) ++ return; ++ ++ INIT_LIST_HEAD(&__wait.task_list); ++ do { ++ prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&dentry->d_lock); ++ schedule(); ++ spin_lock(&dentry->d_lock); ++ } while (d_in_lookup(dentry)); ++ finish_swait(dentry->d_wait, &__wait); + } + + struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, +- wait_queue_head_t *wq) ++ struct swait_queue_head *wq) + { + unsigned int hash = name->hash; + struct hlist_bl_head *b = in_lookup_hash(parent, hash); +@@ -2682,7 +2693,7 @@ void __d_lookup_done(struct dentry *dentry) + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); +- wake_up_all(dentry->d_wait); ++ swake_up_all(dentry->d_wait); + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); +diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h +index c3e4804b8fcb..9edb87e11680 100644 +--- a/fs/fscache/internal.h ++++ b/fs/fscache/internal.h +@@ -81,7 +81,6 @@ extern unsigned fscache_debug; + extern struct kobject *fscache_root; + extern struct workqueue_struct *fscache_object_wq; + 
extern struct workqueue_struct *fscache_op_wq; +-DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + + extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + +diff --git a/fs/fscache/main.c b/fs/fscache/main.c +index 4207f98e405f..85f8cf3a323d 100644 +--- a/fs/fscache/main.c ++++ b/fs/fscache/main.c +@@ -41,8 +41,6 @@ struct kobject *fscache_root; + struct workqueue_struct *fscache_object_wq; + struct workqueue_struct *fscache_op_wq; + +-DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +- + /* these values serve as lower bounds, will be adjusted in fscache_init() */ + static unsigned fscache_object_max_active = 4; + static unsigned fscache_op_max_active = 2; +@@ -138,7 +136,6 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) + static int __init fscache_init(void) + { + unsigned int nr_cpus = num_possible_cpus(); +- unsigned int cpu; + int ret; + + fscache_object_max_active = +@@ -161,9 +158,6 @@ static int __init fscache_init(void) + if (!fscache_op_wq) + goto error_op_wq; + +- for_each_possible_cpu(cpu) +- init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); +- + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; +diff --git a/fs/fscache/object.c b/fs/fscache/object.c +index 6a675652129b..7a972d144b54 100644 +--- a/fs/fscache/object.c ++++ b/fs/fscache/object.c +@@ -798,6 +798,8 @@ void fscache_object_destroy(struct fscache_object *object) + } + EXPORT_SYMBOL(fscache_object_destroy); + ++static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); ++ + /* + * enqueue an object for metadata-type processing + */ +@@ -806,16 +808,12 @@ void fscache_enqueue_object(struct fscache_object *object) + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { +- wait_queue_head_t *cong_wq = +- &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) 
+- wake_up(cong_wq); ++ wake_up(&fscache_object_cong_wait); + } else + fscache_put_object(object, fscache_obj_put_queue); +- +- put_cpu_var(fscache_object_cong_wait); + } + } + +@@ -833,16 +831,15 @@ void fscache_enqueue_object(struct fscache_object *object) + */ + bool fscache_object_sleep_till_congested(signed long *timeoutp) + { +- wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + +- add_wait_queue_exclusive(cong_wq, &wait); ++ add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); +- finish_wait(cong_wq, &wait); ++ finish_wait(&fscache_object_cong_wait, &wait); + + return fscache_object_congested(); + } +diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c +index d5294e663df5..ee8846818b34 100644 +--- a/fs/fuse/readdir.c ++++ b/fs/fuse/readdir.c +@@ -160,7 +160,7 @@ static int fuse_direntplus_link(struct file *file, + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* +diff --git a/fs/namei.c b/fs/namei.c +index 02e99606c65b..c1d11a2e7fa3 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1635,7 +1635,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, + { + struct dentry *dentry, *old; + struct inode *inode = dir->d_inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + /* Don't go there if it's already dead */ + if (unlikely(IS_DEADDIR(inode))) +@@ -3305,7 +3305,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + struct dentry *dentry; + int error, create_error = 0; + umode_t mode = op->mode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (unlikely(IS_DEADDIR(dir_inode))) + return ERR_PTR(-ENOENT); +diff --git a/fs/namespace.c 
b/fs/namespace.c +index 1a9df6afb90b..373b0e738997 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -344,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m) + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); +- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) +- cpu_relax(); ++ might_lock(&mount_lock.lock); ++ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ cpu_relax(); ++ } else { ++ /* ++ * This prevents priority inversion, if the task ++ * setting MNT_WRITE_HOLD got preempted on a remote ++ * CPU, and it prevents life lock if the task setting ++ * MNT_WRITE_HOLD has a lower priority and is bound to ++ * the same CPU as the task that is spinning here. ++ */ ++ preempt_enable(); ++ lock_mount_hash(); ++ unlock_mount_hash(); ++ preempt_disable(); ++ } ++ } + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 32c3d0c454b1..b8ff452317e6 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -637,7 +637,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) + { + struct qstr filename = QSTR_INIT(entry->name, entry->len); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct dentry *dentry; + struct dentry *alias; + struct inode *inode; +@@ -1873,7 +1873,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode) + { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct nfs_open_context *ctx; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; +diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c +index d5ccf095b2a7..0944c068f5cb 100644 +--- a/fs/nfs/unlink.c ++++ b/fs/nfs/unlink.c +@@ -13,7 +13,7 @@ + #include + #include + #include +-#include ++#include + 
#include + #include + +@@ -184,7 +184,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) + + data->cred = get_current_cred(); + data->res.dir_attr = &data->dir_attr; +- init_waitqueue_head(&data->wq); ++ init_swait_queue_head(&data->wq); + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 300d53ee7040..6ab25d4d4037 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -96,6 +96,7 @@ + #include + #include + #include ++#include + #include + #include + #include "internal.h" +@@ -2071,7 +2072,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, + + child = d_hash_and_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + goto end_instantiate; +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index 0b7a00ed6c49..a7828fce675a 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -679,7 +679,7 @@ static bool proc_sys_fill_cache(struct file *file, + + child = d_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + return false; +diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h +index eceeecf6a5bd..d3e2d81656e0 100644 +--- a/include/asm-generic/softirq_stack.h ++++ b/include/asm-generic/softirq_stack.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H + #define __ASM_GENERIC_SOFTIRQ_STACK_H + +-#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK ++#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT) + void do_softirq_own_stack(void); + #else + static inline void do_softirq_own_stack(void) +diff --git a/include/linux/console.h b/include/linux/console.h +index a97f277cfdfa..487a4266ab2c 100644 +--- a/include/linux/console.h ++++ 
b/include/linux/console.h +@@ -16,6 +16,13 @@ + + #include + #include ++#include ++#include ++ ++struct latched_seq { ++ seqcount_latch_t latch; ++ u64 val[2]; ++}; + + struct vc_data; + struct console_font_op; +@@ -136,10 +143,12 @@ static inline int con_debug_leave(void) + #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ + #define CON_BRL (32) /* Used for a braille device */ + #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ ++#define CON_HANDOVER (128) /* Device was previously a boot console. */ + + struct console { + char name[16]; + void (*write)(struct console *, const char *, unsigned); ++ void (*write_atomic)(struct console *co, const char *s, unsigned int count); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +@@ -149,6 +158,16 @@ struct console { + short flags; + short index; + int cflag; ++#ifdef CONFIG_PRINTK ++ char sync_buf[CONSOLE_LOG_MAX]; ++ struct latched_seq printk_seq; ++ struct latched_seq printk_sync_seq; ++#ifdef CONFIG_HAVE_NMI ++ struct latched_seq printk_sync_nmi_seq; ++#endif ++#endif /* CONFIG_PRINTK */ ++ ++ struct task_struct *thread; + uint ispeed; + uint ospeed; + void *data; +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 9e23d33bb6f1..9f89d4887e35 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -108,7 +108,7 @@ struct dentry { + + union { + struct list_head d_lru; /* LRU list */ +- wait_queue_head_t *d_wait; /* in-lookup ones only */ ++ struct swait_queue_head *d_wait; /* in-lookup ones only */ + }; + struct list_head d_child; /* child of parent list */ + struct list_head d_subdirs; /* our children */ +@@ -240,7 +240,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op + extern struct dentry * d_alloc(struct dentry *, const struct qstr *); + extern struct dentry * d_alloc_anon(struct super_block *); + extern struct dentry * 
d_alloc_parallel(struct dentry *, const struct qstr *, +- wait_queue_head_t *); ++ struct swait_queue_head *); + extern struct dentry * d_splice_alias(struct inode *, struct dentry *); + extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); + extern struct dentry * d_exact_alias(struct dentry *, struct inode *); +diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h +index 2e2b8d6140ed..71064a2c2caf 100644 +--- a/include/linux/entry-common.h ++++ b/include/linux/entry-common.h +@@ -57,9 +57,15 @@ + # define ARCH_EXIT_TO_USER_MODE_WORK (0) + #endif + ++#ifdef CONFIG_PREEMPT_LAZY ++# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++#else ++# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED) ++#endif ++ + #define EXIT_TO_USER_MODE_WORK \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ++ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + ARCH_EXIT_TO_USER_MODE_WORK) + + /** +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index ec2a47a81e42..8cd11a223260 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -3,6 +3,7 @@ + #define _LINUX_IRQ_WORK_H + + #include ++#include + + /* + * An entry can be in one of four states: +@@ -16,11 +17,13 @@ + struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); ++ struct rcuwait irqwait; + }; + + #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ + .node = { .u_flags = (_flags), }, \ + .func = (_func), \ ++ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ + } + + #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) +@@ -46,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) + return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; + } + ++static inline bool irq_work_is_hard(struct irq_work *work) ++{ ++ return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; ++} 
++ + bool irq_work_queue(struct irq_work *work); + bool irq_work_queue_on(struct irq_work *work, int cpu); + +diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h +index 59aea39785bf..d69b819b53e0 100644 +--- a/include/linux/irqdesc.h ++++ b/include/linux/irqdesc.h +@@ -160,6 +160,7 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc) + + int handle_irq_desc(struct irq_desc *desc); + int generic_handle_irq(unsigned int irq); ++int generic_handle_irq_safe(unsigned int irq); + + #ifdef CONFIG_IRQ_DOMAIN + /* +diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h +index 747f40e0c326..5ec0fa71399e 100644 +--- a/include/linux/irqflags.h ++++ b/include/linux/irqflags.h +@@ -71,14 +71,6 @@ do { \ + do { \ + __this_cpu_dec(hardirq_context); \ + } while (0) +-# define lockdep_softirq_enter() \ +-do { \ +- current->softirq_context++; \ +-} while (0) +-# define lockdep_softirq_exit() \ +-do { \ +- current->softirq_context--; \ +-} while (0) + + # define lockdep_hrtimer_enter(__hrtimer) \ + ({ \ +@@ -140,6 +132,21 @@ do { \ + # define lockdep_irq_work_exit(__work) do { } while (0) + #endif + ++#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) ++# define lockdep_softirq_enter() \ ++do { \ ++ current->softirq_context++; \ ++} while (0) ++# define lockdep_softirq_exit() \ ++do { \ ++ current->softirq_context--; \ ++} while (0) ++ ++#else ++# define lockdep_softirq_enter() do { } while (0) ++# define lockdep_softirq_exit() do { } while (0) ++#endif ++ + #if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) + extern void stop_critical_timings(void); +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index f56cd8879a59..49f1e924b6e6 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -111,8 +111,8 @@ static __always_inline void might_resched(void) + #endif /* CONFIG_PREEMPT_* */ + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP +-extern void ___might_sleep(const char *file, int line, int 
preempt_offset); +-extern void __might_sleep(const char *file, int line, int preempt_offset); ++extern void __might_resched(const char *file, int line, unsigned int offsets); ++extern void __might_sleep(const char *file, int line); + extern void __cant_sleep(const char *file, int line, int preempt_offset); + extern void __cant_migrate(const char *file, int line); + +@@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line); + * supposed to. + */ + # define might_sleep() \ +- do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) ++ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + /** + * cant_sleep - annotation for functions that cannot sleep + * +@@ -168,10 +168,9 @@ extern void __cant_migrate(const char *file, int line); + */ + # define non_block_end() WARN_ON(current->non_block_count-- == 0) + #else +- static inline void ___might_sleep(const char *file, int line, +- int preempt_offset) { } +- static inline void __might_sleep(const char *file, int line, +- int preempt_offset) { } ++ static inline void __might_resched(const char *file, int line, ++ unsigned int offsets) { } ++static inline void __might_sleep(const char *file, int line) { } + # define might_sleep() do { might_resched(); } while (0) + # define cant_sleep() do { } while (0) + # define cant_migrate() do { } while (0) +diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h +index 258cdde8d356..9bca0d98db5a 100644 +--- a/include/linux/kgdb.h ++++ b/include/linux/kgdb.h +@@ -212,6 +212,8 @@ extern void kgdb_call_nmi_hook(void *ignored); + */ + extern void kgdb_roundup_cpus(void); + ++extern void kgdb_roundup_cpu(unsigned int cpu); ++ + /** + * kgdb_arch_set_pc - Generic call back to the program counter + * @regs: Current &struct pt_regs. 
+@@ -365,5 +367,6 @@ extern void kgdb_free_init_mem(void); + #define dbg_late_init() + static inline void kgdb_panic(const char *msg) {} + static inline void kgdb_free_init_mem(void) { } ++static inline void kgdb_roundup_cpu(unsigned int cpu) {} + #endif /* ! CONFIG_KGDB */ + #endif /* _KGDB_H_ */ +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 7f8ee09c711f..e9672de22cf2 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -572,6 +573,9 @@ struct mm_struct { + bool tlb_flush_batched; + #endif + struct uprobes_state uprobes_state; ++#ifdef CONFIG_PREEMPT_RT ++ struct rcu_head delayed_drop; ++#endif + #ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; + #endif +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index c0a4589ab706..0f9dadec3b46 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -1941,7 +1941,6 @@ enum netdev_ml_priv_type { + * @sfp_bus: attached &struct sfp_bus structure. 
+ * + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock +- * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount + * + * @proto_down: protocol port state information can be sent to the + * switch driver and used to set the phys state of the +@@ -2272,7 +2271,6 @@ struct net_device { + struct phy_device *phydev; + struct sfp_bus *sfp_bus; + struct lock_class_key *qdisc_tx_busylock; +- struct lock_class_key *qdisc_running_key; + bool proto_down; + unsigned wol_enabled:1; + unsigned threaded:1; +@@ -2382,13 +2380,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, + #define netdev_lockdep_set_classes(dev) \ + { \ + static struct lock_class_key qdisc_tx_busylock_key; \ +- static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ +- (dev)->qdisc_running_key = &qdisc_running_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ +diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h +index 7fcd56c6ded6..2ccb84f15aa3 100644 +--- a/include/linux/nfs_xdr.h ++++ b/include/linux/nfs_xdr.h +@@ -1692,7 +1692,7 @@ struct nfs_unlinkdata { + struct nfs_removeargs args; + struct nfs_removeres res; + struct dentry *dentry; +- wait_queue_head_t wq; ++ struct swait_queue_head wq; + const struct cred *cred; + struct nfs_fattr dir_attr; + long timeout; +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 4d244e295e85..3da73c968211 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -122,9 +122,10 @@ + * The preempt_count offset after spin_lock() + */ + #if !defined(CONFIG_PREEMPT_RT) +-#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET ++#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET + #else +-#define PREEMPT_LOCK_OFFSET 0 
++/* Locks on RT do not disable preemption */ ++#define PREEMPT_LOCK_OFFSET 0 + #endif + + /* +@@ -174,6 +175,20 @@ extern void preempt_count_sub(int val); + #define preempt_count_inc() preempt_count_add(1) + #define preempt_count_dec() preempt_count_sub(1) + ++#ifdef CONFIG_PREEMPT_LAZY ++#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) ++#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) ++#define inc_preempt_lazy_count() add_preempt_lazy_count(1) ++#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) ++#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) ++#else ++#define add_preempt_lazy_count(val) do { } while (0) ++#define sub_preempt_lazy_count(val) do { } while (0) ++#define inc_preempt_lazy_count() do { } while (0) ++#define dec_preempt_lazy_count() do { } while (0) ++#define preempt_lazy_count() (0) ++#endif ++ + #ifdef CONFIG_PREEMPT_COUNT + + #define preempt_disable() \ +@@ -182,13 +197,25 @@ do { \ + barrier(); \ + } while (0) + ++#define preempt_lazy_disable() \ ++do { \ ++ inc_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define sched_preempt_enable_no_resched() \ + do { \ + barrier(); \ + preempt_count_dec(); \ + } while (0) + +-#define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++#ifndef CONFIG_PREEMPT_RT ++# define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++# define preempt_check_resched_rt() barrier(); ++#else ++# define preempt_enable_no_resched() preempt_enable() ++# define preempt_check_resched_rt() preempt_check_resched() ++#endif + + #define preemptible() (preempt_count() == 0 && !irqs_disabled()) + +@@ -213,6 +240,18 @@ do { \ + __preempt_schedule(); \ + } while (0) + ++/* ++ * open code preempt_check_resched() because it is not exported to modules and ++ * used by local_unlock() or bpf_enable_instrumentation(). 
++ */ ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++ if (should_resched(0)) \ ++ __preempt_schedule(); \ ++} while (0) ++ + #else /* !CONFIG_PREEMPTION */ + #define preempt_enable() \ + do { \ +@@ -220,6 +259,12 @@ do { \ + preempt_count_dec(); \ + } while (0) + ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define preempt_enable_notrace() \ + do { \ + barrier(); \ +@@ -258,8 +303,12 @@ do { \ + #define preempt_disable_notrace() barrier() + #define preempt_enable_no_resched_notrace() barrier() + #define preempt_enable_notrace() barrier() ++#define preempt_check_resched_rt() barrier() + #define preemptible() 0 + ++#define preempt_lazy_disable() barrier() ++#define preempt_lazy_enable() barrier() ++ + #endif /* CONFIG_PREEMPT_COUNT */ + + #ifdef MODULE +@@ -278,7 +327,7 @@ do { \ + } while (0) + #define preempt_fold_need_resched() \ + do { \ +- if (tif_need_resched()) \ ++ if (tif_need_resched_now()) \ + set_preempt_need_resched(); \ + } while (0) + +@@ -394,8 +443,15 @@ extern void migrate_enable(void); + + #else + +-static inline void migrate_disable(void) { } +-static inline void migrate_enable(void) { } ++static inline void migrate_disable(void) ++{ ++ preempt_lazy_disable(); ++} ++ ++static inline void migrate_enable(void) ++{ ++ preempt_lazy_enable(); ++} + + #endif /* CONFIG_SMP */ + +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 9497f6b98339..eddfc5de6ee7 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -47,6 +47,12 @@ static inline const char *printk_skip_headers(const char *buffer) + + #define CONSOLE_EXT_LOG_MAX 8192 + ++/* ++ * The maximum size of a record formatted for console printing ++ * (i.e. with the prefix prepended to every line). ++ */ ++#define CONSOLE_LOG_MAX 1024 ++ + /* printk's without a loglevel use this.. 
*/ + #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT + +@@ -155,6 +161,8 @@ int vprintk(const char *fmt, va_list args); + asmlinkage __printf(1, 2) __cold + int _printk(const char *fmt, ...); + ++bool pr_flush(int timeout_ms, bool reset_on_progress); ++ + /* + * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! + */ +@@ -224,6 +232,11 @@ static inline void printk_deferred_exit(void) + { + } + ++static inline bool pr_flush(int timeout_ms, bool reset_on_progress) ++{ ++ return true; ++} ++ + static inline int printk_ratelimit(void) + { + return 0; +@@ -284,17 +297,30 @@ static inline void printk_trigger_flush(void) + extern int __printk_cpu_trylock(void); + extern void __printk_wait_on_cpu_lock(void); + extern void __printk_cpu_unlock(void); ++extern bool kgdb_roundup_delay(unsigned int cpu); ++ ++#else ++ ++#define __printk_cpu_trylock() 1 ++#define __printk_wait_on_cpu_lock() ++#define __printk_cpu_unlock() ++ ++static inline bool kgdb_roundup_delay(unsigned int cpu) ++{ ++ return false; ++} ++#endif /* CONFIG_SMP */ + + /** +- * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning +- * lock and disable interrupts. ++ * raw_printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning ++ * lock and disable interrupts. + * @flags: Stack-allocated storage for saving local interrupt state, +- * to be passed to printk_cpu_unlock_irqrestore(). ++ * to be passed to raw_printk_cpu_unlock_irqrestore(). + * + * If the lock is owned by another CPU, spin until it becomes available. + * Interrupts are restored while spinning. + */ +-#define printk_cpu_lock_irqsave(flags) \ ++#define raw_printk_cpu_lock_irqsave(flags) \ + for (;;) { \ + local_irq_save(flags); \ + if (__printk_cpu_trylock()) \ +@@ -304,22 +330,30 @@ extern void __printk_cpu_unlock(void); + } + + /** +- * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning +- * lock and restore interrupts. 
+- * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). ++ * raw_printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant ++ * spinning lock and restore interrupts. ++ * @flags: Caller's saved interrupt state from raw_printk_cpu_lock_irqsave(). + */ +-#define printk_cpu_unlock_irqrestore(flags) \ ++#define raw_printk_cpu_unlock_irqrestore(flags) \ + do { \ + __printk_cpu_unlock(); \ + local_irq_restore(flags); \ +- } while (0) \ +- +-#else ++ } while (0) + +-#define printk_cpu_lock_irqsave(flags) ((void)flags) +-#define printk_cpu_unlock_irqrestore(flags) ((void)flags) ++/* ++ * Used to synchronize atomic consoles. ++ * ++ * The same as raw_printk_cpu_lock_irqsave() except that hardware interrupts ++ * are _not_ restored while spinning. ++ */ ++#define console_atomic_lock(flags) \ ++ do { \ ++ local_irq_save(flags); \ ++ while (!__printk_cpu_trylock()) \ ++ cpu_relax(); \ ++ } while (0) + +-#endif /* CONFIG_SMP */ ++#define console_atomic_unlock raw_printk_cpu_unlock_irqrestore + + extern int kptr_restrict; + +diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h +index f0e535f199be..002266693e50 100644 +--- a/include/linux/ratelimit_types.h ++++ b/include/linux/ratelimit_types.h +@@ -4,7 +4,7 @@ + + #include + #include +-#include ++#include + + #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) + #define DEFAULT_RATELIMIT_BURST 10 +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index 13bddb841ceb..e33445348eb0 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -94,6 +94,13 @@ void rcu_init_tasks_generic(void); + static inline void rcu_init_tasks_generic(void) { } + #endif + ++#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TASKS_RCU_GENERIC) ++void rcu_tasks_initiate_self_tests(void); ++#else ++static inline void rcu_tasks_initiate_self_tests(void) {} ++#endif ++ ++ + #ifdef CONFIG_RCU_STALL_COMMON + void rcu_sysrq_start(void); + void rcu_sysrq_end(void); +diff --git 
a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 9deedfeec2b1..7d049883a08a 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -99,13 +99,22 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); ++extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); + #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) ++#define rt_mutex_lock_nest_lock(lock, nest_lock) \ ++ do { \ ++ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ ++ _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ ++ } while (0) ++ + #else + extern void rt_mutex_lock(struct rt_mutex *lock); + #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) ++#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) + #endif + + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); ++extern int rt_mutex_lock_killable(struct rt_mutex *lock); + extern int rt_mutex_trylock(struct rt_mutex *lock); + + extern void rt_mutex_unlock(struct rt_mutex *lock); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 7c17742d359c..2cdeb099d3c9 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -118,12 +118,8 @@ struct task_group; + + #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) + +-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) +- + #define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) + +-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) +- + /* + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). 
+@@ -1084,6 +1080,10 @@ struct task_struct { + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; ++#ifdef CONFIG_PREEMPT_RT ++ /* TODO: move me into ->restart_block ? */ ++ struct kernel_siginfo forced_info; ++#endif + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; +@@ -1738,6 +1738,16 @@ static __always_inline bool is_percpu_thread(void) + #endif + } + ++/* Is the current task guaranteed to stay on its current CPU? */ ++static inline bool is_migratable(void) ++{ ++#ifdef CONFIG_SMP ++ return preemptible() && !current->migration_disabled; ++#else ++ return false; ++#endif ++} ++ + /* Per-process atomic flags. */ + #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ + #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ +@@ -2013,6 +2023,118 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); ++} ++ ++static inline int need_resched_lazy(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#else ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } ++static inline int need_resched_lazy(void) { return 0; } ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++static inline bool task_match_saved_state(struct task_struct 
*p, long match_state) ++{ ++ return p->saved_state == match_state; ++} ++ ++static inline bool task_is_traced(struct task_struct *task) ++{ ++ bool traced = false; ++ ++ /* in case the task is sleeping on tasklist_lock */ ++ raw_spin_lock_irq(&task->pi_lock); ++ if (READ_ONCE(task->__state) & __TASK_TRACED) ++ traced = true; ++ else if (task->saved_state & __TASK_TRACED) ++ traced = true; ++ raw_spin_unlock_irq(&task->pi_lock); ++ return traced; ++} ++ ++static inline bool task_is_stopped_or_traced(struct task_struct *task) ++{ ++ bool traced_stopped = false; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ ++ if (READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) ++ traced_stopped = true; ++ else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) ++ traced_stopped = true; ++ ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ return traced_stopped; ++} ++ ++#else ++ ++static inline bool task_match_saved_state(struct task_struct *p, long match_state) ++{ ++ return false; ++} ++ ++static inline bool task_is_traced(struct task_struct *task) ++{ ++ return READ_ONCE(task->__state) & __TASK_TRACED; ++} ++ ++static inline bool task_is_stopped_or_traced(struct task_struct *task) ++{ ++ return READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED); ++} ++#endif ++ ++static inline bool task_match_state_or_saved(struct task_struct *p, ++ long match_state) ++{ ++ if (READ_ONCE(p->__state) == match_state) ++ return true; ++ ++ return task_match_saved_state(p, match_state); ++} ++ ++static inline bool task_match_state_lock(struct task_struct *p, ++ long match_state) ++{ ++ bool match; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ match = task_match_state_or_saved(p, match_state); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ return match; ++} ++ + /* + * cond_resched() and cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. 
The return +@@ -2047,7 +2169,7 @@ static inline int _cond_resched(void) { return 0; } + #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ + + #define cond_resched() ({ \ +- ___might_sleep(__FILE__, __LINE__, 0); \ ++ __might_resched(__FILE__, __LINE__, 0); \ + _cond_resched(); \ + }) + +@@ -2055,19 +2177,38 @@ extern int __cond_resched_lock(spinlock_t *lock); + extern int __cond_resched_rwlock_read(rwlock_t *lock); + extern int __cond_resched_rwlock_write(rwlock_t *lock); + +-#define cond_resched_lock(lock) ({ \ +- ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ +- __cond_resched_lock(lock); \ ++#define MIGHT_RESCHED_RCU_SHIFT 8 ++#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) ++ ++#ifndef CONFIG_PREEMPT_RT ++/* ++ * Non RT kernels have an elevated preempt count due to the held lock, ++ * but are not allowed to be inside a RCU read side critical section ++ */ ++# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET ++#else ++/* ++ * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in ++ * cond_resched*lock() has to take that into account because it checks for ++ * preempt_count() and rcu_preempt_depth(). 
++ */ ++# define PREEMPT_LOCK_RESCHED_OFFSETS \ ++ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) ++#endif ++ ++#define cond_resched_lock(lock) ({ \ ++ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ ++ __cond_resched_lock(lock); \ + }) + +-#define cond_resched_rwlock_read(lock) ({ \ +- __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ +- __cond_resched_rwlock_read(lock); \ ++#define cond_resched_rwlock_read(lock) ({ \ ++ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ ++ __cond_resched_rwlock_read(lock); \ + }) + +-#define cond_resched_rwlock_write(lock) ({ \ +- __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ +- __cond_resched_rwlock_write(lock); \ ++#define cond_resched_rwlock_write(lock) ({ \ ++ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ ++ __cond_resched_rwlock_write(lock); \ + }) + + static inline void cond_resched_rcu(void) +diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h +index 95fb7aaaec8d..28e9cc60f47e 100644 +--- a/include/linux/sched/mm.h ++++ b/include/linux/sched/mm.h +@@ -49,6 +49,26 @@ static inline void mmdrop(struct mm_struct *mm) + __mmdrop(mm); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern void __mmdrop_delayed(struct rcu_head *rhp); ++ ++/* ++ * Invoked from finish_task_switch(). Delegates the heavy lifting on RT ++ * kernels via RCU. ++ */ ++static inline void mmdrop_sched(struct mm_struct *mm) ++{ ++ /* Provides a full memory barrier. See mmdrop() */ ++ if (atomic_dec_and_test(&mm->mm_count)) ++ call_rcu(&mm->delayed_drop, __mmdrop_delayed); ++} ++#else ++static inline void mmdrop_sched(struct mm_struct *mm) ++{ ++ mmdrop(mm); ++} ++#endif ++ + /** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. 
+diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h +index 68abc6bdd891..dfe81e08e143 100644 +--- a/include/linux/serial_8250.h ++++ b/include/linux/serial_8250.h +@@ -7,6 +7,7 @@ + #ifndef _LINUX_SERIAL_8250_H + #define _LINUX_SERIAL_8250_H + ++#include + #include + #include + #include +@@ -126,6 +127,8 @@ struct uart_8250_port { + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA + unsigned char msr_saved_flags; + ++ atomic_t console_printing; ++ + struct uart_8250_dma *dma; + const struct uart_8250_ops *ops; + +@@ -181,6 +184,8 @@ void serial8250_init_port(struct uart_8250_port *up); + void serial8250_set_defaults(struct uart_8250_port *up); + void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); ++void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, ++ unsigned int count); + int serial8250_console_setup(struct uart_port *port, char *options, bool probe); + int serial8250_console_exit(struct uart_port *port); + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 7ed1d4472c0c..6ac2df270a97 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -301,6 +301,7 @@ struct sk_buff_head { + + __u32 qlen; + spinlock_t lock; ++ raw_spinlock_t raw_lock; + }; + + struct sk_buff; +@@ -1993,6 +1994,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) + __skb_queue_head_init(list); + } + ++static inline void skb_queue_head_init_raw(struct sk_buff_head *list) ++{ ++ raw_spin_lock_init(&list->raw_lock); ++ __skb_queue_head_init(list); ++} ++ + static inline void skb_queue_head_init_class(struct sk_buff_head *list, + struct lock_class_key *class) + { +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 510519e8a1eb..7ac9fdb5ad09 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -268,6 +268,9 @@ static inline int get_boot_cpu_id(void) + #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) + #define put_cpu() 
preempt_enable() + ++#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) ++#define put_cpu_light() migrate_enable() ++ + /* + * Callback to arch code if there's nosmp or maxcpus=0 on the + * boot command line: +diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h +index c09b6407ae1b..7f86a2016ac5 100644 +--- a/include/linux/spinlock_types_up.h ++++ b/include/linux/spinlock_types_up.h +@@ -1,7 +1,7 @@ + #ifndef __LINUX_SPINLOCK_TYPES_UP_H + #define __LINUX_SPINLOCK_TYPES_UP_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h +index 9a073535c0bd..0536fbba7f69 100644 +--- a/include/linux/thread_info.h ++++ b/include/linux/thread_info.h +@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti + clear_ti_thread_flag(task_thread_info(t), TIF_##fl) + #endif /* !CONFIG_GENERIC_ENTRY */ + +-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#ifdef CONFIG_PREEMPT_LAZY ++#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ ++ test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) ++#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY) ++ ++#else ++#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_lazy() 0 ++#endif + + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES + static inline int arch_within_stack_frames(const void * const stack, +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index ff137179e0c3..54fe3b1a638d 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -69,6 +69,7 @@ struct trace_entry { + unsigned char flags; + unsigned char preempt_count; + int pid; 
++ unsigned char preempt_lazy_count; + }; + + #define TRACE_EVENT_TYPE_MAX \ +@@ -158,9 +159,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, + unsigned int trace_ctx) + { + entry->preempt_count = trace_ctx & 0xff; ++ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; + entry->pid = current->pid; + entry->type = type; +- entry->flags = trace_ctx >> 16; ++ entry->flags = trace_ctx >> 24; + } + + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); +@@ -173,6 +175,7 @@ enum trace_flag_type { + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, + }; + + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h +index e81856c0ba13..81dc1f5e181a 100644 +--- a/include/linux/u64_stats_sync.h ++++ b/include/linux/u64_stats_sync.h +@@ -66,7 +66,7 @@ + #include + + struct u64_stats_sync { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + seqcount_t seq; + #endif + }; +@@ -83,6 +83,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) + return local64_read(&p->v); + } + ++static inline void u64_stats_set(u64_stats_t *p, u64 val) ++{ ++ local64_set(&p->v, val); ++} ++ + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) + { + local64_add(val, &p->v); +@@ -104,6 +109,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) + return p->v; + } + ++static inline void u64_stats_set(u64_stats_t *p, u64 val) ++{ ++ p->v = val; ++} ++ + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) + { + p->v += val; +@@ -115,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) + } + #endif + +-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) + #else + 
static inline void u64_stats_init(struct u64_stats_sync *syncp) +@@ -125,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) + + static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + write_seqcount_begin(&syncp->seq); + #endif + } + + static inline void u64_stats_update_end(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + #endif + } + +@@ -142,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) + { + unsigned long flags = 0; + +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +- local_irq_save(flags); ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); ++ else ++ local_irq_save(flags); + write_seqcount_begin(&syncp->seq); + #endif + return flags; +@@ -153,15 +170,18 @@ static inline void + u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); +- local_irq_restore(flags); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); ++ else ++ local_irq_restore(flags); + #endif + } + + static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_begin(&syncp->seq); + #else + return 0; +@@ -170,7 +190,7 @@ static inline unsigned int 
__u64_stats_fetch_begin(const struct u64_stats_sync * + + static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -179,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy + static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_retry(&syncp->seq, start); + #else + return false; +@@ -189,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +@@ -203,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + */ + static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_disable(); ++#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) + local_irq_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -212,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync + static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_enable(); ++#elif BITS_PER_LONG == 32 && 
!defined(CONFIG_SMP) + local_irq_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +diff --git a/include/net/act_api.h b/include/net/act_api.h +index f19f7f4a463c..b5b624c7e488 100644 +--- a/include/net/act_api.h ++++ b/include/net/act_api.h +@@ -30,13 +30,13 @@ struct tc_action { + atomic_t tcfa_bindcnt; + int tcfa_action; + struct tcf_t tcfa_tm; +- struct gnet_stats_basic_packed tcfa_bstats; +- struct gnet_stats_basic_packed tcfa_bstats_hw; ++ struct gnet_stats_basic_sync tcfa_bstats; ++ struct gnet_stats_basic_sync tcfa_bstats_hw; + struct gnet_stats_queue tcfa_qstats; + struct net_rate_estimator __rcu *tcfa_rate_est; + spinlock_t tcfa_lock; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; + struct gnet_stats_queue __percpu *cpu_qstats; + struct tc_cookie __rcu *act_cookie; + struct tcf_chain __rcu *goto_chain; +@@ -206,7 +206,7 @@ static inline void tcf_action_update_bstats(struct tc_action *a, + struct sk_buff *skb) + { + if (likely(a->cpu_bstats)) { +- bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(a->cpu_bstats), skb); + return; + } + spin_lock(&a->tcfa_lock); +diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h +index 1424e02cef90..7aa2b8e1fb29 100644 +--- a/include/net/gen_stats.h ++++ b/include/net/gen_stats.h +@@ -7,14 +7,17 @@ + #include + #include + +-/* Note: this used to be in include/uapi/linux/gen_stats.h */ +-struct gnet_stats_basic_packed { +- __u64 bytes; +- __u64 packets; +-}; +- +-struct gnet_stats_basic_cpu { +- struct gnet_stats_basic_packed bstats; ++/* Throughput stats. ++ * Must be initialized beforehand with gnet_stats_basic_sync_init(). ++ * ++ * If no reads can ever occur parallel to writes (e.g. stack-allocated ++ * bstats), then the internal stat values can be written to and read ++ * from directly. 
Otherwise, use _bstats_set/update() for writes and ++ * gnet_stats_add_basic() for reads. ++ */ ++struct gnet_stats_basic_sync { ++ u64_stats_t bytes; ++ u64_stats_t packets; + struct u64_stats_sync syncp; + } __aligned(2 * sizeof(u64)); + +@@ -34,6 +37,7 @@ struct gnet_dump { + struct tc_stats tc_stats; + }; + ++void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b); + int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d, int padattr); + +@@ -42,41 +46,38 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, + spinlock_t *lock, struct gnet_dump *d, + int padattr); + +-int gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b); +-void __gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b); +-int gnet_stats_copy_basic_hw(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b); ++int gnet_stats_copy_basic(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running); ++void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running); ++int gnet_stats_copy_basic_hw(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running); + int gnet_stats_copy_rate_est(struct gnet_dump *d, + struct net_rate_estimator __rcu **ptr); + int gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen); +-void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +- const struct gnet_stats_queue __percpu *cpu_q, +- const struct gnet_stats_queue *q, 
__u32 qlen); ++void gnet_stats_add_queue(struct gnet_stats_queue *qstats, ++ const struct gnet_stats_queue __percpu *cpu_q, ++ const struct gnet_stats_queue *q); + int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); + + int gnet_stats_finish_copy(struct gnet_dump *d); + +-int gen_new_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_new_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ bool running, struct nlattr *opt); + void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); +-int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **ptr, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ bool running, struct nlattr *opt); + bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); + bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, + struct gnet_stats_rate_est64 *sample); +diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h +index 832ab69efda5..4c3809e141f4 100644 +--- a/include/net/netfilter/xt_rateest.h ++++ b/include/net/netfilter/xt_rateest.h +@@ -6,7 +6,7 @@ + + struct xt_rateest { + /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + spinlock_t lock; + + +diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h +index 83a6d0792180..4a5833108083 100644 +--- a/include/net/pkt_cls.h ++++ b/include/net/pkt_cls.h +@@ -765,7 +765,7 @@ struct tc_cookie { + }; + + struct tc_qopt_offload_stats { +- struct gnet_stats_basic_packed 
*bstats; ++ struct gnet_stats_basic_sync *bstats; + struct gnet_stats_queue *qstats; + }; + +@@ -885,7 +885,7 @@ struct tc_gred_qopt_offload_params { + }; + + struct tc_gred_qopt_offload_stats { +- struct gnet_stats_basic_packed bstats[MAX_DPs]; ++ struct gnet_stats_basic_sync bstats[MAX_DPs]; + struct gnet_stats_queue qstats[MAX_DPs]; + struct red_stats *xstats[MAX_DPs]; + }; +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index 6906da5c733e..e9fe7a613eba 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -40,6 +40,13 @@ enum qdisc_state_t { + __QDISC_STATE_DRAINING, + }; + ++enum qdisc_state2_t { ++ /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. ++ * Use qdisc_run_begin/end() or qdisc_is_running() instead. ++ */ ++ __QDISC_STATE2_RUNNING, ++}; ++ + #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) + #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) + +@@ -97,7 +104,7 @@ struct Qdisc { + struct netdev_queue *dev_queue; + + struct net_rate_estimator __rcu *rate_est; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + int pad; + refcount_t refcnt; +@@ -107,10 +114,10 @@ struct Qdisc { + */ + struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; + struct qdisc_skb_head q; +- struct gnet_stats_basic_packed bstats; +- seqcount_t running; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + unsigned long state; ++ unsigned long state2; /* must be written under qdisc spinlock */ + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + +@@ -143,11 +150,15 @@ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) + return NULL; + } + ++/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc ++ * root_lock section, or provide their own memory barriers -- ordering ++ * against qdisc_run_begin/end() atomic bit operations. 
++ */ + static inline bool qdisc_is_running(struct Qdisc *qdisc) + { + if (qdisc->flags & TCQ_F_NOLOCK) + return spin_is_locked(&qdisc->seqlock); +- return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; ++ return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } + + static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) +@@ -167,6 +178,9 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) + return !READ_ONCE(qdisc->q.qlen); + } + ++/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with ++ * the qdisc root lock acquired. ++ */ + static inline bool qdisc_run_begin(struct Qdisc *qdisc) + { + if (qdisc->flags & TCQ_F_NOLOCK) { +@@ -186,15 +200,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) + * when testing it in qdisc_run_end() + */ + return spin_trylock(&qdisc->seqlock); +- } else if (qdisc_is_running(qdisc)) { +- return false; + } +- /* Variant of write_seqcount_begin() telling lockdep a trylock +- * was attempted. +- */ +- raw_write_seqcount_begin(&qdisc->running); +- seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); +- return true; ++ return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } + + static inline void qdisc_run_end(struct Qdisc *qdisc) +@@ -212,7 +219,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) + &qdisc->state))) + __netif_schedule(qdisc); + } else { +- write_seqcount_end(&qdisc->running); ++ __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } + } + +@@ -576,14 +583,6 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) + return qdisc_lock(root); + } + +-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) +-{ +- struct Qdisc *root = qdisc_root_sleeping(qdisc); +- +- ASSERT_RTNL(); +- return &root->running; +-} +- + static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) + { + return qdisc->dev_queue->dev; +@@ -833,14 +832,16 @@ static inline int qdisc_enqueue(struct 
sk_buff *skb, struct Qdisc *sch, + return sch->enqueue(skb, sch, to_free); + } + +-static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, ++static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, + __u64 bytes, __u32 packets) + { +- bstats->bytes += bytes; +- bstats->packets += packets; ++ u64_stats_update_begin(&bstats->syncp); ++ u64_stats_add(&bstats->bytes, bytes); ++ u64_stats_add(&bstats->packets, packets); ++ u64_stats_update_end(&bstats->syncp); + } + +-static inline void bstats_update(struct gnet_stats_basic_packed *bstats, ++static inline void bstats_update(struct gnet_stats_basic_sync *bstats, + const struct sk_buff *skb) + { + _bstats_update(bstats, +@@ -848,26 +849,10 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, + skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + } + +-static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, +- __u64 bytes, __u32 packets) +-{ +- u64_stats_update_begin(&bstats->syncp); +- _bstats_update(&bstats->bstats, bytes, packets); +- u64_stats_update_end(&bstats->syncp); +-} +- +-static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, +- const struct sk_buff *skb) +-{ +- u64_stats_update_begin(&bstats->syncp); +- bstats_update(&bstats->bstats, skb); +- u64_stats_update_end(&bstats->syncp); +-} +- + static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, + const struct sk_buff *skb) + { +- bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); + } + + static inline void qdisc_bstats_update(struct Qdisc *sch, +@@ -956,10 +941,9 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, + __u32 *backlog) + { + struct gnet_stats_queue qstats = { 0 }; +- __u32 len = qdisc_qlen_sum(sch); + +- __gnet_stats_copy_queue(&qstats, sch->cpu_qstats, &sch->qstats, len); +- *qlen = qstats.qlen; ++ gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); ++ 
*qlen = qstats.qlen + qdisc_qlen(sch); + *backlog = qstats.backlog; + } + +@@ -1304,7 +1288,7 @@ void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); + struct mini_Qdisc { + struct tcf_proto *filter_list; + struct tcf_block *block; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct rcu_head rcu; + }; +@@ -1312,7 +1296,7 @@ struct mini_Qdisc { + static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, + const struct sk_buff *skb) + { +- bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); + } + + static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) +diff --git a/init/Kconfig b/init/Kconfig +index dafc3ba6fa7a..cd852df4e7d4 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -910,7 +910,7 @@ config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY +- depends on SMP && NUMA && MIGRATION ++ depends on SMP && NUMA && MIGRATION && !PREEMPT_RT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -1905,6 +1905,7 @@ choice + + config SLAB + bool "SLAB" ++ depends on !PREEMPT_RT + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work +@@ -1925,6 +1926,7 @@ config SLUB + config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" ++ depends on !PREEMPT_RT + help + SLOB replaces the stock allocator with a drastically simpler + allocator. 
SLOB is generally more space efficient but +diff --git a/init/main.c b/init/main.c +index 649d9e4201a8..ee92d608ffc4 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1606,6 +1606,7 @@ static noinline void __init kernel_init_freeable(void) + + rcu_init_tasks_generic(); + do_pre_smp_initcalls(); ++ rcu_tasks_initiate_self_tests(); + lockup_detector_init(); + + smp_init(); +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index 5876e30c5740..5df0776264c2 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + choice + prompt "Preemption Model" + default PREEMPT_NONE +diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c +index 1486768f2318..bb3b805436c4 100644 +--- a/kernel/cgroup/rstat.c ++++ b/kernel/cgroup/rstat.c +@@ -156,8 +156,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; ++ unsigned long flags; + +- raw_spin_lock(cpu_lock); ++ raw_spin_lock_irqsave(cpu_lock, flags); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + +@@ -169,7 +170,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } +- raw_spin_unlock(cpu_lock); ++ raw_spin_unlock_irqrestore(cpu_lock, flags); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || +diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c +index 7beceb447211..28497c00e63b 100644 +--- a/kernel/debug/debug_core.c ++++ b/kernel/debug/debug_core.c +@@ -239,35 +239,42 @@ NOKPROBE_SYMBOL(kgdb_call_nmi_hook); + static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = + CSD_INIT(kgdb_call_nmi_hook, NULL); + 
+-void __weak kgdb_roundup_cpus(void) ++void __weak kgdb_roundup_cpu(unsigned int cpu) + { + call_single_data_t *csd; ++ int ret; ++ ++ csd = &per_cpu(kgdb_roundup_csd, cpu); ++ ++ /* ++ * If it didn't round up last time, don't try again ++ * since smp_call_function_single_async() will block. ++ * ++ * If rounding_up is false then we know that the ++ * previous call must have at least started and that ++ * means smp_call_function_single_async() won't block. ++ */ ++ if (kgdb_info[cpu].rounding_up) ++ return; ++ kgdb_info[cpu].rounding_up = true; ++ ++ ret = smp_call_function_single_async(cpu, csd); ++ if (ret) ++ kgdb_info[cpu].rounding_up = false; ++} ++NOKPROBE_SYMBOL(kgdb_roundup_cpu); ++ ++void __weak kgdb_roundup_cpus(void) ++{ + int this_cpu = raw_smp_processor_id(); + int cpu; +- int ret; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + +- csd = &per_cpu(kgdb_roundup_csd, cpu); +- +- /* +- * If it didn't round up last time, don't try again +- * since smp_call_function_single_async() will block. +- * +- * If rounding_up is false then we know that the +- * previous call must have at least started and that +- * means smp_call_function_single_async() won't block. 
+- */ +- if (kgdb_info[cpu].rounding_up) +- continue; +- kgdb_info[cpu].rounding_up = true; +- +- ret = smp_call_function_single_async(cpu, csd); +- if (ret) +- kgdb_info[cpu].rounding_up = false; ++ kgdb_roundup_cpu(cpu); + } + } + NOKPROBE_SYMBOL(kgdb_roundup_cpus); +diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c +index 6735ac36b718..539a2f0dc89d 100644 +--- a/kernel/debug/kdb/kdb_io.c ++++ b/kernel/debug/kdb/kdb_io.c +@@ -559,23 +559,17 @@ static void kdb_msg_write(const char *msg, int msg_len) + cp++; + } + ++ /* mirror output on atomic consoles */ + for_each_console(c) { + if (!(c->flags & CON_ENABLED)) + continue; + if (c == dbg_io_ops->cons) + continue; +- /* +- * Set oops_in_progress to encourage the console drivers to +- * disregard their internal spin locks: in the current calling +- * context the risk of deadlock is a bigger problem than risks +- * due to re-entering the console driver. We operate directly on +- * oops_in_progress rather than using bust_spinlocks() because +- * the calls bust_spinlocks() makes on exit are not appropriate +- * for this calling context. 
+- */ +- ++oops_in_progress; +- c->write(c, msg, msg_len); +- --oops_in_progress; ++ ++ if (!c->write_atomic) ++ continue; ++ c->write_atomic(c, msg, msg_len); ++ + touch_nmi_watchdog(); + } + } +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index e002bea6b4be..51ddfdacfc1f 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -159,9 +159,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + + local_irq_enable_exit_to_user(ti_work); + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & _TIF_NEED_RESCHED_MASK) + schedule(); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + +@@ -388,7 +396,7 @@ void irqentry_exit_cond_resched(void) + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); +- if (need_resched()) ++ if (should_resched(0)) + preempt_schedule_irq(); + } + } +diff --git a/kernel/exit.c b/kernel/exit.c +index 80efdfda6662..6ff17e977392 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -64,6 +64,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -215,8 +216,14 @@ static void delayed_put_task_struct(struct rcu_head *rhp) + { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + ++ kprobe_flush_task(tsk); + perf_event_delayed_put(tsk); + trace_sched_process_free(tsk); ++ ++ /* RT enabled kernels delay freeing the VMAP'ed task stack */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ put_task_stack(tsk); ++ + put_task_struct(tsk); + } + +diff --git a/kernel/fork.c b/kernel/fork.c +index 1906230a000e..47f647e6a6c3 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -289,7 +289,10 @@ static inline void free_thread_stack(struct task_struct *tsk) + return; + } + +- vfree_atomic(tsk->stack); ++ if 
(!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ vfree_atomic(tsk->stack); ++ else ++ vfree(tsk->stack); + return; + } + #endif +@@ -709,6 +712,19 @@ void __mmdrop(struct mm_struct *mm) + } + EXPORT_SYMBOL_GPL(__mmdrop); + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is ++ * by far the least expensive way to do that. ++ */ ++void __mmdrop_delayed(struct rcu_head *rhp) ++{ ++ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); ++ ++ __mmdrop(mm); ++} ++#endif ++ + static void mmdrop_async_fn(struct work_struct *work) + { + struct mm_struct *mm; +diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c +index 7a45fd593245..23dc888e0885 100644 +--- a/kernel/irq/irqdesc.c ++++ b/kernel/irq/irqdesc.c +@@ -664,6 +664,29 @@ int generic_handle_irq(unsigned int irq) + } + EXPORT_SYMBOL_GPL(generic_handle_irq); + ++/** ++ * generic_handle_irq_safe - Invoke the handler for a particular irq from any ++ * context. ++ * @irq: The irq number to handle ++ * ++ * Returns: 0 on success, a negative value on error. ++ * ++ * This function can be called from any context (IRQ or process context). It ++ * will report an error if not invoked from IRQ context and the irq has been ++ * marked to enforce IRQ-context only. 
++ */ ++int generic_handle_irq_safe(unsigned int irq) ++{ ++ unsigned long flags; ++ int ret; ++ ++ local_irq_save(flags); ++ ret = handle_irq_desc(irq_to_desc(irq)); ++ local_irq_restore(flags); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(generic_handle_irq_safe); ++ + #ifdef CONFIG_IRQ_DOMAIN + /** + * generic_handle_domain_irq - Invoke the handler for a HW irq belonging +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 9862372e0f01..78d90ac0528c 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -1301,6 +1301,8 @@ static int irq_thread(void *data) + + irq_thread_set_ready(desc, action); + ++ sched_set_fifo(current); ++ + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; +@@ -1466,8 +1468,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) + if (IS_ERR(t)) + return PTR_ERR(t); + +- sched_set_fifo(t); +- + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code +@@ -2861,7 +2861,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); + * This call sets the internal irqchip state of an interrupt, + * depending on the value of @which. + * +- * This function should be called with preemption disabled if the ++ * This function should be called with migration disabled if the + * interrupt controller has per-cpu registers. 
+ */ + int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, +diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c +index c481d8458325..02b2daf07441 100644 +--- a/kernel/irq/spurious.c ++++ b/kernel/irq/spurious.c +@@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); + + static int __init irqfixup_setup(char *str) + { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n"); ++ return 1; ++ } + irqfixup = 1; + printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); + printk(KERN_WARNING "This may impact system performance.\n"); +@@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644); + + static int __init irqpoll_setup(char *str) + { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n"); ++ return 1; ++ } + irqfixup = 2; + printk(KERN_WARNING "Misrouted IRQ fixup and polling support " + "enabled\n"); +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index db8c248ebc8c..f7df715ec28e 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -18,11 +18,36 @@ + #include + #include + #include ++#include + #include + #include + + static DEFINE_PER_CPU(struct llist_head, raised_list); + static DEFINE_PER_CPU(struct llist_head, lazy_list); ++static DEFINE_PER_CPU(struct task_struct *, irq_workd); ++ ++static void wake_irq_workd(void) ++{ ++ struct task_struct *tsk = __this_cpu_read(irq_workd); ++ ++ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) ++ wake_up_process(tsk); ++} ++ ++#ifdef CONFIG_SMP ++static void irq_work_wake(struct irq_work *entry) ++{ ++ wake_irq_workd(); ++} ++ ++static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = ++ IRQ_WORK_INIT_HARD(irq_work_wake); ++#endif ++ ++static int irq_workd_should_run(unsigned int cpu) ++{ ++ return !llist_empty(this_cpu_ptr(&lazy_list)); ++} + + /* + * Claim the entry so that no one else will poke at it. 
+@@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void) + /* Enqueue on current CPU, work must already be claimed and preempt disabled */ + static void __irq_work_queue_local(struct irq_work *work) + { ++ struct llist_head *list; ++ bool rt_lazy_work = false; ++ bool lazy_work = false; ++ int work_flags; ++ ++ work_flags = atomic_read(&work->node.a_flags); ++ if (work_flags & IRQ_WORK_LAZY) ++ lazy_work = true; ++ else if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(work_flags & IRQ_WORK_HARD_IRQ)) ++ rt_lazy_work = true; ++ ++ if (lazy_work || rt_lazy_work) ++ list = this_cpu_ptr(&lazy_list); ++ else ++ list = this_cpu_ptr(&raised_list); ++ ++ if (!llist_add(&work->node.llist, list)) ++ return; ++ + /* If the work is "lazy", handle it from next tick if any */ +- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { +- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && +- tick_nohz_tick_stopped()) +- arch_irq_work_raise(); +- } else { +- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) +- arch_irq_work_raise(); +- } ++ if (!lazy_work || tick_nohz_tick_stopped()) ++ arch_irq_work_raise(); + } + + /* Enqueue the irq work @work on the current CPU */ +@@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); ++ ++ /* ++ * On PREEMPT_RT the items which are not marked as ++ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work ++ * item is used on the remote CPU to wake the thread. 
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { ++ ++ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) ++ goto out; ++ ++ work = &per_cpu(irq_work_wakeup, cpu); ++ if (!irq_work_claim(work)) ++ goto out; ++ } ++ + __smp_call_single_queue(cpu, &work->node.llist); + } else { + __irq_work_queue_local(work); + } ++out: + preempt_enable(); + + return true; + #endif /* CONFIG_SMP */ + } + +- + bool irq_work_needs_cpu(void) + { + struct llist_head *raised, *lazy; +@@ -160,6 +216,10 @@ void irq_work_single(void *arg) + * else claimed it meanwhile. + */ + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); ++ ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) ++ rcuwait_wake_up(&work->irqwait); + } + + static void irq_work_run_list(struct llist_head *list) +@@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list) + struct irq_work *work, *tmp; + struct llist_node *llnode; + +- BUG_ON(!irqs_disabled()); ++ /* ++ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed ++ * in a per-CPU thread in preemptible context. Only the items which are ++ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. 
++ */ ++ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); + + if (llist_empty(list)) + return; +@@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list) + void irq_work_run(void) + { + irq_work_run_list(this_cpu_ptr(&raised_list)); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + EXPORT_SYMBOL_GPL(irq_work_run); + +@@ -194,7 +262,11 @@ void irq_work_tick(void) + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + + /* +@@ -204,8 +276,42 @@ void irq_work_tick(void) + void irq_work_sync(struct irq_work *work) + { + lockdep_assert_irqs_enabled(); ++ might_sleep(); ++ ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) { ++ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), ++ TASK_UNINTERRUPTIBLE); ++ return; ++ } + + while (irq_work_is_busy(work)) + cpu_relax(); + } + EXPORT_SYMBOL_GPL(irq_work_sync); ++ ++static void run_irq_workd(unsigned int cpu) ++{ ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++} ++ ++static void irq_workd_setup(unsigned int cpu) ++{ ++ sched_set_fifo_low(current); ++} ++ ++static struct smp_hotplug_thread irqwork_threads = { ++ .store = &irq_workd, ++ .setup = irq_workd_setup, ++ .thread_should_run = irq_workd_should_run, ++ .thread_fn = run_irq_workd, ++ .thread_comm = "irq_work/%u", ++}; ++ ++static __init int irq_work_init_threads(void) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); ++ return 0; ++} ++early_initcall(irq_work_init_threads); +diff --git a/kernel/kcov.c b/kernel/kcov.c +index 80bfe71bbe13..36ca640c4f8e 100644 +--- a/kernel/kcov.c ++++ b/kernel/kcov.c +@@ 
-88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); + + struct kcov_percpu_data { + void *irq_area; ++ local_lock_t lock; + + unsigned int saved_mode; + unsigned int saved_size; +@@ -96,7 +97,9 @@ struct kcov_percpu_data { + int saved_sequence; + }; + +-static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); ++static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + /* Must be called with kcov_remote_lock locked. */ + static struct kcov_remote *kcov_remote_find(u64 handle) +@@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle) + if (!in_task() && !in_serving_softirq()) + return; + +- local_irq_save(flags); ++ local_lock_irqsave(&kcov_percpu_data.lock, flags); + + /* + * Check that kcov_remote_start() is not called twice in background +@@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle) + */ + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + /* +@@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle) + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + + spin_lock(&kcov_remote_lock); + remote = kcov_remote_find(handle); + if (!remote) { +- spin_unlock_irqrestore(&kcov_remote_lock, flags); ++ spin_unlock(&kcov_remote_lock); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + kcov_debug("handle = %llx, context: %s\n", handle, +@@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle) + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } +- spin_unlock_irqrestore(&kcov_remote_lock, flags); ++ spin_unlock(&kcov_remote_lock); + + /* Can only happen when in_task(). 
*/ + if (!area) { ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + area = vmalloc(size * sizeof(unsigned long)); + if (!area) { + kcov_put(kcov); + return; + } ++ local_lock_irqsave(&kcov_percpu_data.lock, flags); + } + +- local_irq_save(flags); +- + /* Reset coverage size. */ + *(u64 *)area = 0; + +@@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle) + } + kcov_start(t, kcov, size, area, mode, sequence); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + + } + EXPORT_SYMBOL(kcov_remote_start); +@@ -965,12 +969,12 @@ void kcov_remote_stop(void) + if (!in_task() && !in_serving_softirq()) + return; + +- local_irq_save(flags); ++ local_lock_irqsave(&kcov_percpu_data.lock, flags); + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + /* +@@ -978,12 +982,12 @@ void kcov_remote_stop(void) + * actually found the remote handle and started collecting coverage. + */ + if (in_serving_softirq() && !t->kcov_softirq) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + /* Make sure that kcov_softirq is only set when in softirq. */ + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + +@@ -1013,7 +1017,7 @@ void kcov_remote_stop(void) + spin_unlock(&kcov_remote_lock); + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + + /* Get in kcov_remote_start(). 
*/ + kcov_put(kcov); +@@ -1034,8 +1038,8 @@ static int __init kcov_init(void) + int cpu; + + for_each_possible_cpu(cpu) { +- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * +- sizeof(unsigned long)); ++ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE * ++ sizeof(unsigned long), cpu_to_node(cpu)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; +diff --git a/kernel/kprobes.c b/kernel/kprobes.c +index 7e9fa1b7ff67..d83e818ffbdb 100644 +--- a/kernel/kprobes.c ++++ b/kernel/kprobes.c +@@ -1248,10 +1248,10 @@ void kprobe_busy_end(void) + } + + /* +- * This function is called from finish_task_switch when task tk becomes dead, +- * so that we can recycle any function-return probe instances associated +- * with this task. These left over instances represent probed functions +- * that have been called but will never return. ++ * This function is called from delayed_put_task_struct() when a task is ++ * dead and cleaned up to recycle any function-return probe instances ++ * associated with this task. These left over instances represent probed ++ * functions that have been called but will never return. 
+ */ + void kprobe_flush_task(struct task_struct *tk) + { +diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c +index e20c19e3ba49..777168d58f02 100644 +--- a/kernel/ksysfs.c ++++ b/kernel/ksysfs.c +@@ -143,6 +143,15 @@ KERNEL_ATTR_RO(vmcoreinfo); + + #endif /* CONFIG_CRASH_CORE */ + ++#if defined(CONFIG_PREEMPT_RT) ++static ssize_t realtime_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%d\n", 1); ++} ++KERNEL_ATTR_RO(realtime); ++#endif ++ + /* whether file capabilities are enabled */ + static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +@@ -233,6 +242,9 @@ static struct attribute * kernel_attrs[] = { + #ifndef CONFIG_TINY_RCU + &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, ++#endif ++#ifdef CONFIG_PREEMPT_RT ++ &realtime_attr.attr, + #endif + NULL + }; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index e319a1b62586..c3870b2a150d 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); + + static int kthread(void *_create) + { ++ static const struct sched_param param = { .sched_priority = 0 }; + /* Copy data: it's on kthread's stack */ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data) = create->threadfn; +@@ -300,6 +301,13 @@ static int kthread(void *_create) + init_completion(&self->parked); + current->vfork_done = &self->exited; + ++ /* ++ * The new thread inherited kthreadd's priority and CPU mask. Reset ++ * back to default in case they have been changed. 
++ */ ++ sched_setscheduler_nocheck(current, SCHED_NORMAL, &param); ++ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); ++ + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; +@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + } + task = create->result; + if (!IS_ERR(task)) { +- static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; + + /* +@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); +- /* +- * root may have changed our (kthreadd's) priority or CPU mask. +- * The kernel thread should not inherit these properties. +- */ +- sched_setscheduler_nocheck(task, SCHED_NORMAL, &param); +- set_cpus_allowed_ptr(task, +- housekeeping_cpumask(HK_FLAG_KTHREAD)); + } + kfree(create); + return task; +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index e6a282bc1665..ce3c8a4a5506 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -5470,6 +5470,7 @@ static noinstr void check_flags(unsigned long flags) + } + } + ++#ifndef CONFIG_PREEMPT_RT + /* + * We dont accurately track softirq state in e.g. + * hardirq contexts (such as on 4KSTACKS), so only +@@ -5484,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags) + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + } ++#endif + + if (!debug_locks) + print_irqtrace_events(current); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index b7fa3ee3aa1d..108b963a783b 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1135,8 +1135,26 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, + * which is wrong, as the other waiter is not in a deadlock + * situation.
+ */ +- if (owner == task) ++ if (owner == task) { ++#if defined(DEBUG_WW_MUTEXES) && defined(CONFIG_DEBUG_LOCKING_API_SELFTESTS) ++ /* ++ * The lockdep selftest for ww-mutex assumes in a few cases ++ * the ww_ctx->contending_lock assignment via ++ * __ww_mutex_check_kill() which does not happen if the rtmutex ++ * detects the deadlock early. ++ */ ++ if (build_ww_mutex() && ww_ctx) { ++ struct rt_mutex *rtm; ++ ++ /* Check whether the waiter should backout immediately */ ++ rtm = container_of(lock, struct rt_mutex, rtmutex); ++ ++ __ww_mutex_add_waiter(waiter, rtm, ww_ctx); ++ __ww_mutex_check_kill(rtm, waiter, ww_ctx); ++ } ++#endif + return -EDEADLK; ++ } + + raw_spin_lock(&task->pi_lock); + waiter->task = task; +diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c +index a461be2f873d..cb9fdff76a8a 100644 +--- a/kernel/locking/rtmutex_api.c ++++ b/kernel/locking/rtmutex_api.c +@@ -21,12 +21,13 @@ int max_lock_depth = 1024; + */ + static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, + unsigned int state, ++ struct lockdep_map *nest_lock, + unsigned int subclass) + { + int ret; + + might_sleep(); +- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); +@@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init); + */ + void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) + { +- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); ++ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + ++void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) ++{ ++ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); ++} ++EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); ++ + #else /* !CONFIG_DEBUG_LOCK_ALLOC */ + + /** +@@ -61,7 
+68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + */ + void __sched rt_mutex_lock(struct rt_mutex *lock) + { +- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); ++ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock); + #endif +@@ -77,10 +84,25 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); + */ + int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) + { +- return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); ++ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + ++/** ++ * rt_mutex_lock_killable - lock a rt_mutex killable ++ * ++ * @lock: the rt_mutex to be locked ++ * ++ * Returns: ++ * 0 on success ++ * -EINTR when interrupted by a signal ++ */ ++int __sched rt_mutex_lock_killable(struct rt_mutex *lock) ++{ ++ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); ++} ++EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); ++ + /** + * rt_mutex_trylock - try to lock a rt_mutex + * +diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c +index d2912e44d61f..9e396a09fe0f 100644 +--- a/kernel/locking/spinlock_rt.c ++++ b/kernel/locking/spinlock_rt.c +@@ -24,6 +24,17 @@ + #define RT_MUTEX_BUILD_SPINLOCKS + #include "rtmutex.c" + ++/* ++ * __might_resched() skips the state check as rtlocks are state ++ * preserving. Take RCU nesting into account as spin/read/write_lock() can ++ * legitimately nest into an RCU read side critical section. 
++ */ ++#define RTLOCK_RESCHED_OFFSETS \ ++ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) ++ ++#define rtlock_might_resched() \ ++ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) ++ + static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) + { + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) +@@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) + + static __always_inline void __rt_spin_lock(spinlock_t *lock) + { +- ___might_sleep(__FILE__, __LINE__, 0); ++ rtlock_might_resched(); + rtlock_lock(&lock->lock); + rcu_read_lock(); + migrate_disable(); +@@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock); + + void __sched rt_read_lock(rwlock_t *rwlock) + { +- ___might_sleep(__FILE__, __LINE__, 0); ++ rtlock_might_resched(); + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); +@@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock); + + void __sched rt_write_lock(rwlock_t *rwlock) + { +- ___might_sleep(__FILE__, __LINE__, 0); ++ rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); +@@ -246,12 +257,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) + } + EXPORT_SYMBOL(rt_write_unlock); + +-int __sched rt_rwlock_is_contended(rwlock_t *rwlock) +-{ +- return rw_base_is_contended(&rwlock->rwbase); +-} +-EXPORT_SYMBOL(rt_rwlock_is_contended); +- + #ifdef CONFIG_DEBUG_LOCK_ALLOC + void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) +diff --git a/kernel/panic.c b/kernel/panic.c +index 47933d4c769b..ea5269f486cc 100644 +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -245,12 +245,27 @@ void check_panic_on_warn(const char *origin) + void panic(const char *fmt, ...) 
+ { + static char buf[1024]; ++ va_list args2; + va_list args; + long i, i_next = 0, len; + int state = 0; + int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + ++ console_verbose(); ++ pr_emerg("Kernel panic - not syncing:\n"); ++ va_start(args2, fmt); ++ va_copy(args, args2); ++ vprintk(fmt, args2); ++ va_end(args2); ++#ifdef CONFIG_DEBUG_BUGVERBOSE ++ /* ++ * Avoid nested stack-dumping if a panic occurs during oops processing ++ */ ++ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) ++ dump_stack(); ++#endif ++ pr_flush(1000, true); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. +@@ -291,24 +306,13 @@ void panic(const char *fmt, ...) + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + panic_smp_self_stop(); + +- console_verbose(); + bust_spinlocks(1); +- va_start(args, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + +- pr_emerg("Kernel panic - not syncing: %s\n", buf); +-#ifdef CONFIG_DEBUG_BUGVERBOSE +- /* +- * Avoid nested stack-dumping if a panic occurs during oops processing +- */ +- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) +- dump_stack(); +-#endif +- + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left +@@ -617,9 +621,11 @@ static u64 oops_id; + + static int init_oops_id(void) + { ++#ifndef CONFIG_PREEMPT_RT + if (!oops_id) + get_random_bytes(&oops_id, sizeof(oops_id)); + else ++#endif + oops_id++; + + return 0; +@@ -630,6 +636,7 @@ static void print_oops_end_marker(void) + { + init_oops_id(); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); ++ pr_flush(1000, true); + } + + /* +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 8d856b7c2e5a..7f27cfee283e 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -44,6 +44,9 @@ + 
#include + #include + #include ++#include ++#include ++#include + #include + #include + #include +@@ -268,11 +271,6 @@ static void __up_console_sem(unsigned long ip) + */ + static int console_locked, console_suspended; + +-/* +- * If exclusive_console is non-NULL then only this console is to be printed to. +- */ +-static struct console *exclusive_console; +- + /* + * Array of consoles built from command line options (console=) + */ +@@ -352,10 +350,13 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; + * non-prinatable characters are escaped in the "\xff" notation. + */ + ++#ifdef CONFIG_PRINTK + /* syslog_lock protects syslog_* variables and write access to clear_seq. */ + static DEFINE_MUTEX(syslog_lock); + +-#ifdef CONFIG_PRINTK ++/* Set to enable sync mode. Once set, it is never cleared. */ ++static bool sync_mode; ++ + DECLARE_WAIT_QUEUE_HEAD(log_wait); + /* All 3 protected by @syslog_lock. */ + /* the next printk record to read by syslog(READ) or /proc/kmsg */ +@@ -363,17 +364,6 @@ static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-/* All 3 protected by @console_sem. */ +-/* the next printk record to write to the console */ +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; +- +-struct latched_seq { +- seqcount_latch_t latch; +- u64 val[2]; +-}; +- + /* + * The next printk record to read after the last 'clear' command. There are + * two copies (updated with seqcount_latch) so that reads can locklessly +@@ -391,9 +381,6 @@ static struct latched_seq clear_seq = { + #define PREFIX_MAX 32 + #endif + +-/* the maximum size of a formatted record (i.e. with prefix added per line) */ +-#define CONSOLE_LOG_MAX 1024 +- + /* the maximum size allowed to be reserved for a record */ + #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) + +@@ -437,7 +424,7 @@ bool printk_percpu_data_ready(void) + return __printk_percpu_data_ready; + } + +-/* Must be called under syslog_lock. 
*/ ++/* Must be called under associated write-protection lock. */ + static void latched_seq_write(struct latched_seq *ls, u64 val) + { + raw_write_seqcount_latch(&ls->latch); +@@ -1771,188 +1758,152 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) + return do_syslog(type, buf, len, SYSLOG_FROM_READER); + } + +-/* +- * Special console_lock variants that help to reduce the risk of soft-lockups. +- * They allow to pass console_lock to another printk() call using a busy wait. +- */ ++int printk_delay_msec __read_mostly; + +-#ifdef CONFIG_LOCKDEP +-static struct lockdep_map console_owner_dep_map = { +- .name = "console_owner" +-}; +-#endif ++static inline void printk_delay(int level) ++{ ++ boot_delay_msec(level); + +-static DEFINE_RAW_SPINLOCK(console_owner_lock); +-static struct task_struct *console_owner; +-static bool console_waiter; ++ if (unlikely(printk_delay_msec)) { ++ int m = printk_delay_msec; + +-/** +- * console_lock_spinning_enable - mark beginning of code where another +- * thread might safely busy wait +- * +- * This basically converts console_lock into a spinlock. This marks +- * the section where the console_lock owner can not sleep, because +- * there may be a waiter spinning (like a spinlock). Also it must be +- * ready to hand over the lock at the end of the section. 
+- */ +-static void console_lock_spinning_enable(void) ++ while (m--) { ++ mdelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ ++static bool kernel_sync_mode(void) + { +- raw_spin_lock(&console_owner_lock); +- console_owner = current; +- raw_spin_unlock(&console_owner_lock); ++ return (oops_in_progress || sync_mode); ++} + +- /* The waiter may spin on us after setting console_owner */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); ++static bool console_may_sync(struct console *con) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ return true; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ return true; ++ if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ return true; ++ return false; + } + +-/** +- * console_lock_spinning_disable_and_check - mark end of code where another +- * thread was able to busy wait and check if there is a waiter +- * +- * This is called at the end of the section where spinning is allowed. +- * It has two functions. First, it is a signal that it is no longer +- * safe to start busy waiting for the lock. Second, it checks if +- * there is a busy waiter and passes the lock rights to her. +- * +- * Important: Callers lose the lock if there was a busy waiter. +- * They must not touch items synchronized by console_lock +- * in this case. +- * +- * Return: 1 if the lock rights were passed, 0 otherwise. 
+- */ +-static int console_lock_spinning_disable_and_check(void) ++static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) + { +- int waiter; ++ if (!(con->flags & CON_ENABLED)) ++ return false; + +- raw_spin_lock(&console_owner_lock); +- waiter = READ_ONCE(console_waiter); +- console_owner = NULL; +- raw_spin_unlock(&console_owner_lock); ++ if (con->write_atomic && kernel_sync_mode()) { ++ con->write_atomic(con, text, text_len); ++ return true; ++ } + +- if (!waiter) { +- spin_release(&console_owner_dep_map, _THIS_IP_); +- return 0; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) { ++ if (console_trylock()) { ++ con->write_atomic(con, text, text_len); ++ console_unlock(); ++ return true; ++ } ++ ++ } else if (con->write && (con->flags & CON_BOOT) && !con->thread) { ++ if (console_trylock()) { ++ con->write(con, text, text_len); ++ console_unlock(); ++ return true; ++ } + } + +- /* The waiter is now free to continue */ +- WRITE_ONCE(console_waiter, false); ++ return false; ++} + +- spin_release(&console_owner_dep_map, _THIS_IP_); ++static bool have_atomic_console(void) ++{ ++ struct console *con; + +- /* +- * Hand off console_lock to waiter. The waiter will perform +- * the up(). After this, the waiter is the console_lock owner. +- */ +- mutex_release(&console_lock_dep_map, _THIS_IP_); +- return 1; ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic) ++ return true; ++ } ++ return false; + } + +-/** +- * console_trylock_spinning - try to get console_lock by busy waiting +- * +- * This allows to busy wait for the console_lock when the current +- * owner is running in specially marked sections. It means that +- * the current owner is running and cannot reschedule until it +- * is ready to lose the lock. 
+- * +- * Return: 1 if we got the lock, 0 othrewise +- */ +-static int console_trylock_spinning(void) ++static bool print_sync(struct console *con, u64 *seq) + { +- struct task_struct *owner = NULL; +- bool waiter; +- bool spin = false; +- unsigned long flags; ++ struct printk_info info; ++ struct printk_record r; ++ size_t text_len; + +- if (console_trylock()) +- return 1; ++ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); + +- printk_safe_enter_irqsave(flags); ++ if (!prb_read_valid(prb, *seq, &r)) ++ return false; + +- raw_spin_lock(&console_owner_lock); +- owner = READ_ONCE(console_owner); +- waiter = READ_ONCE(console_waiter); +- if (!waiter && owner && owner != current) { +- WRITE_ONCE(console_waiter, true); +- spin = true; +- } +- raw_spin_unlock(&console_owner_lock); ++ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + +- /* +- * If there is an active printk() writing to the +- * consoles, instead of having it write our data too, +- * see if we can offload that load from the active +- * printer, and do some printing ourselves. +- * Go into a spin only if there isn't already a waiter +- * spinning, and there is an active printer, and +- * that active printer isn't us (recursive printk?). +- */ +- if (!spin) { +- printk_safe_exit_irqrestore(flags); +- return 0; +- } ++ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) ++ return false; + +- /* We spin waiting for the owner to release us */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +- /* Owner will clear console_waiter on hand off */ +- while (READ_ONCE(console_waiter)) +- cpu_relax(); +- spin_release(&console_owner_dep_map, _THIS_IP_); ++ *seq = r.info->seq; + +- printk_safe_exit_irqrestore(flags); +- /* +- * The owner passed the console lock to us. +- * Since we did not spin on console lock, annotate +- * this as a trylock. Otherwise lockdep will +- * complain. 
+- */ +- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); + +- return 1; ++ if (text_len) ++ printk_delay(r.info->level); ++ ++ return true; + } + +-/* +- * Call the console drivers, asking them to write out +- * log_buf[start] to log_buf[end - 1]. +- * The console_lock must be held. +- */ +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) ++static u64 read_console_seq(struct console *con) + { +- static char dropped_text[64]; +- size_t dropped_len = 0; +- struct console *con; ++ u64 seq2; ++ u64 seq; + +- trace_console_rcuidle(text, len); ++ seq = latched_seq_read_nolock(&con->printk_seq); ++ seq2 = latched_seq_read_nolock(&con->printk_sync_seq); ++ if (seq2 > seq) ++ seq = seq2; ++#ifdef CONFIG_HAVE_NMI ++ seq2 = latched_seq_read_nolock(&con->printk_sync_nmi_seq); ++ if (seq2 > seq) ++ seq = seq2; ++#endif ++ return seq; ++} + +- if (!console_drivers) +- return; ++static void print_sync_until(struct console *con, u64 seq, bool is_locked) ++{ ++ u64 printk_seq; + +- if (console_dropped) { +- dropped_len = snprintf(dropped_text, sizeof(dropped_text), +- "** %lu printk messages dropped **\n", +- console_dropped); +- console_dropped = 0; +- } ++ while (!__printk_cpu_trylock()) ++ cpu_relax(); + +- for_each_console(con) { +- if (exclusive_console && con != exclusive_console) +- continue; +- if (!(con->flags & CON_ENABLED)) +- continue; +- if (!con->write) +- continue; +- if (!cpu_online(smp_processor_id()) && +- !(con->flags & CON_ANYTIME)) +- continue; +- if (con->flags & CON_EXTENDED) +- con->write(con, ext_text, ext_len); +- else { +- if (dropped_len) +- con->write(con, dropped_text, dropped_len); +- con->write(con, text, len); +- } ++ for (;;) { ++ printk_seq = read_console_seq(con); ++ if (printk_seq >= seq) ++ break; ++ if (!print_sync(con, &printk_seq)) ++ break; ++ ++ if (is_locked) 
++ latched_seq_write(&con->printk_seq, printk_seq + 1); ++#ifdef CONFIG_PRINTK_NMI ++ else if (in_nmi()) ++ latched_seq_write(&con->printk_sync_nmi_seq, printk_seq + 1); ++#endif ++ else ++ latched_seq_write(&con->printk_sync_seq, printk_seq + 1); + } ++ ++ __printk_cpu_unlock(); + } + + /* +@@ -2025,20 +1976,6 @@ static u8 *__printk_recursion_counter(void) + local_irq_restore(flags); \ + } while (0) + +-int printk_delay_msec __read_mostly; +- +-static inline void printk_delay(void) +-{ +- if (unlikely(printk_delay_msec)) { +- int m = printk_delay_msec; +- +- while (m--) { +- mdelay(1); +- touch_nmi_watchdog(); +- } +- } +-} +- + static inline u32 printk_caller_id(void) + { + return in_task() ? task_pid_nr(current) : +@@ -2126,6 +2063,7 @@ int vprintk_store(int facility, int level, + const u32 caller_id = printk_caller_id(); + struct prb_reserved_entry e; + enum printk_info_flags flags = 0; ++ bool final_commit = false; + struct printk_record r; + unsigned long irqflags; + u16 trunc_msg_len = 0; +@@ -2136,6 +2074,7 @@ int vprintk_store(int facility, int level, + u16 text_len; + int ret = 0; + u64 ts_nsec; ++ u64 seq; + + /* + * Since the duration of printk() can vary depending on the message +@@ -2174,6 +2113,7 @@ int vprintk_store(int facility, int level, + if (flags & LOG_CONT) { + prb_rec_init_wr(&r, reserve_size); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { ++ seq = r.info->seq; + text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, + facility, &flags, fmt, args); + r.info->text_len += text_len; +@@ -2181,6 +2121,7 @@ int vprintk_store(int facility, int level, + if (flags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); ++ final_commit = true; + } else { + prb_commit(&e); + } +@@ -2204,6 +2145,7 @@ int vprintk_store(int facility, int level, + if (!prb_reserve(&e, prb, &r)) + goto out; + } ++ seq = r.info->seq; + + /* fill message */ + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, 
&flags, fmt, args); +@@ -2219,13 +2161,25 @@ int vprintk_store(int facility, int level, + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + + /* A message without a trailing newline can be continued. */ +- if (!(flags & LOG_NEWLINE)) ++ if (!(flags & LOG_NEWLINE)) { + prb_commit(&e); +- else ++ } else { + prb_final_commit(&e); ++ final_commit = true; ++ } + + ret = text_len + trunc_msg_len; + out: ++ /* only the kernel may perform synchronous printing */ ++ if (facility == 0 && final_commit) { ++ struct console *con; ++ ++ for_each_console(con) { ++ if (console_may_sync(con)) ++ print_sync_until(con, seq + 1, false); ++ } ++ } ++ + printk_exit_irqrestore(recursion_ptr, irqflags); + return ret; + } +@@ -2235,40 +2189,16 @@ asmlinkage int vprintk_emit(int facility, int level, + const char *fmt, va_list args) + { + int printed_len; +- bool in_sched = false; + + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + +- if (level == LOGLEVEL_SCHED) { ++ if (level == LOGLEVEL_SCHED) + level = LOGLEVEL_DEFAULT; +- in_sched = true; +- } +- +- boot_delay_msec(level); +- printk_delay(); + + printed_len = vprintk_store(facility, level, dev_info, fmt, args); + +- /* If called from the scheduler, we can not call up(). */ +- if (!in_sched) { +- /* +- * Disable preemption to avoid being preempted while holding +- * console_sem which would prevent anyone from printing to +- * console +- */ +- preempt_disable(); +- /* +- * Try to acquire and then immediately release the console +- * semaphore. The release will print out buffers and wake up +- * /dev/kmsg and syslog() users. +- */ +- if (console_trylock_spinning()) +- console_unlock(); +- preempt_enable(); +- } +- + wake_up_klogd(); + return printed_len; + } +@@ -2293,37 +2223,162 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
+ } + EXPORT_SYMBOL(_printk); + +-#else /* CONFIG_PRINTK */ ++static int printk_kthread_func(void *data) ++{ ++ struct console *con = data; ++ unsigned long dropped = 0; ++ char *dropped_text = NULL; ++ struct printk_info info; ++ struct printk_record r; ++ char *ext_text = NULL; ++ size_t dropped_len; ++ int ret = -ENOMEM; ++ char *text = NULL; ++ char *write_text; ++ size_t len; ++ int error; ++ u64 seq; ++ ++ if (con->flags & CON_EXTENDED) { ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!ext_text) ++ goto out; ++ } ++ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ dropped_text = kmalloc(64, GFP_KERNEL); ++ if (!text || !dropped_text) ++ goto out; ++ if (con->flags & CON_EXTENDED) ++ write_text = ext_text; ++ else ++ write_text = text; ++ ++ seq = read_console_seq(con); + +-#define CONSOLE_LOG_MAX 0 +-#define printk_time false ++ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + +-#define prb_read_valid(rb, seq, r) false +-#define prb_first_valid_seq(rb) 0 ++ for (;;) { ++ error = wait_event_interruptible(log_wait, ++ prb_read_valid(prb, seq, &r) || kthread_should_stop()); + +-static u64 syslog_seq; +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; ++ if (kthread_should_stop()) ++ break; ++ ++ if (error) ++ continue; ++ ++ if (seq != r.info->seq) { ++ dropped += r.info->seq - seq; ++ seq = r.info->seq; ++ } ++ ++ seq++; ++ ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ ++ if (suppress_message_printing(r.info->level)) ++ continue; ++ ++ if (con->flags & CON_EXTENDED) { ++ len = info_print_ext_header(ext_text, ++ CONSOLE_EXT_LOG_MAX, ++ r.info); ++ len += msg_print_ext_body(ext_text + len, ++ CONSOLE_EXT_LOG_MAX - len, ++ &r.text_buf[0], r.info->text_len, ++ &r.info->dev_info); ++ } else { ++ len = record_print_text(&r, ++ console_msg_format & MSG_FORMAT_SYSLOG, ++ printk_time); ++ } ++ ++ console_lock(); ++ ++ /* ++ * Even though the printk kthread is always 
preemptible, it is ++ * still not allowed to call cond_resched() from within ++ * console drivers. The task may become non-preemptible in the ++ * console driver call chain. For example, vt_console_print() ++ * takes a spinlock and then can call into fbcon_redraw(), ++ * which can conditionally invoke cond_resched(). ++ */ ++ console_may_schedule = 0; ++ ++ if (kernel_sync_mode() && con->write_atomic) { ++ console_unlock(); ++ break; ++ } ++ ++ if (!(con->flags & CON_EXTENDED) && dropped) { ++ dropped_len = snprintf(dropped_text, 64, ++ "** %lu printk messages dropped **\n", ++ dropped); ++ dropped = 0; ++ ++ con->write(con, dropped_text, dropped_len); ++ printk_delay(r.info->level); ++ } ++ ++ con->write(con, write_text, len); ++ if (len) ++ printk_delay(r.info->level); + +-static size_t record_print_text(const struct printk_record *r, +- bool syslog, bool time) ++ latched_seq_write(&con->printk_seq, seq); ++ ++ console_unlock(); ++ } ++ ret = 0; ++out: ++ kfree(dropped_text); ++ kfree(text); ++ kfree(ext_text); ++ pr_info("%sconsole [%s%d]: printing thread stopped\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return ret; ++} ++ ++/* Must be called within console_lock(). */ ++static void start_printk_kthread(struct console *con) + { +- return 0; ++ con->thread = kthread_run(printk_kthread_func, con, ++ "pr/%s%d", con->name, con->index); ++ if (IS_ERR(con->thread)) { ++ pr_err("%sconsole [%s%d]: unable to start printing thread\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return; ++ } ++ pr_info("%sconsole [%s%d]: printing thread started\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); + } +-static ssize_t info_print_ext_header(char *buf, size_t size, +- struct printk_info *info) ++ ++/* protected by console_lock */ ++static bool kthreads_started; ++ ++/* Must be called within console_lock(). 
*/ ++static void console_try_thread(struct console *con) + { +- return 0; ++ if (kthreads_started) { ++ start_printk_kthread(con); ++ return; ++ } ++ ++ /* ++ * The printing threads have not been started yet. If this console ++ * can print synchronously, print all unprinted messages. ++ */ ++ if (console_may_sync(con)) { ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ print_sync_until(con, prb_next_seq(prb), true); ++ local_irq_restore(flags); ++ } + } +-static ssize_t msg_print_ext_body(char *buf, size_t size, +- char *text, size_t text_len, +- struct dev_printk_info *dev_info) { return 0; } +-static void console_lock_spinning_enable(void) { } +-static int console_lock_spinning_disable_and_check(void) { return 0; } +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) {} +-static bool suppress_message_printing(int level) { return false; } + + #endif /* CONFIG_PRINTK */ + +@@ -2580,34 +2635,6 @@ int is_console_locked(void) + } + EXPORT_SYMBOL(is_console_locked); + +-/* +- * Check if we have any console that is capable of printing while cpu is +- * booting or shutting down. Requires console_sem. +- */ +-static int have_callable_console(void) +-{ +- struct console *con; +- +- for_each_console(con) +- if ((con->flags & CON_ENABLED) && +- (con->flags & CON_ANYTIME)) +- return 1; +- +- return 0; +-} +- +-/* +- * Can we actually use the console at this time on this cpu? +- * +- * Console drivers may assume that per-cpu resources have been allocated. So +- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't +- * call them until this CPU is officially up. 
+- */ +-static inline int can_use_console(void) +-{ +- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +-} +- + /** + * console_unlock - unlock the console system + * +@@ -2624,140 +2651,13 @@ static inline int can_use_console(void) + */ + void console_unlock(void) + { +- static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[CONSOLE_LOG_MAX]; +- unsigned long flags; +- bool do_cond_resched, retry; +- struct printk_info info; +- struct printk_record r; +- u64 __maybe_unused next_seq; +- + if (console_suspended) { + up_console_sem(); + return; + } + +- prb_rec_init_rd(&r, &info, text, sizeof(text)); +- +- /* +- * Console drivers are called with interrupts disabled, so +- * @console_may_schedule should be cleared before; however, we may +- * end up dumping a lot of lines, for example, if called from +- * console registration path, and should invoke cond_resched() +- * between lines if allowable. Not doing so can cause a very long +- * scheduling stall on a slow console leading to RCU stall and +- * softlockup warnings which exacerbate the issue with more +- * messages practically incapacitating the system. +- * +- * console_trylock() is not able to detect the preemptive +- * context reliably. Therefore the value must be stored before +- * and cleared after the "again" goto label. +- */ +- do_cond_resched = console_may_schedule; +-again: +- console_may_schedule = 0; +- +- /* +- * We released the console_sem lock, so we need to recheck if +- * cpu is online and (if not) is there at least one CON_ANYTIME +- * console. 
+- */ +- if (!can_use_console()) { +- console_locked = 0; +- up_console_sem(); +- return; +- } +- +- for (;;) { +- size_t ext_len = 0; +- int handover; +- size_t len; +- +-skip: +- if (!prb_read_valid(prb, console_seq, &r)) +- break; +- +- if (console_seq != r.info->seq) { +- console_dropped += r.info->seq - console_seq; +- console_seq = r.info->seq; +- } +- +- if (suppress_message_printing(r.info->level)) { +- /* +- * Skip record we have buffered and already printed +- * directly to the console when we received it, and +- * record that has level above the console loglevel. +- */ +- console_seq++; +- goto skip; +- } +- +- /* Output to all consoles once old messages replayed. */ +- if (unlikely(exclusive_console && +- console_seq >= exclusive_console_stop_seq)) { +- exclusive_console = NULL; +- } +- +- /* +- * Handle extended console text first because later +- * record_print_text() will modify the record buffer in-place. +- */ +- if (nr_ext_console_drivers) { +- ext_len = info_print_ext_header(ext_text, +- sizeof(ext_text), +- r.info); +- ext_len += msg_print_ext_body(ext_text + ext_len, +- sizeof(ext_text) - ext_len, +- &r.text_buf[0], +- r.info->text_len, +- &r.info->dev_info); +- } +- len = record_print_text(&r, +- console_msg_format & MSG_FORMAT_SYSLOG, +- printk_time); +- console_seq++; +- +- /* +- * While actively printing out messages, if another printk() +- * were to occur on another CPU, it may wait for this one to +- * finish. This task can not be preempted if there is a +- * waiter waiting to take over. +- * +- * Interrupts are disabled because the hand over to a waiter +- * must not be interrupted until the hand over is completed +- * (@console_waiter is cleared). 
+- */ +- printk_safe_enter_irqsave(flags); +- console_lock_spinning_enable(); +- +- stop_critical_timings(); /* don't trace print latency */ +- call_console_drivers(ext_text, ext_len, text, len); +- start_critical_timings(); +- +- handover = console_lock_spinning_disable_and_check(); +- printk_safe_exit_irqrestore(flags); +- if (handover) +- return; +- +- if (do_cond_resched) +- cond_resched(); +- } +- +- /* Get consistent value of the next-to-be-used sequence number. */ +- next_seq = console_seq; +- + console_locked = 0; + up_console_sem(); +- +- /* +- * Someone could have filled up the buffer again, so re-check if there's +- * something to flush. In case we cannot trylock the console_sem again, +- * there's a new owner and the console_unlock() from them will do the +- * flush, no worries. +- */ +- retry = prb_read_valid(prb, next_seq, NULL); +- if (retry && console_trylock()) +- goto again; + } + EXPORT_SYMBOL(console_unlock); + +@@ -2807,18 +2707,20 @@ void console_unblank(void) + */ + void console_flush_on_panic(enum con_flush_mode mode) + { +- /* +- * If someone else is holding the console lock, trylock will fail +- * and may_schedule may be set. Ignore and proceed to unlock so +- * that messages are flushed out. As this can be called from any +- * context and we don't want to get preempted while flushing, +- * ensure may_schedule is cleared. 
+- */ +- console_trylock(); +- console_may_schedule = 0; ++ if (!console_trylock()) ++ return; ++ ++#ifdef CONFIG_PRINTK ++ if (mode == CONSOLE_REPLAY_ALL) { ++ struct console *c; ++ u64 seq; ++ ++ seq = prb_first_valid_seq(prb); ++ for_each_console(c) ++ latched_seq_write(&c->printk_seq, seq); ++ } ++#endif + +- if (mode == CONSOLE_REPLAY_ALL) +- console_seq = prb_first_valid_seq(prb); + console_unlock(); + } + +@@ -2954,6 +2856,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) + void register_console(struct console *newcon) + { + struct console *bcon = NULL; ++ u64 __maybe_unused seq = 0; + int err; + + for_each_console(bcon) { +@@ -2976,6 +2879,8 @@ void register_console(struct console *newcon) + } + } + ++ newcon->thread = NULL; ++ + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + +@@ -3017,8 +2922,10 @@ void register_console(struct console *newcon) + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ +- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) ++ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + newcon->flags &= ~CON_PRINTBUFFER; ++ newcon->flags |= CON_HANDOVER; ++ } + + /* + * Put this console in the list - keep the +@@ -3040,27 +2947,21 @@ void register_console(struct console *newcon) + if (newcon->flags & CON_EXTENDED) + nr_ext_console_drivers++; + +- if (newcon->flags & CON_PRINTBUFFER) { +- /* +- * console_unlock(); will print out the buffered messages +- * for us. +- * +- * We're about to replay the log buffer. Only do this to the +- * just-registered console to avoid excessive message spam to +- * the already-registered consoles. +- * +- * Set exclusive_console with disabled interrupts to reduce +- * race window with eventual console_flush_on_panic() that +- * ignores console_lock. 
+- */ +- exclusive_console = newcon; +- exclusive_console_stop_seq = console_seq; ++#ifdef CONFIG_PRINTK ++ if (!(newcon->flags & CON_PRINTBUFFER)) ++ seq = prb_next_seq(prb); + +- /* Get a consistent copy of @syslog_seq. */ +- mutex_lock(&syslog_lock); +- console_seq = syslog_seq; +- mutex_unlock(&syslog_lock); +- } ++ seqcount_latch_init(&newcon->printk_seq.latch); ++ latched_seq_write(&newcon->printk_seq, seq); ++ seqcount_latch_init(&newcon->printk_sync_seq.latch); ++ latched_seq_write(&newcon->printk_sync_seq, seq); ++#ifdef CONFIG_HAVE_NMI ++ seqcount_latch_init(&newcon->printk_sync_nmi_seq.latch); ++ latched_seq_write(&newcon->printk_sync_nmi_seq, seq); ++#endif ++ ++ console_try_thread(newcon); ++#endif /* CONFIG_PRINTK */ + console_unlock(); + console_sysfs_notify(); + +@@ -3134,6 +3035,9 @@ int unregister_console(struct console *console) + console_unlock(); + console_sysfs_notify(); + ++ if (console->thread && !IS_ERR(console->thread)) ++ kthread_stop(console->thread); ++ + if (console->exit) + res = console->exit(console); + +@@ -3216,6 +3120,15 @@ static int __init printk_late_init(void) + unregister_console(con); + } + } ++ ++#ifdef CONFIG_PRINTK ++ console_lock(); ++ for_each_console(con) ++ start_printk_kthread(con); ++ kthreads_started = true; ++ console_unlock(); ++#endif ++ + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); +@@ -3239,14 +3152,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) + { + int pending = this_cpu_xchg(printk_pending, 0); + +- if (pending & PRINTK_PENDING_OUTPUT) { +- /* If trylock fails, someone else is doing the printing */ +- if (console_trylock()) +- console_unlock(); +- } +- + if (pending & PRINTK_PENDING_WAKEUP) +- wake_up_interruptible(&log_wait); ++ wake_up_interruptible_all(&log_wait); + } + + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = +@@ -3293,7 +3200,7 @@ void defer_console_output(void) + + void 
printk_trigger_flush(void) + { +- defer_console_output(); ++ wake_up_klogd(); + } + + int vprintk_deferred(const char *fmt, va_list args) +@@ -3444,6 +3351,24 @@ void kmsg_dump(enum kmsg_dump_reason reason) + { + struct kmsg_dumper *dumper; + ++ if (!oops_in_progress) { ++ /* ++ * If atomic consoles are available, activate kernel sync mode ++ * to make sure any final messages are visible. The trailing ++ * printk message is important to flush any pending messages. ++ */ ++ if (have_atomic_console()) { ++ sync_mode = true; ++ pr_info("enabled sync mode\n"); ++ } ++ ++ /* ++ * Give the printing threads time to flush, allowing up to ++ * 1s of no printing forward progress before giving up. ++ */ ++ pr_flush(1000, true); ++ } ++ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + enum kmsg_dump_reason max_reason = dumper->max_reason; +@@ -3626,6 +3551,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + #ifdef CONFIG_SMP + static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); + static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); ++static unsigned int kgdb_cpu = -1; + + /** + * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant +@@ -3705,6 +3631,9 @@ EXPORT_SYMBOL(__printk_cpu_trylock); + */ + void __printk_cpu_unlock(void) + { ++ bool trigger_kgdb = false; ++ unsigned int cpu; ++ + if (atomic_read(&printk_cpulock_nested)) { + atomic_dec(&printk_cpulock_nested); + return; +@@ -3715,6 +3644,12 @@ void __printk_cpu_unlock(void) + * LMM(__printk_cpu_unlock:A) + */ + ++ cpu = smp_processor_id(); ++ if (kgdb_cpu == cpu) { ++ trigger_kgdb = true; ++ kgdb_cpu = -1; ++ } ++ + /* + * Guarantee loads and stores from this CPU when it was the + * lock owner are visible to the next lock owner. 
This pairs +@@ -3735,6 +3670,98 @@ void __printk_cpu_unlock(void) + */ + atomic_set_release(&printk_cpulock_owner, + -1); /* LMM(__printk_cpu_unlock:B) */ ++ ++ if (trigger_kgdb) { ++ pr_warn("re-triggering kgdb roundup for CPU#%d\n", cpu); ++ kgdb_roundup_cpu(cpu); ++ } + } + EXPORT_SYMBOL(__printk_cpu_unlock); ++ ++bool kgdb_roundup_delay(unsigned int cpu) ++{ ++ if (cpu != atomic_read(&printk_cpulock_owner)) ++ return false; ++ ++ kgdb_cpu = cpu; ++ return true; ++} ++EXPORT_SYMBOL(kgdb_roundup_delay); + #endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_PRINTK ++static void pr_msleep(bool may_sleep, int ms) ++{ ++ if (may_sleep) { ++ msleep(ms); ++ } else { ++ while (ms--) ++ udelay(1000); ++ } ++} ++ ++/** ++ * pr_flush() - Wait for printing threads to catch up. ++ * ++ * @timeout_ms: The maximum time (in ms) to wait. ++ * @reset_on_progress: Reset the timeout if forward progress is seen. ++ * ++ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 ++ * represents infinite waiting. ++ * ++ * If @reset_on_progress is true, the timeout will be reset whenever any ++ * printer has been seen to make some forward progress. ++ * ++ * Context: Any context. ++ * Return: true if all enabled printers are caught up. 
++ */ ++bool pr_flush(int timeout_ms, bool reset_on_progress) ++{ ++ int remaining = timeout_ms; ++ struct console *con; ++ u64 last_diff = 0; ++ bool may_sleep; ++ u64 printk_seq; ++ u64 diff; ++ u64 seq; ++ ++ may_sleep = (preemptible() && ++ !in_softirq() && ++ system_state >= SYSTEM_RUNNING); ++ ++ seq = prb_next_seq(prb); ++ ++ for (;;) { ++ diff = 0; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ printk_seq = read_console_seq(con); ++ if (printk_seq < seq) ++ diff += seq - printk_seq; ++ } ++ ++ if (diff != last_diff && reset_on_progress) ++ remaining = timeout_ms; ++ ++ if (diff == 0 || remaining == 0) ++ break; ++ ++ if (remaining < 0) { ++ pr_msleep(may_sleep, 100); ++ } else if (remaining < 100) { ++ pr_msleep(may_sleep, remaining); ++ remaining = 0; ++ } else { ++ pr_msleep(may_sleep, 100); ++ remaining -= 100; ++ } ++ ++ last_diff = diff; ++ } ++ ++ return (diff == 0); ++} ++EXPORT_SYMBOL(pr_flush); ++#endif /* CONFIG_PRINTK */ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 0cf547531ddf..0df2de214daa 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -197,7 +197,18 @@ static bool ptrace_freeze_traced(struct task_struct *task) + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { ++#ifdef CONFIG_PREEMPT_RT ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (READ_ONCE(task->__state) & __TASK_TRACED) ++ WRITE_ONCE(task->__state, __TASK_TRACED); ++ else ++ task->saved_state = __TASK_TRACED; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++#else + WRITE_ONCE(task->__state, __TASK_TRACED); ++#endif + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); +@@ -207,7 +218,11 @@ static bool ptrace_freeze_traced(struct task_struct *task) + + static void ptrace_unfreeze_traced(struct task_struct *task) + { +- if (READ_ONCE(task->__state) != __TASK_TRACED) ++ unsigned long flags; ++ bool 
frozen = true; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ++ READ_ONCE(task->__state) != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); +@@ -217,12 +232,21 @@ static void ptrace_unfreeze_traced(struct task_struct *task) + * Recheck state under the lock to close this race. + */ + spin_lock_irq(&task->sighand->siglock); +- if (READ_ONCE(task->__state) == __TASK_TRACED) { +- if (__fatal_signal_pending(task)) +- wake_up_state(task, __TASK_TRACED); +- else +- WRITE_ONCE(task->__state, TASK_TRACED); +- } ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (READ_ONCE(task->__state) == __TASK_TRACED) ++ WRITE_ONCE(task->__state, TASK_TRACED); ++ ++#ifdef CONFIG_PREEMPT_RT ++ else if (task->saved_state == __TASK_TRACED) ++ task->saved_state = TASK_TRACED; ++#endif ++ else ++ frozen = false; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ ++ if (frozen && __fatal_signal_pending(task)) ++ wake_up_state(task, __TASK_TRACED); ++ + spin_unlock_irq(&task->sighand->siglock); + } + +diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h +index 94b8ee84bc78..509ea934305c 100644 +--- a/kernel/rcu/tasks.h ++++ b/kernel/rcu/tasks.h +@@ -1362,7 +1362,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) + rttd->notrun = true; + } + +-static void rcu_tasks_initiate_self_tests(void) ++void rcu_tasks_initiate_self_tests(void) + { + pr_info("Running RCU-tasks wait API self tests\n"); + #ifdef CONFIG_TASKS_RCU +@@ -1399,9 +1399,7 @@ static int rcu_tasks_verify_self_tests(void) + return ret; + } + late_initcall(rcu_tasks_verify_self_tests); +-#else /* #ifdef CONFIG_PROVE_RCU */ +-static void rcu_tasks_initiate_self_tests(void) { } +-#endif /* #else #ifdef CONFIG_PROVE_RCU */ ++#endif /* #ifdef CONFIG_PROVE_RCU */ + + void __init rcu_init_tasks_generic(void) + { +@@ -1416,9 +1414,6 @@ void __init rcu_init_tasks_generic(void) + #ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); + #endif +- +- // Run the self-tests. 
+- rcu_tasks_initiate_self_tests(); + } + + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index df016f6d0662..8ea272f7eb18 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2280,13 +2280,13 @@ rcu_report_qs_rdp(struct rcu_data *rdp) + { + unsigned long flags; + unsigned long mask; +- bool needwake = false; +- const bool offloaded = rcu_rdp_is_offloaded(rdp); ++ bool offloaded, needwake = false; + struct rcu_node *rnp; + + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); ++ offloaded = rcu_rdp_is_offloaded(rdp); + if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || + rdp->gpwrap) { + +@@ -2448,7 +2448,7 @@ static void rcu_do_batch(struct rcu_data *rdp) + int div; + bool __maybe_unused empty; + unsigned long flags; +- const bool offloaded = rcu_rdp_is_offloaded(rdp); ++ bool offloaded; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + long bl, count = 0; +@@ -2474,6 +2474,7 @@ static void rcu_do_batch(struct rcu_data *rdp) + rcu_nocb_lock(rdp); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + pending = rcu_segcblist_n_cbs(&rdp->cblist); ++ offloaded = rcu_rdp_is_offloaded(rdp); + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d34a56f16d13..cd0983900823 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -75,7 +75,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ ++#ifdef CONFIG_PREEMPT_RT ++const_debug unsigned int sysctl_sched_nr_migrate = 8; ++#else + const_debug unsigned int sysctl_sched_nr_migrate = 32; ++#endif + + /* + * period over which we measure -rt task CPU usage in us. 
+@@ -983,6 +987,46 @@ void resched_curr(struct rq *rq) + trace_sched_wake_idle_without_ipi(cpu); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++ ++static int tsk_is_polling(struct task_struct *p) ++{ ++#ifdef TIF_POLLING_NRFLAG ++ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); ++#else ++ return 0; ++#endif ++} ++ ++void resched_curr_lazy(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ if (!sched_feat(PREEMPT_LAZY)) { ++ resched_curr(rq); ++ return; ++ } ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ if (test_tsk_need_resched_lazy(curr)) ++ return; ++ ++ set_tsk_need_resched_lazy(curr); ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) ++ return; ++ ++ /* NEED_RESCHED_LAZY must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(curr)) ++ smp_send_reschedule(cpu); ++} ++#endif ++ + void resched_cpu(int cpu) + { + struct rq *rq = cpu_rq(cpu); +@@ -2141,6 +2185,7 @@ void migrate_disable(void) + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; ++ preempt_lazy_disable(); + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_disable); +@@ -2152,6 +2197,8 @@ void migrate_enable(void) + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; ++ } else if (WARN_ON_ONCE(p->migration_disabled == 0)) { ++ return; + } + + /* +@@ -2169,6 +2216,7 @@ void migrate_enable(void) + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; ++ preempt_lazy_enable(); + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_enable); +@@ -3235,7 +3283,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state + * is actually now running somewhere else! 
+ */ + while (task_running(rq, p)) { +- if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) ++ if (match_state && !task_match_state_lock(p, match_state)) + return 0; + cpu_relax(); + } +@@ -3250,7 +3298,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; +- if (!match_state || READ_ONCE(p->__state) == match_state) ++ if (!match_state || task_match_state_or_saved(p, match_state)) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &rf); + +@@ -3284,7 +3332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + +@@ -4427,6 +4475,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->on_cpu = 0; + #endif + init_task_preempt_count(p); ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(p)->preempt_lazy_count = 0; ++#endif + #ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +@@ -4922,20 +4973,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) + */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); +- mmdrop(mm); ++ mmdrop_sched(mm); + } + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + + /* +- * Remove function-return probe instances associated with this +- * task and put them back on the free list. ++ * Release VMAP'ed task stack immediate for reuse. On RT ++ * enabled kernels this is delayed for latency reasons. + */ +- kprobe_flush_task(prev); +- +- /* Task is done with its stack. 
*/ +- put_task_stack(prev); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ put_task_stack(prev); + + put_task_struct_rcu_user(prev); + } +@@ -6335,6 +6384,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + next = pick_next_task(rq, prev, &rf); + clear_tsk_need_resched(prev); ++ clear_tsk_need_resched_lazy(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +@@ -6556,6 +6606,30 @@ static void __sched notrace preempt_schedule_common(void) + } while (need_resched()); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++/* ++ * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is ++ * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as ++ * preempt_lazy_count counter >0. ++ */ ++static __always_inline int preemptible_lazy(void) ++{ ++ if (test_thread_flag(TIF_NEED_RESCHED)) ++ return 1; ++ if (current_thread_info()->preempt_lazy_count) ++ return 0; ++ return 1; ++} ++ ++#else ++ ++static inline int preemptible_lazy(void) ++{ ++ return 1; ++} ++ ++#endif ++ + #ifdef CONFIG_PREEMPTION + /* + * This is the entry point to schedule() from in-kernel preemption +@@ -6569,7 +6643,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) + */ + if (likely(!preemptible())) + return; +- ++ if (!preemptible_lazy()) ++ return; + preempt_schedule_common(); + } + NOKPROBE_SYMBOL(preempt_schedule); +@@ -6602,6 +6677,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + if (likely(!preemptible())) + return; + ++ if (!preemptible_lazy()) ++ return; ++ + do { + /* + * Because the function tracer can trace preempt_count_sub() +@@ -8754,7 +8832,9 @@ void __init init_idle(struct task_struct *idle, int cpu) + + /* Set the preempt count _outside_ the spinlocks! 
*/ + init_idle_preempt_count(idle, cpu); +- ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(idle)->preempt_lazy_count = 0; ++#endif + /* + * The idle tasks have their own, simple scheduling class: + */ +@@ -9555,14 +9635,8 @@ void __init sched_init(void) + } + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP +-static inline int preempt_count_equals(int preempt_offset) +-{ +- int nested = preempt_count() + rcu_preempt_depth(); +- +- return (nested == preempt_offset); +-} + +-void __might_sleep(const char *file, int line, int preempt_offset) ++void __might_sleep(const char *file, int line) + { + unsigned int state = get_current_state(); + /* +@@ -9576,11 +9650,32 @@ void __might_sleep(const char *file, int line, int preempt_offset) + (void *)current->task_state_change, + (void *)current->task_state_change); + +- ___might_sleep(file, line, preempt_offset); ++ __might_resched(file, line, 0); + } + EXPORT_SYMBOL(__might_sleep); + +-void ___might_sleep(const char *file, int line, int preempt_offset) ++static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) ++{ ++ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) ++ return; ++ ++ if (preempt_count() == preempt_offset) ++ return; ++ ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, ip); ++} ++ ++static inline bool resched_offsets_ok(unsigned int offsets) ++{ ++ unsigned int nested = preempt_count(); ++ ++ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; ++ ++ return nested == offsets; ++} ++ ++void __might_resched(const char *file, int line, unsigned int offsets) + { + /* Ratelimiting timestamp: */ + static unsigned long prev_jiffy; +@@ -9590,7 +9685,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) + /* WARN_ON_ONCE() by default, no rate limit required: */ + rcu_sleep_check(); + +- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ if ((resched_offsets_ok(offsets) && !irqs_disabled() && + !is_idle_task(current) && !current->non_block_count) || + system_state 
== SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) +@@ -9603,29 +9698,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) + /* Save this before calling printk(), since that will clobber it: */ + preempt_disable_ip = get_preempt_disable_ip(current); + +- printk(KERN_ERR +- "BUG: sleeping function called from invalid context at %s:%d\n", +- file, line); +- printk(KERN_ERR +- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", +- in_atomic(), irqs_disabled(), current->non_block_count, +- current->pid, current->comm); ++ pr_err("BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ pr_err("preempt_count: %x, expected: %x\n", preempt_count(), ++ offsets & MIGHT_RESCHED_PREEMPT_MASK); ++ ++ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { ++ pr_err("RCU nest depth: %d, expected: %u\n", ++ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); ++ } + + if (task_stack_end_corrupted(current)) +- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ pr_emerg("Thread overran stack, or stack corrupted\n"); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); +- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) +- && !preempt_count_equals(preempt_offset)) { +- pr_err("Preemption disabled at:"); +- print_ip_sym(KERN_ERR, preempt_disable_ip); +- } ++ ++ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, ++ preempt_disable_ip); ++ + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } +-EXPORT_SYMBOL(___might_sleep); ++EXPORT_SYMBOL(__might_resched); + + void __cant_sleep(const char *file, int line, int preempt_offset) + { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 646a6ae4b250..c02ecc105f0c 100644 +--- a/kernel/sched/fair.c ++++ 
b/kernel/sched/fair.c +@@ -4651,7 +4651,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. +@@ -4675,7 +4675,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + return; + + if (delta > ideal_runtime) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static void +@@ -4821,7 +4821,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + * validating it and just reschedule. + */ + if (queued) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + return; + } + /* +@@ -4961,7 +4961,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static __always_inline +@@ -5724,7 +5724,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + + if (delta < 0) { + if (task_current(rq, p)) +- resched_curr(rq); ++ resched_curr_lazy(rq); + return; + } + hrtick_start(rq, delta); +@@ -7449,7 +7449,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; + + preempt: +- resched_curr(rq); ++ resched_curr_lazy(rq); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved +@@ -11508,7 +11508,7 @@ static void task_fork_fair(struct task_struct *p) + * 'current' within the tree based on its new key value. 
+ */ + swap(curr->vruntime, se->vruntime); +- resched_curr(rq); ++ resched_curr_lazy(rq); + } + + se->vruntime -= cfs_rq->min_vruntime; +@@ -11535,7 +11535,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + */ + if (task_current(rq, p)) { + if (p->prio > oldprio) +- resched_curr(rq); ++ resched_curr_lazy(rq); + } else + check_preempt_curr(rq, p, 0); + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index c4947c1b5edb..e13090e33f3c 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -46,11 +46,19 @@ SCHED_FEAT(DOUBLE_TICK, false) + */ + SCHED_FEAT(NONTASK_CAPACITY, true) + ++#ifdef CONFIG_PREEMPT_RT ++SCHED_FEAT(TTWU_QUEUE, false) ++# ifdef CONFIG_PREEMPT_LAZY ++SCHED_FEAT(PREEMPT_LAZY, true) ++# endif ++#else ++ + /* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ + SCHED_FEAT(TTWU_QUEUE, true) ++#endif + + /* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 6312f1904825..36483b794a00 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2327,6 +2327,15 @@ extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + ++#ifdef CONFIG_PREEMPT_LAZY ++extern void resched_curr_lazy(struct rq *rq); ++#else ++static inline void resched_curr_lazy(struct rq *rq) ++{ ++ resched_curr(rq); ++} ++#endif ++ + extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c +index e1c655f928c7..f230b1ac7f91 100644 +--- a/kernel/sched/swait.c ++++ b/kernel/sched/swait.c +@@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q) + struct swait_queue *curr; + LIST_HEAD(tmp); + ++ WARN_ON(irqs_disabled()); + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 4e8698e62f07..3d0157bd4e14 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd) + #ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); +- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); ++ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); + #endif + + rd->visit_gen = 0; +diff --git a/kernel/signal.c b/kernel/signal.c +index c7dbb19219b9..0bbd89fbf240 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -1324,6 +1324,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, + struct k_sigaction *action; + int sig = info->si_signo; + ++ /* ++ * On some archs, PREEMPT_RT has to delay sending a signal from a trap ++ * since it can not enable preemption, and the signal code's spin_locks ++ * turn into mutexes. 
Instead, it must set TIF_NOTIFY_RESUME which will ++ * send the signal on exit of the trap. ++ */ ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (in_atomic()) { ++ struct task_struct *t = current; ++ ++ if (WARN_ON_ONCE(t->forced_info.si_signo)) ++ return 0; ++ ++ if (is_si_special(info)) { ++ WARN_ON_ONCE(info != SEND_SIG_PRIV); ++ t->forced_info.si_signo = info->si_signo; ++ t->forced_info.si_errno = 0; ++ t->forced_info.si_code = SI_KERNEL; ++ t->forced_info.si_pid = 0; ++ t->forced_info.si_uid = 0; ++ } else { ++ t->forced_info = *info; ++ } ++ ++ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); ++ return 0; ++ } ++#endif + spin_lock_irqsave(&t->sighand->siglock, flags); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; +@@ -2308,16 +2336,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t + if (gstop_done && ptrace_reparented(current)) + do_notify_parent_cldstop(current, false, why); + +- /* +- * Don't want to allow preemption here, because +- * sys_ptrace() needs this task to be inactive. +- * +- * XXX: implement read_unlock_no_resched(). 
+- */ +- preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); +- preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); + } else { +diff --git a/kernel/smp.c b/kernel/smp.c +index 82825345432c..9d3c8c56d904 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -690,10 +690,20 @@ void flush_smp_call_function_from_idle(void) + + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, + smp_processor_id(), CFD_SEQ_IDLE); ++ + local_irq_save(flags); + flush_smp_call_function_queue(true); +- if (local_softirq_pending()) +- do_softirq(); ++ ++ if (local_softirq_pending()) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ do_softirq(); ++ } else { ++ struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); ++ ++ if (ksoftirqd && !task_is_running(ksoftirqd)) ++ wake_up_process(ksoftirqd); ++ } ++ } + + local_irq_restore(flags); + } +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 0202f23ae960..7fc118c87b9d 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2646,7 +2646,13 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; +- return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | ++#ifdef CONFIG_PREEMPT_LAZY ++ if (need_resched_lazy()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; ++#endif ++ ++ return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) | ++ (preempt_lazy_count() & 0xff) << 16 | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; + } + +@@ -4227,15 +4233,17 @@ unsigned long trace_total_entries(struct trace_array *tr) + + static void print_lat_help_header(struct seq_file *m) + { +- seq_puts(m, "# _------=> CPU# \n" +- "# / _-----=> irqs-off \n" +- "# | / _----=> need-resched \n" +- "# || / _---=> hardirq/softirq \n" +- "# ||| / _--=> preempt-depth \n" +- "# |||| / _-=> migrate-disable \n" +- "# ||||| / delay 
\n" +- "# cmd pid |||||| time | caller \n" +- "# \\ / |||||| \\ | / \n"); ++ seq_puts(m, "# _--------=> CPU# \n" ++ "# / _-------=> irqs-off \n" ++ "# | / _------=> need-resched \n" ++ "# || / _-----=> need-resched-lazy\n" ++ "# ||| / _----=> hardirq/softirq \n" ++ "# |||| / _---=> preempt-depth \n" ++ "# ||||| / _--=> preempt-lazy-depth\n" ++ "# |||||| / _-=> migrate-disable \n" ++ "# ||||||| / delay \n" ++ "# cmd pid |||||||| time | caller \n" ++ "# \\ / |||||||| \\ | / \n"); + } + + static void print_event_info(struct array_buffer *buf, struct seq_file *m) +@@ -4269,14 +4277,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file + + print_event_info(buf, m); + +- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); +- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); +- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); +- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); +- seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); +- seq_printf(m, "# %.*s|||| / delay\n", prec, space); +- seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); +- seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); ++ seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); ++ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); ++ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); ++ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); ++ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); ++ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); ++ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); ++ seq_printf(m, "# %.*s|||||| / delay\n", prec, space); ++ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); ++ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); + } + + void +diff --git a/kernel/trace/trace_events.c 
b/kernel/trace/trace_events.c +index 160298d285c0..9ec3c6c38cc3 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -193,6 +193,7 @@ static int trace_define_common_fields(void) + /* Holds both preempt_count and migrate_disable */ + __common_field(unsigned char, preempt_count); + __common_field(int, pid); ++ __common_field(unsigned char, preempt_lazy_count); + + return ret; + } +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index 6b4d3f3abdae..460bc8245e4a 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -451,6 +451,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + { + char hardsoft_irq; + char need_resched; ++ char need_resched_lazy; + char irqs_off; + int hardirq; + int softirq; +@@ -481,6 +482,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + break; + } + ++ need_resched_lazy = ++ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; ++ + hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : +@@ -489,14 +493,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + softirq ? 's' : + '.' 
; + +- trace_seq_printf(s, "%c%c%c", +- irqs_off, need_resched, hardsoft_irq); ++ trace_seq_printf(s, "%c%c%c%c", ++ irqs_off, need_resched, need_resched_lazy, ++ hardsoft_irq); + + if (entry->preempt_count & 0xf) + trace_seq_printf(s, "%x", entry->preempt_count & 0xf); + else + trace_seq_putc(s, '.'); + ++ if (entry->preempt_lazy_count) ++ trace_seq_printf(s, "%x", entry->preempt_lazy_count); ++ else ++ trace_seq_putc(s, '.'); ++ + if (entry->preempt_count & 0xf0) + trace_seq_printf(s, "%x", entry->preempt_count >> 4); + else +diff --git a/lib/bug.c b/lib/bug.c +index 45a0584f6541..03a87df69ed2 100644 +--- a/lib/bug.c ++++ b/lib/bug.c +@@ -206,6 +206,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) + else + pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", + (void *)bugaddr); ++ pr_flush(1000, true); + + return BUG_TRAP_TYPE_BUG; + } +diff --git a/lib/dump_stack.c b/lib/dump_stack.c +index 6b7f1bf6715d..6e8ae42c7e27 100644 +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -102,9 +102,9 @@ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) + * Permit this cpu to perform nested stack dumps while serialising + * against other CPUs + */ +- printk_cpu_lock_irqsave(flags); ++ raw_printk_cpu_lock_irqsave(flags); + __dump_stack(log_lvl); +- printk_cpu_unlock_irqrestore(flags); ++ raw_printk_cpu_unlock_irqrestore(flags); + } + EXPORT_SYMBOL(dump_stack_lvl); + +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d58e..2b9f797642f6 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -191,11 +191,13 @@ static int irq_poll_cpu_dead(unsigned int cpu) + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ ++ local_bh_disable(); + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); ++ local_bh_enable(); + + return 0; + } +diff --git 
a/lib/locking-selftest.c b/lib/locking-selftest.c +index 161108e5d2fe..1266ea3726d7 100644 +--- a/lib/locking-selftest.c ++++ b/lib/locking-selftest.c +@@ -26,6 +26,12 @@ + #include + #include + ++#ifdef CONFIG_PREEMPT_RT ++# define NON_RT(...) ++#else ++# define NON_RT(...) __VA_ARGS__ ++#endif ++ + /* + * Change this to 1 if you want to see the failure printouts: + */ +@@ -139,7 +145,7 @@ static DEFINE_RT_MUTEX(rtmutex_Z2); + + #endif + +-static local_lock_t local_A = INIT_LOCAL_LOCK(local_A); ++static DEFINE_PER_CPU(local_lock_t, local_A); + + /* + * non-inlined runtime initializers, to let separate locks share +@@ -712,12 +718,18 @@ GENERATE_TESTCASE(ABCDBCDA_rtmutex); + + #undef E + ++#ifdef CONFIG_PREEMPT_RT ++# define RT_PREPARE_DBL_UNLOCK() { migrate_disable(); rcu_read_lock(); } ++#else ++# define RT_PREPARE_DBL_UNLOCK() ++#endif + /* + * Double unlock: + */ + #define E() \ + \ + LOCK(A); \ ++ RT_PREPARE_DBL_UNLOCK(); \ + UNLOCK(A); \ + UNLOCK(A); /* fail */ + +@@ -802,6 +814,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_spin) + +@@ -810,10 +823,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) ++#endif + + #undef E1 + #undef E2 + ++#ifndef CONFIG_PREEMPT_RT + /* + * Enabling hardirqs with a softirq-safe lock held: + */ +@@ -846,6 +861,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + #undef E1 + #undef E2 + ++#endif ++ + /* + * Enabling irqs with an irq-safe lock held: + */ +@@ -875,6 +892,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include 
"locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_spin) + +@@ -883,6 +901,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -921,6 +940,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_spin) + +@@ -929,6 +949,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -969,6 +990,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_spin) + +@@ -977,6 +999,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1031,6 +1054,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_spin) + +@@ -1039,6 +1063,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1206,12 +1231,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_rlock) + #include "locking-selftest-wlock.h" + 
GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-softirq.h" + #include "locking-selftest-rlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_rlock) + + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1252,12 +1279,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_rlock) + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-softirq.h" + #include "locking-selftest-rlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) + + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1306,12 +1335,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_rlock) + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-softirq.h" + #include "locking-selftest-rlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_rlock) + + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) ++#endif + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) +@@ -1320,7 +1351,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) + # define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) + # define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) + # define I_WW(x) lockdep_reset_lock(&x.dep_map) +-# define I_LOCAL_LOCK(x) lockdep_reset_lock(&local_##x.dep_map) ++# define I_LOCAL_LOCK(x) lockdep_reset_lock(this_cpu_ptr(&local_##x.dep_map)) + #ifdef CONFIG_RT_MUTEXES + # define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map) + #endif +@@ -1380,7 +1411,7 @@ static void 
reset_locks(void) + init_shared_classes(); + raw_spin_lock_init(&raw_lock_A); + raw_spin_lock_init(&raw_lock_B); +- local_lock_init(&local_A); ++ local_lock_init(this_cpu_ptr(&local_A)); + + ww_mutex_init(&o, &ww_lockdep); ww_mutex_init(&o2, &ww_lockdep); ww_mutex_init(&o3, &ww_lockdep); + memset(&t, 0, sizeof(t)); memset(&t2, 0, sizeof(t2)); +@@ -1398,7 +1429,13 @@ static int unexpected_testcase_failures; + + static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) + { +- unsigned long saved_preempt_count = preempt_count(); ++ int saved_preempt_count = preempt_count(); ++#ifdef CONFIG_PREEMPT_RT ++#ifdef CONFIG_SMP ++ int saved_mgd_count = current->migration_disabled; ++#endif ++ int saved_rcu_count = current->rcu_read_lock_nesting; ++#endif + + WARN_ON(irqs_disabled()); + +@@ -1432,6 +1469,18 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) + * count, so restore it: + */ + preempt_count_set(saved_preempt_count); ++ ++#ifdef CONFIG_PREEMPT_RT ++#ifdef CONFIG_SMP ++ while (current->migration_disabled > saved_mgd_count) ++ migrate_enable(); ++#endif ++ ++ while (current->rcu_read_lock_nesting > saved_rcu_count) ++ rcu_read_unlock(); ++ WARN_ON_ONCE(current->rcu_read_lock_nesting < saved_rcu_count); ++#endif ++ + #ifdef CONFIG_TRACE_IRQFLAGS + if (softirq_count()) + current->softirqs_enabled = 0; +@@ -1499,7 +1548,7 @@ static inline void print_testname(const char *testname) + + #define DO_TESTCASE_2x2RW(desc, name, nr) \ + DO_TESTCASE_2RW("hard-"desc, name##_hard, nr) \ +- DO_TESTCASE_2RW("soft-"desc, name##_soft, nr) \ ++ NON_RT(DO_TESTCASE_2RW("soft-"desc, name##_soft, nr)) \ + + #define DO_TESTCASE_6x2x2RW(desc, name) \ + DO_TESTCASE_2x2RW(desc, name, 123); \ +@@ -1547,19 +1596,19 @@ static inline void print_testname(const char *testname) + + #define DO_TESTCASE_2I(desc, name, nr) \ + DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_1("soft-"desc, name##_soft, nr); ++ 
NON_RT(DO_TESTCASE_1("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_2IB(desc, name, nr) \ + DO_TESTCASE_1B("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_1B("soft-"desc, name##_soft, nr); ++ NON_RT(DO_TESTCASE_1B("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_6I(desc, name, nr) \ + DO_TESTCASE_3("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_3("soft-"desc, name##_soft, nr); ++ NON_RT(DO_TESTCASE_3("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_6IRW(desc, name, nr) \ + DO_TESTCASE_3RW("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_3RW("soft-"desc, name##_soft, nr); ++ NON_RT(DO_TESTCASE_3RW("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_2x3(desc, name) \ + DO_TESTCASE_3(desc, name, 12); \ +@@ -1651,6 +1700,20 @@ static void ww_test_fail_acquire(void) + #endif + } + ++#ifdef CONFIG_PREEMPT_RT ++#define ww_mutex_base_lock(b) rt_mutex_lock(b) ++#define ww_mutex_base_lock_nest_lock(b, b2) rt_mutex_lock_nest_lock(b, b2) ++#define ww_mutex_base_lock_interruptible(b) rt_mutex_lock_interruptible(b) ++#define ww_mutex_base_lock_killable(b) rt_mutex_lock_killable(b) ++#define ww_mutex_base_unlock(b) rt_mutex_unlock(b) ++#else ++#define ww_mutex_base_lock(b) mutex_lock(b) ++#define ww_mutex_base_lock_nest_lock(b, b2) mutex_lock_nest_lock(b, b2) ++#define ww_mutex_base_lock_interruptible(b) mutex_lock_interruptible(b) ++#define ww_mutex_base_lock_killable(b) mutex_lock_killable(b) ++#define ww_mutex_base_unlock(b) mutex_unlock(b) ++#endif ++ + static void ww_test_normal(void) + { + int ret; +@@ -1665,50 +1728,50 @@ static void ww_test_normal(void) + + /* mutex_lock (and indirectly, mutex_lock_nested) */ + o.ctx = (void *)~0UL; +- mutex_lock(&o.base); +- mutex_unlock(&o.base); ++ ww_mutex_base_lock(&o.base); ++ ww_mutex_base_unlock(&o.base); + WARN_ON(o.ctx != (void *)~0UL); + + /* mutex_lock_interruptible (and *_nested) */ + o.ctx = (void *)~0UL; +- ret = mutex_lock_interruptible(&o.base); ++ ret = 
ww_mutex_base_lock_interruptible(&o.base); + if (!ret) +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + else + WARN_ON(1); + WARN_ON(o.ctx != (void *)~0UL); + + /* mutex_lock_killable (and *_nested) */ + o.ctx = (void *)~0UL; +- ret = mutex_lock_killable(&o.base); ++ ret = ww_mutex_base_lock_killable(&o.base); + if (!ret) +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + else + WARN_ON(1); + WARN_ON(o.ctx != (void *)~0UL); + + /* trylock, succeeding */ + o.ctx = (void *)~0UL; +- ret = mutex_trylock(&o.base); ++ ret = ww_mutex_base_trylock(&o.base); + WARN_ON(!ret); + if (ret) +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + else + WARN_ON(1); + WARN_ON(o.ctx != (void *)~0UL); + + /* trylock, failing */ + o.ctx = (void *)~0UL; +- mutex_lock(&o.base); +- ret = mutex_trylock(&o.base); ++ ww_mutex_base_lock(&o.base); ++ ret = ww_mutex_base_trylock(&o.base); + WARN_ON(ret); +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + WARN_ON(o.ctx != (void *)~0UL); + + /* nest_lock */ + o.ctx = (void *)~0UL; +- mutex_lock_nest_lock(&o.base, &t); +- mutex_unlock(&o.base); ++ ww_mutex_base_lock_nest_lock(&o.base, &t); ++ ww_mutex_base_unlock(&o.base); + WARN_ON(o.ctx != (void *)~0UL); + } + +@@ -1721,7 +1784,7 @@ static void ww_test_two_contexts(void) + static void ww_test_diff_class(void) + { + WWAI(&t); +-#ifdef CONFIG_DEBUG_MUTEXES ++#ifdef DEBUG_WW_MUTEXES + t.ww_class = NULL; + #endif + WWL(&o, &t); +@@ -1785,7 +1848,7 @@ static void ww_test_edeadlk_normal(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + o2.ctx = &t2; + mutex_release(&o2.base.dep_map, _THIS_IP_); + +@@ -1801,7 +1864,7 @@ static void ww_test_edeadlk_normal(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + WWU(&o); + + WWL(&o2, &t); +@@ -1811,7 +1874,7 @@ static void ww_test_edeadlk_normal_slow(void) + { + int ret; + +- mutex_lock(&o2.base); 
++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1827,7 +1890,7 @@ static void ww_test_edeadlk_normal_slow(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + WWU(&o); + + ww_mutex_lock_slow(&o2, &t); +@@ -1837,7 +1900,7 @@ static void ww_test_edeadlk_no_unlock(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + o2.ctx = &t2; + mutex_release(&o2.base.dep_map, _THIS_IP_); + +@@ -1853,7 +1916,7 @@ static void ww_test_edeadlk_no_unlock(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + + WWL(&o2, &t); + } +@@ -1862,7 +1925,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1878,7 +1941,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + + ww_mutex_lock_slow(&o2, &t); + } +@@ -1887,7 +1950,7 @@ static void ww_test_edeadlk_acquire_more(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1908,7 +1971,7 @@ static void ww_test_edeadlk_acquire_more_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1929,11 +1992,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +- mutex_lock(&o3.base); ++ ww_mutex_base_lock(&o3.base); + mutex_release(&o3.base.dep_map, _THIS_IP_); + o3.ctx = &t2; + +@@ 
-1955,11 +2018,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +- mutex_lock(&o3.base); ++ ww_mutex_base_lock(&o3.base); + mutex_release(&o3.base.dep_map, _THIS_IP_); + o3.ctx = &t2; + +@@ -1980,7 +2043,7 @@ static void ww_test_edeadlk_acquire_wrong(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -2005,7 +2068,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -2646,8 +2709,8 @@ static void wait_context_tests(void) + + static void local_lock_2(void) + { +- local_lock_acquire(&local_A); /* IRQ-ON */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IRQ-ON */ ++ local_unlock(&local_A); + + HARDIRQ_ENTER(); + spin_lock(&lock_A); /* IN-IRQ */ +@@ -2656,18 +2719,18 @@ static void local_lock_2(void) + + HARDIRQ_DISABLE(); + spin_lock(&lock_A); +- local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ ++ local_unlock(&local_A); + spin_unlock(&lock_A); + HARDIRQ_ENABLE(); + } + + static void local_lock_3A(void) + { +- local_lock_acquire(&local_A); /* IRQ-ON */ ++ local_lock(&local_A); /* IRQ-ON */ + spin_lock(&lock_B); /* IRQ-ON */ + spin_unlock(&lock_B); +- local_lock_release(&local_A); ++ local_unlock(&local_A); + + HARDIRQ_ENTER(); + spin_lock(&lock_A); /* IN-IRQ */ +@@ -2676,18 +2739,18 @@ static void local_lock_3A(void) + + HARDIRQ_DISABLE(); + spin_lock(&lock_A); +- local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only 
if we count local_lock(), false */ ++ local_unlock(&local_A); + spin_unlock(&lock_A); + HARDIRQ_ENABLE(); + } + + static void local_lock_3B(void) + { +- local_lock_acquire(&local_A); /* IRQ-ON */ ++ local_lock(&local_A); /* IRQ-ON */ + spin_lock(&lock_B); /* IRQ-ON */ + spin_unlock(&lock_B); +- local_lock_release(&local_A); ++ local_unlock(&local_A); + + HARDIRQ_ENTER(); + spin_lock(&lock_A); /* IN-IRQ */ +@@ -2696,8 +2759,8 @@ static void local_lock_3B(void) + + HARDIRQ_DISABLE(); + spin_lock(&lock_A); +- local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ ++ local_unlock(&local_A); + spin_unlock(&lock_A); + HARDIRQ_ENABLE(); + +@@ -2812,7 +2875,7 @@ void locking_selftest(void) + printk("------------------------\n"); + printk("| Locking API testsuite:\n"); + printk("----------------------------------------------------------------------------\n"); +- printk(" | spin |wlock |rlock |mutex | wsem | rsem |\n"); ++ printk(" | spin |wlock |rlock |mutex | wsem | rsem |rtmutex\n"); + printk(" --------------------------------------------------------------------------\n"); + + init_shared_classes(); +@@ -2885,12 +2948,11 @@ void locking_selftest(void) + DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", W1R2_R2R3_W3W1); + + printk(" --------------------------------------------------------------------------\n"); +- + /* + * irq-context testcases: + */ + DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); +- DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); ++ NON_RT(DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A)); + DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); + DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); + DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); +diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c +index 199ab201d501..06410209197a 100644 +--- a/lib/nmi_backtrace.c ++++ 
b/lib/nmi_backtrace.c +@@ -99,7 +99,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) + * Allow nested NMI backtraces while serializing + * against other CPUs. + */ +- printk_cpu_lock_irqsave(flags); ++ raw_printk_cpu_lock_irqsave(flags); + if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", + cpu, (void *)instruction_pointer(regs)); +@@ -110,7 +110,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) + else + dump_stack(); + } +- printk_cpu_unlock_irqrestore(flags); ++ raw_printk_cpu_unlock_irqrestore(flags); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return true; + } +diff --git a/lib/scatterlist.c b/lib/scatterlist.c +index abb3432ed744..d5e82e4a57ad 100644 +--- a/lib/scatterlist.c ++++ b/lib/scatterlist.c +@@ -828,8 +828,7 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) + * stops @miter. + * + * Context: +- * Don't care if @miter is stopped, or not proceeded yet. +- * Otherwise, preemption disabled if the SG_MITER_ATOMIC is set. ++ * Don't care. + * + * Returns: + * true if @miter contains the valid mapping. false if end of sg +@@ -865,8 +864,7 @@ EXPORT_SYMBOL(sg_miter_skip); + * @miter->addr and @miter->length point to the current mapping. + * + * Context: +- * Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled +- * till @miter is stopped. May sleep if !SG_MITER_ATOMIC. ++ * May sleep if !SG_MITER_ATOMIC. + * + * Returns: + * true if @miter contains the next mapping. false if end of sg +@@ -906,8 +904,7 @@ EXPORT_SYMBOL(sg_miter_next); + * need to be released during iteration. + * + * Context: +- * Preemption disabled if the SG_MITER_ATOMIC is set. Don't care +- * otherwise. ++ * Don't care otherwise. 
+ */ + void sg_miter_stop(struct sg_mapping_iter *miter) + { +@@ -922,7 +919,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) + flush_dcache_page(miter->page); + + if (miter->__flags & SG_MITER_ATOMIC) { +- WARN_ON_ONCE(preemptible()); ++ WARN_ON_ONCE(!pagefault_disabled()); + kunmap_atomic(miter->addr); + } else + kunmap(miter->page); +diff --git a/localversion-rt b/localversion-rt +new file mode 100644 +index 000000000000..e2eb19782d4c +--- /dev/null ++++ b/localversion-rt +@@ -0,0 +1 @@ ++-rt65 +diff --git a/mm/Kconfig b/mm/Kconfig +index c048dea7e342..88778414465b 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -371,7 +371,7 @@ config NOMMU_INITIAL_TRIM_EXCESS + + config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" +- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE ++ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT + select COMPACTION + select XARRAY_MULTI + help +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index b68b2fe639fd..71b7b7371595 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -654,6 +654,35 @@ static u64 flush_next_time; + + #define FLUSH_TIME (2UL*HZ) + ++/* ++ * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can ++ * not rely on this as part of an acquired spinlock_t lock. These functions are ++ * never used in hardirq context on PREEMPT_RT and therefore disabling preemption ++ * is sufficient.
++ */ ++static void memcg_stats_lock(void) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ preempt_disable(); ++#else ++ VM_BUG_ON(!irqs_disabled()); ++#endif ++} ++ ++static void __memcg_stats_lock(void) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ preempt_disable(); ++#endif ++} ++ ++static void memcg_stats_unlock(void) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ preempt_enable(); ++#endif ++} ++ + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) + { + unsigned int x; +@@ -737,6 +766,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + ++ /* ++ * The callers from rmap rely on disabled preemption because they never ++ * update their counter from in-interrupt context. For these two ++ * counters we check that the update is never performed from an ++ * interrupt context while other callers need to have interrupts disabled. ++ */ ++ __memcg_stats_lock(); ++ if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ switch (idx) { ++ case NR_ANON_MAPPED: ++ case NR_FILE_MAPPED: ++ case NR_ANON_THPS: ++ case NR_SHMEM_PMDMAPPED: ++ case NR_FILE_PMDMAPPED: ++ WARN_ON_ONCE(!in_task()); ++ break; ++ default: ++ WARN_ON_ONCE(!irqs_disabled()); ++ } ++ } ++ + /* Update memcg */ + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + +@@ -744,6 +794,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg, val); ++ memcg_stats_unlock(); + } + + /** +@@ -844,8 +895,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + if (mem_cgroup_disabled()) + return; + ++ memcg_stats_lock(); + __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + memcg_rstat_updated(memcg, count); ++ memcg_stats_unlock(); + } + + static unsigned long memcg_events(struct mem_cgroup *memcg, int event) +@@ -909,6 +962,9 @@ static bool
mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + */ + static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) + { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return; ++ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { +@@ -2102,39 +2158,37 @@ void unlock_page_memcg(struct page *page) + } + EXPORT_SYMBOL(unlock_page_memcg); + +-struct obj_stock { ++struct memcg_stock_pcp { ++ local_lock_t stock_lock; ++ struct mem_cgroup *cached; /* this never be root cgroup */ ++ unsigned int nr_pages; ++ + #ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cached_objcg; + struct pglist_data *cached_pgdat; + unsigned int nr_bytes; + int nr_slab_reclaimable_b; + int nr_slab_unreclaimable_b; +-#else +- int dummy[0]; + #endif +-}; +- +-struct memcg_stock_pcp { +- struct mem_cgroup *cached; /* this never be root cgroup */ +- unsigned int nr_pages; +- struct obj_stock task_obj; +- struct obj_stock irq_obj; + + struct work_struct work; + unsigned long flags; + #define FLUSHING_CACHED_CHARGE 0 + }; +-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); ++static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { ++ .stock_lock = INIT_LOCAL_LOCK(stock_lock), ++}; + static DEFINE_MUTEX(percpu_charge_mutex); + + #ifdef CONFIG_MEMCG_KMEM +-static void drain_obj_stock(struct obj_stock *stock); ++static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg); + + #else +-static inline void drain_obj_stock(struct obj_stock *stock) ++static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) + { ++ return NULL; + } + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +@@ -2144,41 +2198,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + #endif + + /* +- * Most 
kmem_cache_alloc() calls are from user context. The irq disable/enable +- * sequence used in this case to access content from object stock is slow. +- * To optimize for user context access, there are now two object stocks for +- * task context and interrupt context access respectively. +- * +- * The task context object stock can be accessed by disabling preemption only +- * which is cheap in non-preempt kernel. The interrupt context object stock +- * can only be accessed after disabling interrupt. User context code can +- * access interrupt object stock, but not vice versa. +- */ +-static inline struct obj_stock *get_obj_stock(unsigned long *pflags) +-{ +- struct memcg_stock_pcp *stock; +- +- if (likely(in_task())) { +- *pflags = 0UL; +- preempt_disable(); +- stock = this_cpu_ptr(&memcg_stock); +- return &stock->task_obj; +- } +- +- local_irq_save(*pflags); +- stock = this_cpu_ptr(&memcg_stock); +- return &stock->irq_obj; +-} +- +-static inline void put_obj_stock(unsigned long flags) +-{ +- if (likely(in_task())) +- preempt_enable(); +- else +- local_irq_restore(flags); +-} +- +-/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. 
+@@ -2198,7 +2217,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (nr_pages > MEMCG_CHARGE_BATCH) + return ret; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { +@@ -2206,7 +2225,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + return ret; + } +@@ -2235,6 +2254,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) + static void drain_local_stock(struct work_struct *dummy) + { + struct memcg_stock_pcp *stock; ++ struct obj_cgroup *old = NULL; + unsigned long flags; + + /* +@@ -2242,28 +2262,25 @@ static void drain_local_stock(struct work_struct *dummy) + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled + */ +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); +- drain_obj_stock(&stock->irq_obj); +- if (in_task()) +- drain_obj_stock(&stock->task_obj); ++ old = drain_obj_stock(stock); + drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ++ if (old) ++ obj_cgroup_put(old); + } + + /* + * Cache charges(val) to local per_cpu area. + * This will be consumed by consume_stock() function, later. 
+ */ +-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ++static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + { + struct memcg_stock_pcp *stock; +- unsigned long flags; +- +- local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached != memcg) { /* reset if necessary */ +@@ -2275,8 +2292,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + + if (stock->nr_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock); ++} + +- local_irq_restore(flags); ++static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ++{ ++ unsigned long flags; ++ ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ __refill_stock(memcg, nr_pages); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + } + + /* +@@ -2296,7 +2320,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. 
+ */ +- curcpu = get_cpu(); ++ migrate_disable(); ++ curcpu = smp_processor_id(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *memcg; +@@ -2319,7 +2344,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + schedule_work_on(cpu, &stock->work); + } + } +- put_cpu(); ++ migrate_enable(); + mutex_unlock(&percpu_charge_mutex); + } + +@@ -3084,17 +3109,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) + void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr) + { ++ struct memcg_stock_pcp *stock; ++ struct obj_cgroup *old = NULL; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); + int *bytes; + ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ stock = this_cpu_ptr(&memcg_stock); ++ + /* + * Save vmstat data in stock and skip vmstat array update unless + * accumulating over a page of vmstat data or when pgdat or idx + * changes. + */ + if (stock->cached_objcg != objcg) { +- drain_obj_stock(stock); ++ old = drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; +@@ -3138,38 +3167,53 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + if (nr) + mod_objcg_mlstate(objcg, pgdat, idx, nr); + +- put_obj_stock(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ++ if (old) ++ obj_cgroup_put(old); + } + + static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + { ++ struct memcg_stock_pcp *stock; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); + bool ret = false; + ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ ++ stock = this_cpu_ptr(&memcg_stock); + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + stock->nr_bytes -= nr_bytes; + ret = true; + } + +- put_obj_stock(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + return ret; + } + +-static void drain_obj_stock(struct obj_stock *stock) ++static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) + { + struct obj_cgroup *old = stock->cached_objcg; + + if (!old) +- return; ++ return NULL; + + if (stock->nr_bytes) { + unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; + unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); + +- if (nr_pages) +- obj_cgroup_uncharge_pages(old, nr_pages); ++ if (nr_pages) { ++ struct mem_cgroup *memcg; ++ ++ memcg = get_mem_cgroup_from_objcg(old); ++ ++ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) ++ page_counter_uncharge(&memcg->kmem, nr_pages); ++ ++ __refill_stock(memcg, nr_pages); ++ ++ css_put(&memcg->css); ++ } + + /* + * The leftover is flushed to the centralized per-memcg value. +@@ -3204,8 +3248,12 @@ static void drain_obj_stock(struct obj_stock *stock) + stock->cached_pgdat = NULL; + } + +- obj_cgroup_put(old); + stock->cached_objcg = NULL; ++ /* ++ * The `old' objects needs to be released by the caller via ++ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
++ */ ++ return old; + } + + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +@@ -3213,13 +3261,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + { + struct mem_cgroup *memcg; + +- if (in_task() && stock->task_obj.cached_objcg) { +- memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); +- if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) +- return true; +- } +- if (stock->irq_obj.cached_objcg) { +- memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); ++ if (stock->cached_objcg) { ++ memcg = obj_cgroup_memcg(stock->cached_objcg); + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + return true; + } +@@ -3230,12 +3273,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + bool allow_uncharge) + { ++ struct memcg_stock_pcp *stock; ++ struct obj_cgroup *old = NULL; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); + unsigned int nr_pages = 0; + ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ ++ stock = this_cpu_ptr(&memcg_stock); + if (stock->cached_objcg != objcg) { /* reset if necessary */ +- drain_obj_stock(stock); ++ old = drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->cached_objcg = objcg; + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) +@@ -3249,7 +3296,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + stock->nr_bytes &= (PAGE_SIZE - 1); + } + +- put_obj_stock(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ++ if (old) ++ obj_cgroup_put(old); + + if (nr_pages) + obj_cgroup_uncharge_pages(objcg, nr_pages); +@@ -3816,8 +3865,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + } + break; + case RES_SOFT_LIMIT: +- memcg->soft_limit = nr_pages; +- ret = 0; ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ ret = -EOPNOTSUPP; ++ } else { ++ memcg->soft_limit = nr_pages; ++ ret = 0; ++ } 
+ break; + } + return ret ?: nbytes; +@@ -4798,6 +4851,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *endp; + int ret; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return -EOPNOTSUPP; ++ + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); +@@ -6889,7 +6945,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) + unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; +- bool use_objcg = PageMemcgKmem(page); + + VM_BUG_ON_PAGE(PageLRU(page), page); + +@@ -6898,7 +6953,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) + * page memcg or objcg at this point, we have fully + * exclusive access to the page. + */ +- if (use_objcg) { ++ if (PageMemcgKmem(page)) { + objcg = __page_objcg(page); + /* + * This get matches the put at the end of the function and +@@ -6926,7 +6981,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) + + nr_pages = compound_nr(page); + +- if (use_objcg) { ++ if (PageMemcgKmem(page)) { + ug->nr_memory += nr_pages; + ug->nr_kmem += nr_pages; + +@@ -7256,8 +7311,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. 
+ */ +- VM_BUG_ON(!irqs_disabled()); ++ memcg_stats_lock(); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); ++ memcg_stats_unlock(); + memcg_check_events(memcg, page); + + css_put(&memcg->css); +diff --git a/mm/memory.c b/mm/memory.c +index 8d71a82462dd..e2a9f89bbcf2 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5305,7 +5305,7 @@ void __might_fault(const char *file, int line) + return; + if (pagefault_disabled()) + return; +- __might_sleep(file, line, 0); ++ __might_sleep(file, line); + #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) + if (current->mm) + might_lock_read(¤t->mm->mmap_lock); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f320ee2bd34a..33355028122a 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3149,9 +3149,9 @@ static void drain_local_pages_wq(struct work_struct *work) + * cpu which is alright but we also have to make sure to not move to + * a different one. + */ +- preempt_disable(); ++ migrate_disable(); + drain_local_pages(drain->zone); +- preempt_enable(); ++ migrate_enable(); + } + + /* +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index 3e482209a1c4..1a59b7b4ff67 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1918,11 +1918,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) + return ERR_PTR(err); + } + +- vbq = &get_cpu_var(vmap_block_queue); ++ get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + spin_lock(&vbq->lock); + list_add_tail_rcu(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); +- put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + + return vaddr; + } +@@ -2001,7 +2002,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + order = get_order(size); + + rcu_read_lock(); +- vbq = &get_cpu_var(vmap_block_queue); ++ get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + unsigned long pages_off; + +@@ -2024,7 +2026,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + break; + } + +- 
put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + rcu_read_unlock(); + + /* Allocate new block if nothing was found */ +diff --git a/mm/workingset.c b/mm/workingset.c +index 880d882f3325..2a9ed5aeb6fa 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -433,6 +433,8 @@ static struct list_lru shadow_nodes; + + void workingset_update_node(struct xa_node *node) + { ++ struct address_space *mapping; ++ + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. +@@ -441,7 +443,8 @@ void workingset_update_node(struct xa_node *node) + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by the i_pages lock. + */ +- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ ++ mapping = container_of(node->array, struct address_space, i_pages); ++ lockdep_assert_held(&mapping->i_pages.xa_lock); + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index 439deb8decbc..a66431853394 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + + #define ZSPAGE_MAGIC 0x58 + +@@ -77,6 +78,20 @@ + + #define ZS_HANDLE_SIZE (sizeof(unsigned long)) + ++#ifdef CONFIG_PREEMPT_RT ++ ++struct zsmalloc_handle { ++ unsigned long addr; ++ spinlock_t lock; ++}; ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) ++ ++#else ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) ++#endif ++ + /* + * Object location (, ) is encoded as + * a single (unsigned long) handle value. 
+@@ -293,6 +308,7 @@ struct zspage { + }; + + struct mapping_area { ++ local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} + + static int create_cache(struct zs_pool *pool) + { +- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, ++ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, + 0, 0, NULL); + if (!pool->handle_cachep) + return 1; +@@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool *pool) + + static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + { +- return (unsigned long)kmem_cache_alloc(pool->handle_cachep, +- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++ void *p; ++ ++ p = kmem_cache_alloc(pool->handle_cachep, ++ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++#ifdef CONFIG_PREEMPT_RT ++ if (p) { ++ struct zsmalloc_handle *zh = p; ++ ++ spin_lock_init(&zh->lock); ++ } ++#endif ++ return (unsigned long)p; + } + ++#ifdef CONFIG_PREEMPT_RT ++static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) ++{ ++ return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1)); ++} ++#endif ++ + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) + { + kmem_cache_free(pool->handle_cachep, (void *)handle); +@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) + + static void record_obj(unsigned long handle, unsigned long obj) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ WRITE_ONCE(zh->addr, obj); ++#else + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. 
+ */ + WRITE_ONCE(*(unsigned long *)handle, obj); ++#endif + } + + /* zpool driver */ +@@ -455,7 +494,9 @@ MODULE_ALIAS("zpool-zsmalloc"); + #endif /* CONFIG_ZPOOL */ + + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +-static DEFINE_PER_CPU(struct mapping_area, zs_map_area); ++static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static bool is_zspage_isolated(struct zspage *zspage) + { +@@ -862,7 +903,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) + + static unsigned long handle_to_obj(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return zh->addr; ++#else + return *(unsigned long *)handle; ++#endif + } + + static unsigned long obj_to_head(struct page *page, void *obj) +@@ -876,22 +923,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) + + static inline int testpin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_is_locked(&zh->lock); ++#else + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static inline int trypin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_trylock(&zh->lock); ++#else + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void pin_tag(unsigned long handle) __acquires(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_lock(&zh->lock); ++#else + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void unpin_tag(unsigned long handle) __releases(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_unlock(&zh->lock); ++#else + 
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void reset_page(struct page *page) +@@ -1274,7 +1345,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + class = pool->size_class[class_idx]; + off = (class->size * obj_idx) & ~PAGE_MASK; + +- area = &get_cpu_var(zs_map_area); ++ local_lock(&zs_map_area.lock); ++ area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ +@@ -1328,7 +1400,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + + __zs_unmap_object(area, pages, off, class->size); + } +- put_cpu_var(zs_map_area); ++ local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); + unpin_tag(handle); +diff --git a/net/Kconfig b/net/Kconfig +index 76a3385943e5..bd7386eede23 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -292,7 +292,7 @@ config CGROUP_NET_CLASSID + + config NET_RX_BUSY_POLL + bool +- default y ++ default y if !PREEMPT_RT + + config BQL + bool +diff --git a/net/core/dev.c b/net/core/dev.c +index 4d698ccf4172..4bed27338ed9 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -225,14 +225,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) + static inline void rps_lock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_lock(&sd->input_pkt_queue.lock); ++ raw_spin_lock(&sd->input_pkt_queue.raw_lock); + #endif + } + + static inline void rps_unlock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_unlock(&sd->input_pkt_queue.lock); ++ raw_spin_unlock(&sd->input_pkt_queue.raw_lock); + #endif + } + +@@ -3046,6 +3046,7 @@ static void __netif_reschedule(struct Qdisc *q) + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + + void __netif_schedule(struct Qdisc *q) +@@ -3108,6 +3109,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum 
skb_free_reason reason) + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__dev_kfree_skb_irq); + +@@ -3841,7 +3843,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + * This permits qdisc->running owner to get the lock more + * often and dequeue packets faster. + */ ++#ifdef CONFIG_PREEMPT_RT ++ contended = true; ++#else + contended = qdisc_is_running(q); ++#endif + if (unlikely(contended)) + spin_lock(&q->busylock); + +@@ -4669,6 +4675,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + rps_unlock(sd); + + local_irq_restore(flags); ++ preempt_check_resched_rt(); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); +@@ -4909,7 +4916,7 @@ static int netif_rx_internal(struct sk_buff *skb) + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + +- preempt_disable(); ++ migrate_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); +@@ -4919,14 +4926,14 @@ static int netif_rx_internal(struct sk_buff *skb) + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); +- preempt_enable(); ++ migrate_enable(); + } else + #endif + { + unsigned int qtail; + +- ret = enqueue_to_backlog(skb, get_cpu(), &qtail); +- put_cpu(); ++ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); ++ put_cpu_light(); + } + return ret; + } +@@ -4965,11 +4972,9 @@ int netif_rx_ni(struct sk_buff *skb) + + trace_netif_rx_ni_entry(skb); + +- preempt_disable(); ++ local_bh_disable(); + err = netif_rx_internal(skb); +- if (local_softirq_pending()) +- do_softirq(); +- preempt_enable(); ++ local_bh_enable(); + trace_netif_rx_ni_exit(err); + + return err; +@@ -6413,12 +6418,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) + sd->rps_ipi_list = NULL; + + local_irq_enable(); ++ preempt_check_resched_rt(); + + /* Send pending IPI's to kick RPS processing on 
remote cpus. */ + net_rps_send_ipi(remsd); + } else + #endif + local_irq_enable(); ++ preempt_check_resched_rt(); + } + + static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +@@ -6496,6 +6503,7 @@ void __napi_schedule(struct napi_struct *n) + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__napi_schedule); + +@@ -11316,6 +11324,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + #ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; +@@ -11329,7 +11338,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +@@ -11644,7 +11653,7 @@ static int __init net_dev_init(void) + + INIT_WORK(flush, flush_backlog); + +- skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + #ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c +index 8e582e29a41e..4fcbdd71c59f 100644 +--- a/net/core/gen_estimator.c ++++ b/net/core/gen_estimator.c +@@ -40,10 +40,10 @@ + */ + + struct net_rate_estimator { +- struct gnet_stats_basic_packed *bstats; ++ struct gnet_stats_basic_sync *bstats; + spinlock_t *stats_lock; +- seqcount_t *running; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; ++ bool running; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ + +@@ -60,13 +60,13 @@ struct net_rate_estimator { + }; + + static void est_fetch_counters(struct net_rate_estimator *e, +- struct gnet_stats_basic_packed *b) ++ struct gnet_stats_basic_sync 
*b) + { +- memset(b, 0, sizeof(*b)); ++ gnet_stats_basic_sync_init(b); + if (e->stats_lock) + spin_lock(e->stats_lock); + +- __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); ++ gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); + + if (e->stats_lock) + spin_unlock(e->stats_lock); +@@ -76,14 +76,18 @@ static void est_fetch_counters(struct net_rate_estimator *e, + static void est_timer(struct timer_list *t) + { + struct net_rate_estimator *est = from_timer(est, t, timer); +- struct gnet_stats_basic_packed b; ++ struct gnet_stats_basic_sync b; ++ u64 b_bytes, b_packets; + u64 rate, brate; + + est_fetch_counters(est, &b); +- brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); ++ b_bytes = u64_stats_read(&b.bytes); ++ b_packets = u64_stats_read(&b.packets); ++ ++ brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); + +- rate = (b.packets - est->last_packets) << (10 - est->intvl_log); ++ rate = (b_packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); + + write_seqcount_begin(&est->seq); +@@ -91,8 +95,8 @@ static void est_timer(struct timer_list *t) + est->avpps += rate; + write_seqcount_end(&est->seq); + +- est->last_bytes = b.bytes; +- est->last_packets = b.packets; ++ est->last_bytes = b_bytes; ++ est->last_packets = b_packets; + + est->next_jiffies += ((HZ/4) << est->intvl_log); + +@@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t) + * @cpu_bstats: bstats per cpu + * @rate_est: rate estimator statistics + * @lock: lock for statistics and control path +- * @running: qdisc running seqcount ++ * @running: true if @bstats represents a running qdisc, thus @bstats' ++ * internal values might change during basic reads. 
Only used ++ * if @bstats_cpu is NULL + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est +@@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t) + * Returns 0 on success or a negative error code. + * + */ +-int gen_new_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_new_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, ++ bool running, + struct nlattr *opt) + { + struct gnet_estimator *parm = nla_data(opt); + struct net_rate_estimator *old, *est; +- struct gnet_stats_basic_packed b; ++ struct gnet_stats_basic_sync b; + int intvl_log; + + if (nla_len(opt) < sizeof(*parm)) +@@ -164,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + est_fetch_counters(est, &b); + if (lock) + local_bh_enable(); +- est->last_bytes = b.bytes; +- est->last_packets = b.packets; ++ est->last_bytes = u64_stats_read(&b.bytes); ++ est->last_packets = u64_stats_read(&b.packets); + + if (lock) + spin_lock_bh(lock); +@@ -214,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator); + * @cpu_bstats: bstats per cpu + * @rate_est: rate estimator statistics + * @lock: lock for statistics and control path +- * @running: qdisc running seqcount (might be NULL) ++ * @running: true if @bstats represents a running qdisc, thus @bstats' ++ * internal values might change during basic reads. Only used ++ * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling +@@ -222,11 +230,11 @@ EXPORT_SYMBOL(gen_kill_estimator); + * + * Returns 0 on success or a negative error code. 
+ */ +-int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt) ++ bool running, struct nlattr *opt) + { + return gen_new_estimator(bstats, cpu_bstats, rate_est, + lock, running, opt); +diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c +index e491b083b348..a10335b4ba2d 100644 +--- a/net/core/gen_stats.c ++++ b/net/core/gen_stats.c +@@ -18,7 +18,7 @@ + #include + #include + #include +- ++#include + + static inline int + gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) +@@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + } + EXPORT_SYMBOL(gnet_stats_start_copy); + +-static void +-__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu) ++/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ ++void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) + { ++ u64_stats_set(&b->bytes, 0); ++ u64_stats_set(&b->packets, 0); ++ u64_stats_init(&b->syncp); ++} ++EXPORT_SYMBOL(gnet_stats_basic_sync_init); ++ ++static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu) ++{ ++ u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { +- struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); ++ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); +- bytes = bcpu->bstats.bytes; +- packets = bcpu->bstats.packets; ++ bytes = u64_stats_read(&bcpu->bytes); ++ packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, 
start)); + +- bstats->bytes += bytes; +- bstats->packets += packets; ++ t_bytes += bytes; ++ t_packets += packets; ++ } ++ _bstats_update(bstats, t_bytes, t_packets); ++} ++ ++void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running) ++{ ++ unsigned int start; ++ u64 bytes = 0; ++ u64 packets = 0; ++ ++ WARN_ON_ONCE((cpu || running) && in_hardirq()); ++ ++ if (cpu) { ++ gnet_stats_add_basic_cpu(bstats, cpu); ++ return; + } ++ do { ++ if (running) ++ start = u64_stats_fetch_begin_irq(&b->syncp); ++ bytes = u64_stats_read(&b->bytes); ++ packets = u64_stats_read(&b->packets); ++ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); ++ ++ _bstats_update(bstats, bytes, packets); + } ++EXPORT_SYMBOL(gnet_stats_add_basic); + +-void +-__gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b) ++static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running) + { +- unsigned int seq; ++ unsigned int start; + + if (cpu) { +- __gnet_stats_copy_basic_cpu(bstats, cpu); ++ u64 t_bytes = 0, t_packets = 0; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); ++ unsigned int start; ++ u64 bytes, packets; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&bcpu->syncp); ++ bytes = u64_stats_read(&bcpu->bytes); ++ packets = u64_stats_read(&bcpu->packets); ++ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); ++ ++ t_bytes += bytes; ++ t_packets += packets; ++ } ++ *ret_bytes = t_bytes; ++ *ret_packets = t_packets; + return; + } + do { + if (running) +- seq = read_seqcount_begin(running); +- bstats->bytes = b->bytes; +- bstats->packets = b->packets; +- } while (running && read_seqcount_retry(running, 
seq)); ++ start = u64_stats_fetch_begin_irq(&b->syncp); ++ *ret_bytes = u64_stats_read(&b->bytes); ++ *ret_packets = u64_stats_read(&b->packets); ++ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + } +-EXPORT_SYMBOL(__gnet_stats_copy_basic); + + static int +-___gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b, +- int type) ++___gnet_stats_copy_basic(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, ++ int type, bool running) + { +- struct gnet_stats_basic_packed bstats = {0}; ++ u64 bstats_bytes, bstats_packets; + +- __gnet_stats_copy_basic(running, &bstats, cpu, b); ++ gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); + + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { +- d->tc_stats.bytes = bstats.bytes; +- d->tc_stats.packets = bstats.packets; ++ d->tc_stats.bytes = bstats_bytes; ++ d->tc_stats.packets = bstats_packets; + } + + if (d->tail) { +@@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running, + int res; + + memset(&sb, 0, sizeof(sb)); +- sb.bytes = bstats.bytes; +- sb.packets = bstats.packets; ++ sb.bytes = bstats_bytes; ++ sb.packets = bstats_packets; + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); +- if (res < 0 || sb.packets == bstats.packets) ++ if (res < 0 || sb.packets == bstats_packets) + return res; + /* emit 64bit stats only if needed */ +- return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, +- sizeof(bstats.packets), TCA_STATS_PAD); ++ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, ++ sizeof(bstats_packets), TCA_STATS_PAD); + } + return 0; + } + + /** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV +- * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics ++ * @running: true if @b represents a running qdisc, 
thus @b's ++ * internal values might change during basic reads. ++ * Only used if @cpu is NULL ++ * ++ * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). +@@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running, + * if the room in the socket buffer was not sufficient. + */ + int +-gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b) ++gnet_stats_copy_basic(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, ++ bool running) + { +- return ___gnet_stats_copy_basic(running, d, cpu, b, +- TCA_STATS_BASIC); ++ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); + } + EXPORT_SYMBOL(gnet_stats_copy_basic); + + /** + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV +- * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics ++ * @running: true if @b represents a running qdisc, thus @b's ++ * internal values might change during basic reads. ++ * Only used if @cpu is NULL ++ * ++ * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). +@@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); + * if the room in the socket buffer was not sufficient. 
+ */ + int +-gnet_stats_copy_basic_hw(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b) ++gnet_stats_copy_basic_hw(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, ++ bool running) + { +- return ___gnet_stats_copy_basic(running, d, cpu, b, +- TCA_STATS_BASIC_HW); ++ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); + } + EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + +@@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, + } + EXPORT_SYMBOL(gnet_stats_copy_rate_est); + +-static void +-__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, +- const struct gnet_stats_queue __percpu *q) ++static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, ++ const struct gnet_stats_queue __percpu *q) + { + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + +- qstats->qlen = 0; ++ qstats->qlen += qcpu->backlog; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; +@@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + } + } + +-void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +- const struct gnet_stats_queue __percpu *cpu, +- const struct gnet_stats_queue *q, +- __u32 qlen) ++void gnet_stats_add_queue(struct gnet_stats_queue *qstats, ++ const struct gnet_stats_queue __percpu *cpu, ++ const struct gnet_stats_queue *q) + { + if (cpu) { +- __gnet_stats_copy_queue_cpu(qstats, cpu); ++ gnet_stats_add_queue_cpu(qstats, cpu); + } else { +- qstats->qlen = q->qlen; +- qstats->backlog = q->backlog; +- qstats->drops = q->drops; +- qstats->requeues = q->requeues; +- qstats->overlimits = q->overlimits; ++ qstats->qlen += q->qlen; ++ qstats->backlog += q->backlog; ++ qstats->drops += q->drops; ++ qstats->requeues += q->requeues; ++ qstats->overlimits += 
q->overlimits; + } +- +- qstats->qlen = qlen; + } +-EXPORT_SYMBOL(__gnet_stats_copy_queue); ++EXPORT_SYMBOL(gnet_stats_add_queue); + + /** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV +@@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, + { + struct gnet_stats_queue qstats = {0}; + +- __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); ++ gnet_stats_add_queue(&qstats, cpu_q, q); ++ qstats.qlen = qlen; + + if (d->compat_tc_stats) { + d->tc_stats.drops = qstats.drops; +diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c +index 0d5c422f8745..8aec1b529364 100644 +--- a/net/netfilter/xt_RATEEST.c ++++ b/net/netfilter/xt_RATEEST.c +@@ -94,11 +94,11 @@ static unsigned int + xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) + { + const struct xt_rateest_target_info *info = par->targinfo; +- struct gnet_stats_basic_packed *stats = &info->est->bstats; ++ struct gnet_stats_basic_sync *stats = &info->est->bstats; + + spin_lock_bh(&info->est->lock); +- stats->bytes += skb->len; +- stats->packets++; ++ u64_stats_add(&stats->bytes, skb->len); ++ u64_stats_inc(&stats->packets); + spin_unlock_bh(&info->est->lock); + + return XT_CONTINUE; +@@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) + if (!est) + goto err1; + ++ gnet_stats_basic_sync_init(&est->bstats); + strlcpy(est->name, info->name, sizeof(est->name)); + spin_lock_init(&est->lock); + est->refcnt = 1; +diff --git a/net/sched/act_api.c b/net/sched/act_api.c +index d775676956bf..94c05713ecf8 100644 +--- a/net/sched/act_api.c ++++ b/net/sched/act_api.c +@@ -486,16 +486,18 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + atomic_set(&p->tcfa_bindcnt, 1); + + if (cpustats) { +- p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); ++ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); + if (!p->cpu_bstats) + goto err1; +- p->cpu_bstats_hw = 
netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); ++ p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); + if (!p->cpu_bstats_hw) + goto err2; + p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!p->cpu_qstats) + goto err3; + } ++ gnet_stats_basic_sync_init(&p->tcfa_bstats); ++ gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); + spin_lock_init(&p->tcfa_lock); + p->tcfa_index = index; + p->tcfa_tm.install = jiffies; +@@ -505,7 +507,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + if (est) { + err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, + &p->tcfa_rate_est, +- &p->tcfa_lock, NULL, est); ++ &p->tcfa_lock, false, est); + if (err) + goto err4; + } +@@ -1141,13 +1143,13 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, bool hw) + { + if (a->cpu_bstats) { +- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); ++ _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + this_cpu_ptr(a->cpu_qstats)->drops += drops; + + if (hw) +- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), +- bytes, packets); ++ _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), ++ bytes, packets); + return; + } + +@@ -1186,9 +1188,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, + if (err < 0) + goto errout; + +- if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || +- gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, +- &p->tcfa_bstats_hw) < 0 || ++ if (gnet_stats_copy_basic(&d, p->cpu_bstats, ++ &p->tcfa_bstats, false) < 0 || ++ gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, ++ &p->tcfa_bstats_hw, false) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || + gnet_stats_copy_queue(&d, p->cpu_qstats, + &p->tcfa_qstats, +diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c +index 2a05bad56ef3..a77d8908e737 100644 +--- a/net/sched/act_bpf.c ++++ b/net/sched/act_bpf.c +@@ -41,7 +41,7 @@ static int 
tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, + int action, filter_res; + + tcf_lastuse_update(&prog->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); + + filter = rcu_dereference(prog->filter); + if (at_ingress) { +diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c +index ec987ec75807..41ba55e60b1b 100644 +--- a/net/sched/act_ife.c ++++ b/net/sched/act_ife.c +@@ -718,7 +718,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, + u8 *tlv_data; + u16 metalen; + +- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + tcf_lastuse_update(&ife->tcf_tm); + + if (skb_at_tc_ingress(skb)) +@@ -806,7 +806,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, + exceed_mtu = true; + } + +- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + tcf_lastuse_update(&ife->tcf_tm); + + if (!metalen) { /* no metadata to send */ +diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c +index d010c5b8e83b..d39b74331c26 100644 +--- a/net/sched/act_mpls.c ++++ b/net/sched/act_mpls.c +@@ -59,7 +59,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, + int ret, mac_len; + + tcf_lastuse_update(&m->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); + + /* Ensure 'data' points at mac_header prior calling mpls manipulating + * functions. 
+diff --git a/net/sched/act_police.c b/net/sched/act_police.c +index db1d021c16be..d4ac56e4579c 100644 +--- a/net/sched/act_police.c ++++ b/net/sched/act_police.c +@@ -125,7 +125,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, + police->common.cpu_bstats, + &police->tcf_rate_est, + &police->tcf_lock, +- NULL, est); ++ false, est); + if (err) + goto failure; + } else if (tb[TCA_POLICE_AVRATE] && +@@ -262,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, + int ret; + + tcf_lastuse_update(&police->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); + + ret = READ_ONCE(police->tcf_action); + p = rcu_dereference_bh(police->params); +diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c +index ca67d9644917..ef35df94182f 100644 +--- a/net/sched/act_sample.c ++++ b/net/sched/act_sample.c +@@ -170,7 +170,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, + int retval; + + tcf_lastuse_update(&s->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); + retval = READ_ONCE(s->tcf_action); + + psample_group = rcu_dereference_bh(s->psample_group); +diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c +index 788527154025..8c1d60bde93e 100644 +--- a/net/sched/act_simple.c ++++ b/net/sched/act_simple.c +@@ -36,7 +36,8 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, + * then it would look like "hello_3" (without quotes) + */ + pr_info("simple: %s_%llu\n", +- (char *)d->tcfd_defdata, d->tcf_bstats.packets); ++ (char *)d->tcfd_defdata, ++ u64_stats_read(&d->tcf_bstats.packets)); + spin_unlock(&d->tcf_lock); + return d->tcf_action; + } +diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c +index 6088ceaf582e..f6df717b9f17 100644 +--- a/net/sched/act_skbedit.c ++++ b/net/sched/act_skbedit.c 
+@@ -31,7 +31,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, + int action; + + tcf_lastuse_update(&d->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + params = rcu_dereference_bh(d->params); + action = READ_ONCE(d->tcf_action); +diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c +index ee9cc0abf9e1..2083612d8780 100644 +--- a/net/sched/act_skbmod.c ++++ b/net/sched/act_skbmod.c +@@ -31,7 +31,7 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, + u64 flags; + + tcf_lastuse_update(&d->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + action = READ_ONCE(d->tcf_action); + if (unlikely(action == TC_ACT_SHOT)) +diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c +index 328db5e1b0ea..c910046bbe4f 100644 +--- a/net/sched/sch_api.c ++++ b/net/sched/sch_api.c +@@ -884,7 +884,7 @@ static void qdisc_offload_graft_root(struct net_device *dev, + static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + u32 portid, u32 seq, u16 flags, int event) + { +- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; + struct tcmsg *tcm; + struct nlmsghdr *nlh; +@@ -942,8 +942,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + cpu_qstats = q->cpu_qstats; + } + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), +- &d, cpu_bstats, &q->bstats) < 0 || ++ if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || + gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || + gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) + goto nla_put_failure; +@@ -1275,26 +1274,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev, + rcu_assign_pointer(sch->stab, stab); + } + if 
(tca[TCA_RATE]) { +- seqcount_t *running; +- + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) { + NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); + goto err_out4; + } + +- if (sch->parent != TC_H_ROOT && +- !(sch->flags & TCQ_F_INGRESS) && +- (!p || !(p->flags & TCQ_F_MQROOT))) +- running = qdisc_root_sleeping_running(sch); +- else +- running = &sch->running; +- + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + NULL, +- running, ++ true, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); +@@ -1370,7 +1360,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, + sch->cpu_bstats, + &sch->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + } + out: +diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c +index 33737169cc2d..28e1897e0da7 100644 +--- a/net/sched/sch_atm.c ++++ b/net/sched/sch_atm.c +@@ -52,7 +52,7 @@ struct atm_flow_data { + struct atm_qdisc_data *parent; /* parent qdisc */ + struct socket *sock; /* for closing */ + int ref; /* reference count */ +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct list_head list; + struct atm_flow_data *excess; /* flow for excess traffic; +@@ -551,6 +551,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, + pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + INIT_LIST_HEAD(&p->flows); + INIT_LIST_HEAD(&p->link.list); ++ gnet_stats_basic_sync_init(&p->link.bstats); + list_add(&p->link.list, &p->flows); + p->link.q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, sch->handle, extack); +@@ -654,8 +655,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + { + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &flow->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, 
NULL, &flow->bstats, true) < 0 || + gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) + return -1; + +diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c +index 46b3dd71777d..c3a74a2266b0 100644 +--- a/net/sched/sch_cbq.c ++++ b/net/sched/sch_cbq.c +@@ -116,7 +116,7 @@ struct cbq_class { + long avgidle; + long deficit; /* Saved deficit for WRR */ + psched_time_t penalized; +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct tc_cbq_xstats xstats; +@@ -565,8 +565,7 @@ cbq_update(struct cbq_sched_data *q) + long avgidle = cl->avgidle; + long idle; + +- cl->bstats.packets++; +- cl->bstats.bytes += len; ++ _bstats_update(&cl->bstats, len, 1); + + /* + * (now - last) is total time between packet right edges. +@@ -1383,8 +1382,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + if (cl->undertime != PSCHED_PASTPERFECT) + cl->xstats.undertime = cl->undertime - q->now; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) + return -1; +@@ -1518,7 +1516,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); +@@ -1610,6 +1608,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t + if (cl == NULL) + goto failure; + ++ gnet_stats_basic_sync_init(&cl->bstats); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); + if (err) { + kfree(cl); +@@ -1618,9 +1617,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, 
struct nlattr **t + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), +- tca[TCA_RATE]); ++ NULL, true, tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); + tcf_block_put(cl->block); +diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c +index 80a88e208d2b..4e5b1cf11b85 100644 +--- a/net/sched/sch_drr.c ++++ b/net/sched/sch_drr.c +@@ -19,7 +19,7 @@ struct drr_class { + struct Qdisc_class_common common; + unsigned int filter_cnt; + +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct list_head alist; +@@ -85,8 +85,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), ++ NULL, true, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace estimator"); +@@ -106,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + if (cl == NULL) + return -ENOBUFS; + ++ gnet_stats_basic_sync_init(&cl->bstats); + cl->common.classid = classid; + cl->quantum = quantum; + cl->qdisc = qdisc_create_dflt(sch->dev_queue, +@@ -118,9 +118,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), +- tca[TCA_RATE]); ++ NULL, true, tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace estimator"); + qdisc_put(cl->qdisc); +@@ -267,8 +265,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, + if (qlen) + xstats.deficit = cl->deficit; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, 
&cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) + return -1; +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index 175e07b3d25c..8de4365886e8 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -41,7 +41,7 @@ struct ets_class { + struct Qdisc *qdisc; + u32 quantum; + u32 deficit; +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + }; + +@@ -325,8 +325,7 @@ static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct Qdisc *cl_q = cl->qdisc; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl_q->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + +@@ -661,7 +660,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + + q->nbands = nbands; + for (i = nstrict; i < q->nstrict; i++) { +- INIT_LIST_HEAD(&q->classes[i].alist); + if (q->classes[i].qdisc->q.qlen) { + list_add_tail(&q->classes[i].alist, &q->active); + q->classes[i].deficit = quanta[i]; +@@ -689,7 +687,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + ets_offload_change(sch); + for (i = q->nbands; i < oldbands; i++) { + qdisc_put(q->classes[i].qdisc); +- memset(&q->classes[i], 0, sizeof(q->classes[i])); ++ q->classes[i].qdisc = NULL; ++ q->classes[i].quantum = 0; ++ q->classes[i].deficit = 0; ++ gnet_stats_basic_sync_init(&q->classes[i].bstats); ++ memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); + } + return 0; + } +@@ -698,7 +700,7 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) + { + struct ets_sched *q = qdisc_priv(sch); +- int err; ++ int err, i; + + if (!opt) + return -EINVAL; +@@ -708,6 +710,9 @@ static int ets_qdisc_init(struct 
Qdisc *sch, struct nlattr *opt, + return err; + + INIT_LIST_HEAD(&q->active); ++ for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) ++ INIT_LIST_HEAD(&q->classes[i].alist); ++ + return ets_qdisc_change(sch, opt, extack); + } + +diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c +index 02299785209c..b979ae2f551c 100644 +--- a/net/sched/sch_generic.c ++++ b/net/sched/sch_generic.c +@@ -304,8 +304,8 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, + + /* + * Transmit possibly several skbs, and handle the return status as +- * required. Owning running seqcount bit guarantees that +- * only one CPU can execute this function. ++ * required. Owning qdisc running bit guarantees that only one CPU ++ * can execute this function. + * + * Returns to the caller: + * false - hardware queue frozen backoff +@@ -606,7 +606,6 @@ struct Qdisc noop_qdisc = { + .ops = &noop_qdisc_ops, + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, +- .running = SEQCNT_ZERO(noop_qdisc.running), + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), + .gso_skb = { + .next = (struct sk_buff *)&noop_qdisc.gso_skb, +@@ -867,7 +866,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { + EXPORT_SYMBOL(pfifo_fast_ops); + + static struct lock_class_key qdisc_tx_busylock; +-static struct lock_class_key qdisc_running_key; + + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + const struct Qdisc_ops *ops, +@@ -892,11 +890,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + __skb_queue_head_init(&sch->gso_skb); + __skb_queue_head_init(&sch->skb_bad_txq); + qdisc_skb_head_init(&sch->q); ++ gnet_stats_basic_sync_init(&sch->bstats); + spin_lock_init(&sch->q.lock); + + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = +- netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); ++ netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); + if (!sch->cpu_bstats) + goto errout1; + +@@ -916,10 +915,6 @@ struct Qdisc 
*qdisc_alloc(struct netdev_queue *dev_queue, + lockdep_set_class(&sch->seqlock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + +- seqcount_init(&sch->running); +- lockdep_set_class(&sch->running, +- dev->qdisc_running_key ?: &qdisc_running_key); +- + sch->ops = ops; + sch->flags = ops->static_flags; + sch->enqueue = ops->enqueue; +diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c +index 621dc6afde8f..1073c76d05c4 100644 +--- a/net/sched/sch_gred.c ++++ b/net/sched/sch_gred.c +@@ -56,6 +56,7 @@ struct gred_sched { + u32 DPs; + u32 def; + struct red_vars wred_set; ++ struct tc_gred_qopt_offload *opt; + }; + + static inline int gred_wred_mode(struct gred_sched *table) +@@ -311,48 +312,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) + { + struct gred_sched *table = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); +- struct tc_gred_qopt_offload opt = { +- .command = command, +- .handle = sch->handle, +- .parent = sch->parent, +- }; ++ struct tc_gred_qopt_offload *opt = table->opt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + ++ memset(opt, 0, sizeof(*opt)); ++ opt->command = command; ++ opt->handle = sch->handle; ++ opt->parent = sch->parent; ++ + if (command == TC_GRED_REPLACE) { + unsigned int i; + +- opt.set.grio_on = gred_rio_mode(table); +- opt.set.wred_on = gred_wred_mode(table); +- opt.set.dp_cnt = table->DPs; +- opt.set.dp_def = table->def; ++ opt->set.grio_on = gred_rio_mode(table); ++ opt->set.wred_on = gred_wred_mode(table); ++ opt->set.dp_cnt = table->DPs; ++ opt->set.dp_def = table->def; + + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + if (!q) + continue; +- opt.set.tab[i].present = true; +- opt.set.tab[i].limit = q->limit; +- opt.set.tab[i].prio = q->prio; +- opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; +- opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; +- opt.set.tab[i].is_ecn = gred_use_ecn(q); +- 
opt.set.tab[i].is_harddrop = gred_use_harddrop(q); +- opt.set.tab[i].probability = q->parms.max_P; +- opt.set.tab[i].backlog = &q->backlog; ++ opt->set.tab[i].present = true; ++ opt->set.tab[i].limit = q->limit; ++ opt->set.tab[i].prio = q->prio; ++ opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; ++ opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; ++ opt->set.tab[i].is_ecn = gred_use_ecn(q); ++ opt->set.tab[i].is_harddrop = gred_use_harddrop(q); ++ opt->set.tab[i].probability = q->parms.max_P; ++ opt->set.tab[i].backlog = &q->backlog; + } +- opt.set.qstats = &sch->qstats; ++ opt->set.qstats = &sch->qstats; + } + +- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); ++ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); + } + + static int gred_offload_dump_stats(struct Qdisc *sch) + { + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_qopt_offload *hw_stats; ++ u64 bytes = 0, packets = 0; + unsigned int i; + int ret; + +@@ -364,9 +367,11 @@ static int gred_offload_dump_stats(struct Qdisc *sch) + hw_stats->handle = sch->handle; + hw_stats->parent = sch->parent; + +- for (i = 0; i < MAX_DPs; i++) ++ for (i = 0; i < MAX_DPs; i++) { ++ gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); + if (table->tab[i]) + hw_stats->stats.xstats[i] = &table->tab[i]->stats; ++ } + + ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); + /* Even if driver returns failure adjust the stats - in case offload +@@ -375,19 +380,19 @@ static int gred_offload_dump_stats(struct Qdisc *sch) + for (i = 0; i < MAX_DPs; i++) { + if (!table->tab[i]) + continue; +- table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; +- table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; ++ table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); ++ table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); + table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; + +- 
_bstats_update(&sch->bstats, +- hw_stats->stats.bstats[i].bytes, +- hw_stats->stats.bstats[i].packets); ++ bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); ++ packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); + sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; + sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; + sch->qstats.drops += hw_stats->stats.qstats[i].drops; + sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; + sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; + } ++ _bstats_update(&sch->bstats, bytes, packets); + + kfree(hw_stats); + return ret; +@@ -728,6 +733,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, + static int gred_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) + { ++ struct gred_sched *table = qdisc_priv(sch); + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err; + +@@ -751,6 +757,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, + sch->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); + ++ if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { ++ table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); ++ if (!table->opt) ++ return -ENOMEM; ++ } ++ + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); + } + +@@ -907,6 +919,7 @@ static void gred_destroy(struct Qdisc *sch) + gred_destroy_vq(table->tab[i]); + } + gred_offload(sch, TC_GRED_DESTROY); ++ kfree(table->opt); + } + + static struct Qdisc_ops gred_qdisc_ops __read_mostly = { +diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c +index c802a027b4f3..03efc40e42fc 100644 +--- a/net/sched/sch_hfsc.c ++++ b/net/sched/sch_hfsc.c +@@ -111,7 +111,7 @@ enum hfsc_class_flags { + struct hfsc_class { + struct Qdisc_class_common cl_common; + +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct tcf_proto __rcu *filter_list; /* filter list */ +@@ 
-965,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + return err; +@@ -1033,9 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), +- tca[TCA_RATE]); ++ NULL, true, tca[TCA_RATE]); + if (err) { + tcf_block_put(cl->block); + kfree(cl); +@@ -1328,7 +1326,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + xstats.work = cl->cl_total; + xstats.rtwork = cl->cl_cumul; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) + return -1; +@@ -1406,6 +1404,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, + if (err) + return err; + ++ gnet_stats_basic_sync_init(&q->root.bstats); + q->root.cl_common.classid = sch->handle; + q->root.sched = q; + q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, +diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c +index 8ce999e4ca32..a90e4fffdfd9 100644 +--- a/net/sched/sch_htb.c ++++ b/net/sched/sch_htb.c +@@ -113,8 +113,8 @@ struct htb_class { + /* + * Written often fields + */ +- struct gnet_stats_basic_packed bstats; +- struct gnet_stats_basic_packed bstats_bias; ++ struct gnet_stats_basic_sync bstats; ++ struct gnet_stats_basic_sync bstats_bias; + struct tc_htb_xstats xstats; /* our special stats */ + + /* token bucket parameters */ +@@ -1309,10 +1309,11 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, + static void htb_offload_aggregate_stats(struct htb_sched *q, + struct htb_class *cl) + { ++ u64 bytes = 0, packets = 0; + struct htb_class 
*c; + unsigned int i; + +- memset(&cl->bstats, 0, sizeof(cl->bstats)); ++ gnet_stats_basic_sync_init(&cl->bstats); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { +@@ -1324,14 +1325,15 @@ static void htb_offload_aggregate_stats(struct htb_sched *q, + if (p != cl) + continue; + +- cl->bstats.bytes += c->bstats_bias.bytes; +- cl->bstats.packets += c->bstats_bias.packets; ++ bytes += u64_stats_read(&c->bstats_bias.bytes); ++ packets += u64_stats_read(&c->bstats_bias.packets); + if (c->level == 0) { +- cl->bstats.bytes += c->leaf.q->bstats.bytes; +- cl->bstats.packets += c->leaf.q->bstats.packets; ++ bytes += u64_stats_read(&c->leaf.q->bstats.bytes); ++ packets += u64_stats_read(&c->leaf.q->bstats.packets); + } + } + } ++ _bstats_update(&cl->bstats, bytes, packets); + } + + static int +@@ -1358,16 +1360,16 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) + if (cl->leaf.q) + cl->bstats = cl->leaf.q->bstats; + else +- memset(&cl->bstats, 0, sizeof(cl->bstats)); +- cl->bstats.bytes += cl->bstats_bias.bytes; +- cl->bstats.packets += cl->bstats_bias.packets; ++ gnet_stats_basic_sync_init(&cl->bstats); ++ _bstats_update(&cl->bstats, ++ u64_stats_read(&cl->bstats_bias.bytes), ++ u64_stats_read(&cl->bstats_bias.packets)); + } else { + htb_offload_aggregate_stats(q, cl); + } + } + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) + return -1; +@@ -1582,8 +1584,9 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, + } + + if (cl->parent) { +- cl->parent->bstats_bias.bytes += q->bstats.bytes; +- cl->parent->bstats_bias.packets += q->bstats.packets; ++ _bstats_update(&cl->parent->bstats_bias, ++ u64_stats_read(&q->bstats.bytes), ++ 
u64_stats_read(&q->bstats.packets)); + } + + offload_opt = (struct tc_htb_qopt_offload) { +@@ -1875,6 +1878,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + if (!cl) + goto failure; + ++ gnet_stats_basic_sync_init(&cl->bstats); ++ gnet_stats_basic_sync_init(&cl->bstats_bias); ++ + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); + if (err) { + kfree(cl); +@@ -1884,7 +1890,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE] ? : &est.nla); + if (err) + goto err_block_put; +@@ -1948,8 +1954,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + htb_graft_helper(dev_queue, old_q); + goto err_kill_estimator; + } +- parent->bstats_bias.bytes += old_q->bstats.bytes; +- parent->bstats_bias.packets += old_q->bstats.packets; ++ _bstats_update(&parent->bstats_bias, ++ u64_stats_read(&old_q->bstats.bytes), ++ u64_stats_read(&old_q->bstats.packets)); + qdisc_put(old_q); + } + new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, +@@ -2009,7 +2016,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + return err; +diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c +index db18d8a860f9..24c5d97d88dd 100644 +--- a/net/sched/sch_mq.c ++++ b/net/sched/sch_mq.c +@@ -153,10 +153,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int ntx; +- __u32 qlen = 0; + + sch->q.qlen = 0; +- memset(&sch->bstats, 0, sizeof(sch->bstats)); ++ gnet_stats_basic_sync_init(&sch->bstats); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + /* MQ supports lockless qdiscs. 
However, statistics accounting needs +@@ -168,25 +167,11 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + +- if (qdisc_is_percpu_stats(qdisc)) { +- qlen = qdisc_qlen_sum(qdisc); +- __gnet_stats_copy_basic(NULL, &sch->bstats, +- qdisc->cpu_bstats, +- &qdisc->bstats); +- __gnet_stats_copy_queue(&sch->qstats, +- qdisc->cpu_qstats, +- &qdisc->qstats, qlen); +- sch->q.qlen += qlen; +- } else { +- sch->q.qlen += qdisc->q.qlen; +- sch->bstats.bytes += qdisc->bstats.bytes; +- sch->bstats.packets += qdisc->bstats.packets; +- sch->qstats.qlen += qdisc->qstats.qlen; +- sch->qstats.backlog += qdisc->qstats.backlog; +- sch->qstats.drops += qdisc->qstats.drops; +- sch->qstats.requeues += qdisc->qstats.requeues; +- sch->qstats.overlimits += qdisc->qstats.overlimits; +- } ++ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, ++ &qdisc->bstats, false); ++ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, ++ &qdisc->qstats); ++ sch->q.qlen += qdisc_qlen(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + } +@@ -269,8 +254,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; +- if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, +- &sch->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || + qdisc_qstats_copy(d, sch) < 0) + return -1; + return 0; +diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c +index 50e15add6068..42d4101e4f3d 100644 +--- a/net/sched/sch_mqprio.c ++++ b/net/sched/sch_mqprio.c +@@ -412,7 +412,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) + unsigned int ntx, tc; + + sch->q.qlen = 0; +- memset(&sch->bstats, 0, sizeof(sch->bstats)); ++ gnet_stats_basic_sync_init(&sch->bstats); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + /* MQ supports lockless qdiscs. 
However, statistics accounting needs +@@ -424,25 +424,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + +- if (qdisc_is_percpu_stats(qdisc)) { +- __u32 qlen = qdisc_qlen_sum(qdisc); +- +- __gnet_stats_copy_basic(NULL, &sch->bstats, +- qdisc->cpu_bstats, +- &qdisc->bstats); +- __gnet_stats_copy_queue(&sch->qstats, +- qdisc->cpu_qstats, +- &qdisc->qstats, qlen); +- sch->q.qlen += qlen; +- } else { +- sch->q.qlen += qdisc->q.qlen; +- sch->bstats.bytes += qdisc->bstats.bytes; +- sch->bstats.packets += qdisc->bstats.packets; +- sch->qstats.backlog += qdisc->qstats.backlog; +- sch->qstats.drops += qdisc->qstats.drops; +- sch->qstats.requeues += qdisc->qstats.requeues; +- sch->qstats.overlimits += qdisc->qstats.overlimits; +- } ++ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, ++ &qdisc->bstats, false); ++ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, ++ &qdisc->qstats); ++ sch->q.qlen += qdisc_qlen(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + } +@@ -534,12 +520,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + { + if (cl >= TC_H_MIN_PRIORITY) { + int i; +- __u32 qlen = 0; ++ __u32 qlen; + struct gnet_stats_queue qstats = {0}; +- struct gnet_stats_basic_packed bstats = {0}; ++ struct gnet_stats_basic_sync bstats; + struct net_device *dev = qdisc_dev(sch); + struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; + ++ gnet_stats_basic_sync_init(&bstats); + /* Drop lock here it will be reclaimed before touching + * statistics this is required because the d->lock we + * hold here is the look on dev_queue->qdisc_sleeping +@@ -554,40 +541,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + + spin_lock_bh(qdisc_lock(qdisc)); + +- if (qdisc_is_percpu_stats(qdisc)) { +- qlen = qdisc_qlen_sum(qdisc); +- +- __gnet_stats_copy_basic(NULL, &bstats, +- qdisc->cpu_bstats, +- &qdisc->bstats); +- 
__gnet_stats_copy_queue(&qstats, +- qdisc->cpu_qstats, +- &qdisc->qstats, +- qlen); +- } else { +- qlen += qdisc->q.qlen; +- bstats.bytes += qdisc->bstats.bytes; +- bstats.packets += qdisc->bstats.packets; +- qstats.backlog += qdisc->qstats.backlog; +- qstats.drops += qdisc->qstats.drops; +- qstats.requeues += qdisc->qstats.requeues; +- qstats.overlimits += qdisc->qstats.overlimits; +- } ++ gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, ++ &qdisc->bstats, false); ++ gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, ++ &qdisc->qstats); ++ sch->q.qlen += qdisc_qlen(qdisc); ++ + spin_unlock_bh(qdisc_lock(qdisc)); + } ++ qlen = qdisc_qlen(sch) + qstats.qlen; + + /* Reclaim root sleeping lock before completing stats */ + if (d->lock) + spin_lock_bh(d->lock); +- if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || + gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) + return -1; + } else { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, +- sch->cpu_bstats, &sch->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, sch->cpu_bstats, ++ &sch->bstats, true) < 0 || + qdisc_qstats_copy(d, sch) < 0) + return -1; + } +diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c +index 8b99f07aa3a7..f28050c7f12d 100644 +--- a/net/sched/sch_multiq.c ++++ b/net/sched/sch_multiq.c +@@ -337,8 +337,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + +diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c +index 2e0b1e7f5466..c03a11dd990f 100644 +--- a/net/sched/sch_prio.c ++++ b/net/sched/sch_prio.c 
+@@ -359,8 +359,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, ++ &cl_q->bstats, true) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + +diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c +index 4c51aeb78f14..e591c3547b12 100644 +--- a/net/sched/sch_qfq.c ++++ b/net/sched/sch_qfq.c +@@ -131,7 +131,7 @@ struct qfq_class { + + unsigned int filter_cnt; + +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct Qdisc *qdisc; +@@ -452,7 +452,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + return err; +@@ -466,6 +466,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + if (cl == NULL) + return -ENOBUFS; + ++ gnet_stats_basic_sync_init(&cl->bstats); + cl->common.classid = classid; + cl->deficit = lmax; + +@@ -478,7 +479,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + goto destroy_class; +@@ -640,8 +641,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + qdisc_qstats_copy(d, cl->qdisc) < 0) + return -1; +@@ -1235,8 +1235,7 @@ static 
int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + return err; + } + +- cl->bstats.bytes += len; +- cl->bstats.packets += gso_segs; ++ _bstats_update(&cl->bstats, len, gso_segs); + sch->qstats.backlog += len; + ++sch->q.qlen; + +diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c +index e203deacc953..30497d446af5 100644 +--- a/net/sched/sch_taprio.c ++++ b/net/sched/sch_taprio.c +@@ -1987,7 +1987,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; +- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || + qdisc_qstats_copy(d, sch) < 0) + return -1; + return 0; +diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c +index 5ff8f902f14d..2ea5c3f18fd4 100644 +--- a/net/sunrpc/svc_xprt.c ++++ b/net/sunrpc/svc_xprt.c +@@ -441,7 +441,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + return; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + + atomic_long_inc(&pool->sp_stats.packets); +@@ -465,7 +465,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + rqstp = NULL; + out_unlock: + rcu_read_unlock(); +- put_cpu(); ++ put_cpu_light(); + trace_svc_xprt_do_enqueue(xprt, rqstp); + } + EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); +diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c +index 5a90aa527877..642d0748c169 100644 +--- a/samples/kfifo/bytestream-example.c ++++ b/samples/kfifo/bytestream-example.c +@@ -22,10 +22,10 @@ + #define PROC_FIFO "bytestream-fifo" + + /* lock for procfs read access */ +-static DEFINE_MUTEX(read_lock); ++static DEFINE_MUTEX(read_access); + + /* lock for procfs write access */ +-static DEFINE_MUTEX(write_lock); ++static DEFINE_MUTEX(write_access); + + /* + * define DYNAMIC in this example for 
a dynamically allocated fifo. +@@ -116,12 +116,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&write_lock)) ++ if (mutex_lock_interruptible(&write_access)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + +- mutex_unlock(&write_lock); ++ mutex_unlock(&write_access); + if (ret) + return ret; + +@@ -134,12 +134,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&read_lock)) ++ if (mutex_lock_interruptible(&read_access)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + +- mutex_unlock(&read_lock); ++ mutex_unlock(&read_access); + if (ret) + return ret; + +diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c +index e5403d8c971a..c61482ba94f4 100644 +--- a/samples/kfifo/inttype-example.c ++++ b/samples/kfifo/inttype-example.c +@@ -22,10 +22,10 @@ + #define PROC_FIFO "int-fifo" + + /* lock for procfs read access */ +-static DEFINE_MUTEX(read_lock); ++static DEFINE_MUTEX(read_access); + + /* lock for procfs write access */ +-static DEFINE_MUTEX(write_lock); ++static DEFINE_MUTEX(write_access); + + /* + * define DYNAMIC in this example for a dynamically allocated fifo. 
+@@ -109,12 +109,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&write_lock)) ++ if (mutex_lock_interruptible(&write_access)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + +- mutex_unlock(&write_lock); ++ mutex_unlock(&write_access); + if (ret) + return ret; + +@@ -127,12 +127,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&read_lock)) ++ if (mutex_lock_interruptible(&read_access)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + +- mutex_unlock(&read_lock); ++ mutex_unlock(&read_access); + if (ret) + return ret; + +diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c +index f64f3d62d6c2..e4087b2d3fc4 100644 +--- a/samples/kfifo/record-example.c ++++ b/samples/kfifo/record-example.c +@@ -22,10 +22,10 @@ + #define PROC_FIFO "record-fifo" + + /* lock for procfs read access */ +-static DEFINE_MUTEX(read_lock); ++static DEFINE_MUTEX(read_access); + + /* lock for procfs write access */ +-static DEFINE_MUTEX(write_lock); ++static DEFINE_MUTEX(write_access); + + /* + * define DYNAMIC in this example for a dynamically allocated fifo. 
+@@ -123,12 +123,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&write_lock)) ++ if (mutex_lock_interruptible(&write_access)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + +- mutex_unlock(&write_lock); ++ mutex_unlock(&write_access); + if (ret) + return ret; + +@@ -141,12 +141,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&read_lock)) ++ if (mutex_lock_interruptible(&read_access)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + +- mutex_unlock(&read_lock); ++ mutex_unlock(&read_access); + if (ret) + return ret; + +diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c +index 3f3f56f6be4d..5dbcdc5b22b5 100644 +--- a/security/smack/smack_lsm.c ++++ b/security/smack/smack_lsm.c +@@ -51,8 +51,10 @@ + #define SMK_RECEIVING 1 + #define SMK_SENDING 2 + ++#ifdef SMACK_IPV6_PORT_LABELING + static DEFINE_MUTEX(smack_ipv6_lock); + static LIST_HEAD(smk_ipv6_port_list); ++#endif + struct kmem_cache *smack_rule_cache; + int smack_enabled __initdata; + +@@ -2603,7 +2605,6 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) + mutex_unlock(&smack_ipv6_lock); + return; + } +-#endif + + /** + * smk_ipv6_port_check - check Smack port access +@@ -2666,6 +2667,7 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, + + return smk_ipv6_check(skp, object, address, act); + } ++#endif + + /** + * smack_inode_setsecurity - set smack xattrs +@@ -2852,8 +2854,9 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, + rc = smk_ipv6_check(ssp->smk_out, rsp, sip, + SMK_CONNECTING); + } +- if (__is_defined(SMACK_IPV6_PORT_LABELING)) +- rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); ++#ifdef SMACK_IPV6_PORT_LABELING ++ rc = 
smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); ++#endif + + return rc; + } +diff --git a/sound/soc/mediatek/common/mtk-afe-fe-dai.c b/sound/soc/mediatek/common/mtk-afe-fe-dai.c +index e95c7c018e7d..4f2c2379531b 100644 +--- a/sound/soc/mediatek/common/mtk-afe-fe-dai.c ++++ b/sound/soc/mediatek/common/mtk-afe-fe-dai.c +@@ -288,7 +288,6 @@ const struct snd_soc_dai_ops mtk_afe_fe_ops = { + }; + EXPORT_SYMBOL_GPL(mtk_afe_fe_ops); + +-static DEFINE_MUTEX(irqs_lock); + int mtk_dynamic_irq_acquire(struct mtk_base_afe *afe) + { + int i; diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb b/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb index cc65d7536..21efc5450 100644 --- a/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb @@ -7,6 +7,24 @@ SRCBRANCH:stm32mpcommon = "v5.15.118/stm/master" SRCREV = "${AUTOREV}" SRCREV:stm32mpcommon = "${AUTOREV}" +STM_RT_PATCHES = " \ + file://patch-5.15.119-rt65.patch \ + file://0023-5.15-stm32mp-rt-49-r1-CLOCK.patch \ + file://0024-5.15-stm32mp-rt-49-r1-DMA.patch \ + file://0025-5.15-stm32mp-rt-49-r1-MFD.patch \ + file://0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch \ + file://0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch \ + file://0028-5.15-stm32mp-rt-49-r1-CONFIG.patch \ +" + +SRC_URI:append:stm32mpcommon = " \ + ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${STM_RT_PATCHES}', '', d)} \ +" + +KERNEL_CONFIG_FRAGMENTS:append:stm32mpcommon = " ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${S}/arch/arm/configs/fragment-07-rt.config', '', d)}" +KERNEL_CONFIG_FRAGMENTS:append:stm32mpcommon = " ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${S}/arch/arm/configs/fragment-07-rt-sysvinit.config', '', d)}" +KERNEL_CONFIG_FRAGMENTS:append:ccmp13 = " ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${S}/arch/arm/configs/fragment-08-rt-mp13.config', '', d)}" + do_assemble_fitimage:append:ccmp1() { # # Step 9: Add public keys to the different U-Boot dtb files