From cc8c038401856ba16e7f246c5e505625b5636518 Mon Sep 17 00:00:00 2001 From: Mike Engel Date: Wed, 28 Feb 2024 16:10:59 +0100 Subject: [PATCH] linux-dey-5.15: add RT functionality to CCMP1 This commit adds RT functionality to CCMP1. The patches have been extracted from the STM RT expansion package and include the mainline RT patches, the STM RT driver patches and the RT kernel defconfig changes. https://onedigi.atlassian.net/browse/DEL-8880 Signed-off-by: Mike Engel --- .../recipes-kernel/linux/linux-dey.inc | 14 + .../0023-5.15-stm32mp-rt-49-r1-CLOCK.patch | 26 + .../0024-5.15-stm32mp-rt-49-r1-DMA.patch | 131 + .../0025-5.15-stm32mp-rt-49-r1-MFD.patch | 27 + .../0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch | 64 + ...027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch | 25 + .../0028-5.15-stm32mp-rt-49-r1-CONFIG.patch | 82 + .../linux-dey/ccmp1/patch-5.15.119-rt65.patch | 10968 ++++++++++++++++ .../recipes-kernel/linux/linux-dey_5.15.bb | 18 + 9 files changed, 11355 insertions(+) create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch create mode 100644 meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey.inc b/meta-digi-arm/recipes-kernel/linux/linux-dey.inc index d9c4603c1..87f6ec74c 100644 --- a/meta-digi-arm/recipes-kernel/linux/linux-dey.inc +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey.inc @@ -71,6 
+71,20 @@ do_configure:append() { if [ -n "${@' '.join(find_cfgs(d))}" ]; then ${S}/scripts/kconfig/merge_config.sh -m -O ${B} ${B}/.config ${@" ".join(find_cfgs(d))} fi + # Apply ST-specific config fragments (ending in .config and stored in a different folder) + if [ ! -z "${KERNEL_CONFIG_FRAGMENTS}" ]; then + for f in ${KERNEL_CONFIG_FRAGMENTS} + do + # Check if the config fragment was copied into the WORKDIR from + # the OE meta data + if [ ! -e "$f" ]; then + bb_warn "Could not find kernel config fragment $f" + exit 1 + fi + done + # Now that all the fragments are located merge them. + (${S}/scripts/kconfig/merge_config.sh -m -r -O ${B} ${B}/.config ${KERNEL_CONFIG_FRAGMENTS} 1>&2 ) + fi } # Don't create custom folder for kernel artifacts diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch new file mode 100644 index 000000000..41180995b --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0023-5.15-stm32mp-rt-49-r1-CLOCK.patch @@ -0,0 +1,26 @@ +From 63e709173a20b85b473bbf4832f4e909692fd361 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:54:24 +0100 +Subject: [PATCH 23/28] 5.15-stm32mp-rt-49-r1 CLOCK + +Signed-off-by: Lionel VITTE +--- + drivers/clk/stm32/clk-stm32mp13.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/clk/stm32/clk-stm32mp13.c b/drivers/clk/stm32/clk-stm32mp13.c +index 15ee05df8..2f7a823bf 100644 +--- a/drivers/clk/stm32/clk-stm32mp13.c ++++ b/drivers/clk/stm32/clk-stm32mp13.c +@@ -840,7 +840,7 @@ static CLK_STM32_GATE(sai1, "pclk2", 0, GATE_SAI1); + static CLK_STM32_GATE(sai2, "pclk2", 0, GATE_SAI2); + static CLK_STM32_GATE(spi1, "pclk2", 0, GATE_SPI1); + +-static CLK_STM32_GATE(syscfg, "pclk3", 0, GATE_SYSCFG); ++static CLK_STM32_GATE(syscfg, "pclk3", CLK_IS_CRITICAL, GATE_SYSCFG); + static CLK_STM32_GATE(vref, "pclk3", 0, 
GATE_VREF); + static CLK_STM32_GATE(dts, "pclk3", 0, GATE_DTS); + static CLK_STM32_GATE(pmbctrl, "pclk3", 0, GATE_PMBCTRL); +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch new file mode 100644 index 000000000..af368b2d0 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0024-5.15-stm32mp-rt-49-r1-DMA.patch @@ -0,0 +1,131 @@ +From 5a55de398d12848f13f7df59fb2f1853b7dd9ee8 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:56:07 +0100 +Subject: [PATCH 24/28] 5.15-stm32mp-rt-49-r1 DMA + +Signed-off-by: Lionel VITTE +--- + drivers/dma/stm32-dma.c | 35 +++++++++++++++++++++++++---------- + drivers/dma/stm32-mdma.c | 4 ++++ + 2 files changed, 29 insertions(+), 10 deletions(-) + +diff --git a/drivers/dma/stm32-dma.c b/drivers/dma/stm32-dma.c +index 7c6078c6c..128edfb4f 100644 +--- a/drivers/dma/stm32-dma.c ++++ b/drivers/dma/stm32-dma.c +@@ -238,6 +238,7 @@ struct stm32_dma_chan { + u32 residue_after_drain; + struct workqueue_struct *mdma_wq; + struct work_struct mdma_work; ++ struct completion mdma_drain_completion; + }; + + struct stm32_dma_device { +@@ -570,8 +571,9 @@ static u32 stm32_dma_get_remaining_bytes(struct stm32_dma_chan *chan) + return ndtr << width; + } + +-static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan) ++static void stm32_dma_mdma_drain_worker(struct work_struct *work) + { ++ struct stm32_dma_chan *chan = container_of(work, struct stm32_dma_chan, mdma_work); + struct stm32_dma_mdma *mchan = &chan->mchan; + struct stm32_dma_sg_req *sg_req; + struct dma_device *ddev = mchan->chan->device; +@@ -583,14 +585,12 @@ static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan) + int ret; + unsigned long flags; + +- flush_workqueue(chan->mdma_wq); +- + /* DMA/MDMA chain: drain remaining data in SRAM */ + + /* Get the residue on MDMA side */ + status = 
dmaengine_tx_status(mchan->chan, mchan->chan->cookie, &state); + if (status == DMA_COMPLETE) +- return status; ++ goto mdma_complete; + + mdma_residue = state.residue; + sg_req = &chan->desc->sg_req[chan->next_sg - 1]; +@@ -623,24 +623,25 @@ static int stm32_dma_mdma_drain(struct stm32_dma_chan *chan) + desc = ddev->device_prep_dma_memcpy(mchan->chan, dst_buf, src_buf, dma_to_write, + DMA_PREP_INTERRUPT); + if (!desc) +- return -EINVAL; ++ return; + + ret = dma_submit_error(dmaengine_submit(desc)); + if (ret < 0) +- return ret; ++ return; + + status = dma_wait_for_async_tx(desc); + if (status != DMA_COMPLETE) { + dev_err(chan2dev(chan), "%s dma_wait_for_async_tx error\n", __func__); + dmaengine_terminate_async(mchan->chan); +- return -EBUSY; ++ return; + } + + /* We need to store residue for tx_status() */ + chan->residue_after_drain = len - (mdma_wrote + dma_to_write); + } + +- return 0; ++mdma_complete: ++ complete(&chan->mdma_drain_completion); + } + + static void stm32_dma_synchronize(struct dma_chan *c) +@@ -648,9 +649,22 @@ static void stm32_dma_synchronize(struct dma_chan *c) + struct stm32_dma_chan *chan = to_stm32_dma_chan(c); + struct stm32_dma_mdma *mchan = &chan->mchan; + +- if (chan->desc && chan->use_mdma && mchan->dir == DMA_DEV_TO_MEM) +- if (stm32_dma_mdma_drain(chan)) ++ if (chan->desc && chan->use_mdma && mchan->dir == DMA_DEV_TO_MEM) { ++ unsigned long ms = 5000 + 100; /* dma_sync_wait_timeout + extra 100ms */ ++ ++ reinit_completion(&chan->mdma_drain_completion); ++ ++ flush_workqueue(chan->mdma_wq); ++ INIT_WORK(&chan->mdma_work, stm32_dma_mdma_drain_worker); ++ ++ if (!queue_work(chan->mdma_wq, &chan->mdma_work)) ++ dev_warn(chan2dev(chan), "Work already queued\n"); ++ ++ ms = wait_for_completion_timeout(&chan->mdma_drain_completion, ++ msecs_to_jiffies(ms)); ++ if (ms == 0) + dev_err(chan2dev(chan), "%s: can't drain DMA\n", __func__); ++ } + + if (chan->use_mdma) + dmaengine_synchronize(mchan->chan); +@@ -2338,6 +2352,7 @@ static int 
stm32_dma_probe(struct platform_device *pdev) + dev_warn(&pdev->dev, + "can't alloc MDMA workqueue for %s\n", name); + } ++ init_completion(&chan->mdma_drain_completion); + } + } + } +diff --git a/drivers/dma/stm32-mdma.c b/drivers/dma/stm32-mdma.c +index 133534663..a08c94638 100644 +--- a/drivers/dma/stm32-mdma.c ++++ b/drivers/dma/stm32-mdma.c +@@ -1270,6 +1270,10 @@ static int stm32_mdma_resume(struct dma_chan *c) + unsigned long flags; + u32 status, reg; + ++ /* Transfer can be terminated */ ++ if (!chan->desc || (stm32_mdma_read(dmadev, STM32_MDMA_CCR(chan->id)) & STM32_MDMA_CCR_EN)) ++ return -EPERM; ++ + hwdesc = chan->desc->node[chan->curr_hwdesc].hwdesc; + + spin_lock_irqsave(&chan->vchan.lock, flags); +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch new file mode 100644 index 000000000..80d4f531e --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0025-5.15-stm32mp-rt-49-r1-MFD.patch @@ -0,0 +1,27 @@ +From be5ec688053e6d136bc8ea54ed1e93d523b24580 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:56:45 +0100 +Subject: [PATCH 25/28] 5.15-stm32mp-rt-49-r1 MFD + +Signed-off-by: Lionel VITTE +--- + drivers/mfd/syscon.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/mfd/syscon.c b/drivers/mfd/syscon.c +index 191fdb87c..24530dfe5 100644 +--- a/drivers/mfd/syscon.c ++++ b/drivers/mfd/syscon.c +@@ -38,6 +38,9 @@ static const struct regmap_config syscon_regmap_config = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, ++#ifdef CONFIG_PREEMPT_RT ++ .use_raw_spinlock = true, ++#endif + }; + + static struct syscon *of_syscon_register(struct device_node *np, bool check_clk) +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch 
b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch new file mode 100644 index 000000000..4bcaaebfb --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch @@ -0,0 +1,64 @@ +From 1f4b70cda804c4f3771902254a2614d87a1d366c Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:57:06 +0100 +Subject: [PATCH 26/28] 5.15-stm32mp-rt-49-r1 NET-TTY + +Signed-off-by: Lionel VITTE +--- + drivers/tty/serial/stm32-usart.c | 32 +++++++++++--------------------- + 1 file changed, 11 insertions(+), 21 deletions(-) + +diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c +index 4d7a31664..0cd8e9672 100644 +--- a/drivers/tty/serial/stm32-usart.c ++++ b/drivers/tty/serial/stm32-usart.c +@@ -772,26 +772,16 @@ static irqreturn_t stm32_usart_interrupt(int irq, void *ptr) + } + + if ((sr & USART_SR_RTOF) && !(stm32_port->throttled) && +- stm32_usart_rx_dma_started(stm32_port)) +- return IRQ_WAKE_THREAD; +- else +- return IRQ_HANDLED; +-} +- +-static irqreturn_t stm32_usart_threaded_interrupt(int irq, void *ptr) +-{ +- struct uart_port *port = ptr; +- struct tty_port *tport = &port->state->port; +- unsigned int size; +- unsigned long flags; +- +- /* Receiver timeout irq for DMA RX */ +- spin_lock_irqsave(&port->lock, flags); +- size = stm32_usart_receive_chars(port, false); +- uart_unlock_and_check_sysrq_irqrestore(port, flags); +- if (size) +- tty_flip_buffer_push(tport); ++ stm32_usart_rx_dma_started(stm32_port)) { ++ unsigned long flags; + ++ spin_lock_irqsave(&port->lock, flags); ++ /* Receiver timeout irq for DMA RX */ ++ size = stm32_usart_receive_chars(port, false); ++ uart_unlock_and_check_sysrq_irqrestore(port, flags); ++ if (size) ++ tty_flip_buffer_push(tport); ++ } + return IRQ_HANDLED; + } + +@@ -980,8 +970,8 @@ static int stm32_usart_startup(struct uart_port *port) + u32 val; + int ret; + +- ret = request_threaded_irq(port->irq, 
stm32_usart_interrupt, +- stm32_usart_threaded_interrupt, ++ ret = request_threaded_irq(port->irq, NULL, ++ stm32_usart_interrupt, + IRQF_ONESHOT | IRQF_NO_SUSPEND, + name, port); + if (ret) +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch new file mode 100644 index 000000000..dac422be8 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch @@ -0,0 +1,25 @@ +From e1bd8bc5502e661be4feaadfca1889da1d48cd73 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:57:43 +0100 +Subject: [PATCH 27/28] 5.15-stm32mp-rt-49-r1 DEVICETREE + +Signed-off-by: Lionel VITTE +--- + arch/arm/boot/dts/stm32mp131.dtsi | 1 - + 1 file changed, 1 deletion(-) + +diff --git a/arch/arm/boot/dts/stm32mp131.dtsi b/arch/arm/boot/dts/stm32mp131.dtsi +index 8121ddc97..3fc06961a 100644 +--- a/arch/arm/boot/dts/stm32mp131.dtsi ++++ b/arch/arm/boot/dts/stm32mp131.dtsi +@@ -1241,7 +1241,6 @@ exti-interrupt-map { + syscfg: syscon@50020000 { + compatible = "st,stm32mp157-syscfg", "syscon"; + reg = <0x50020000 0x400>; +- clocks = <&rcc SYSCFG>; + }; + + lptimer2: timer@50021000 { +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch new file mode 100644 index 000000000..e697ee7e5 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/0028-5.15-stm32mp-rt-49-r1-CONFIG.patch @@ -0,0 +1,82 @@ +From 05ea3c26ccad3359d94dbe3c7ba758c2ba2f7dd9 Mon Sep 17 00:00:00 2001 +From: Lionel VITTE +Date: Wed, 8 Feb 2023 09:59:08 +0100 +Subject: [PATCH 28/28] 5.15-stm32mp-rt-49-r1 CONFIG + +Signed-off-by: Lionel VITTE +--- + .../configs/fragment-07-rt-sysvinit.config | 12 +++++++ + 
arch/arm/configs/fragment-07-rt.config | 32 +++++++++++++++++++ + arch/arm/configs/fragment-08-rt-mp13.config | 2 ++ + 3 files changed, 46 insertions(+) + create mode 100644 arch/arm/configs/fragment-07-rt-sysvinit.config + create mode 100644 arch/arm/configs/fragment-07-rt.config + create mode 100644 arch/arm/configs/fragment-08-rt-mp13.config + +diff --git a/arch/arm/configs/fragment-07-rt-sysvinit.config b/arch/arm/configs/fragment-07-rt-sysvinit.config +new file mode 100644 +index 000000000..49a4baf60 +--- /dev/null ++++ b/arch/arm/configs/fragment-07-rt-sysvinit.config +@@ -0,0 +1,12 @@ ++CONFIG_CGROUPS=y ++# CONFIG_CGROUP_SCHED is not set ++# CONFIG_CGROUP_PIDS is not set ++# CONFIG_CGROUP_RDMA is not set ++# CONFIG_CGROUP_FREEZER is not set ++# CONFIG_CGROUP_DEVICE is not set ++# CONFIG_CGROUP_CPUACCT is not set ++# CONFIG_CGROUP_PERF is not set ++# CONFIG_CGROUP_DEBUG is not set ++# CONFIG_CGROUP_NET_PRIO is not set ++# CONFIG_CGROUP_NET_CLASSID is not set ++ +diff --git a/arch/arm/configs/fragment-07-rt.config b/arch/arm/configs/fragment-07-rt.config +new file mode 100644 +index 000000000..98bb8735f +--- /dev/null ++++ b/arch/arm/configs/fragment-07-rt.config +@@ -0,0 +1,32 @@ ++CONFIG_PREEMPT_RT=y ++ ++# disable SCHED_MC ++# CONFIG_MCPM is not set ++ ++# Disable CPUFREQ and CPUIDLE ++# CONFIG_CPU_FREQ is not set ++# CONFIG_CPU_IDLE is not set ++ ++# Force to have HIGH_RES_TIMERS ++CONFIG_HIGH_RES_TIMERS=y ++ ++# force do not go to sleep ++# For multiple core, you should set the specific boot options ++# for isolate the core and render it tickless: "isolcpus=2,3 nohz_full=2,3" ++# Warning: to active only if SMP are present ++# CONFIG_HZ_PERIODIC=y ++ ++# to Enable ftrace, you need to enable the following configuraiton: ++# CONFIG_FTRACE=y ++# CONFIG_IRQSOFF_TRACER=y ++# CONFIG_PREEMPT_TRACER=y ++# CONFIG_SCHED_TRACER=y ++# CONFIG_FUNCTION_TRACER=y ++# By default, the ftrace for RT kernel are disabled ++# CONFIG_FTRACE is not set ++# CONFIG_IRQSOFF_TRACER 
is not set ++# CONFIG_PREEMPT_TRACER is not set ++# CONFIG_SCHED_TRACER is not set ++# CONFIG_FUNCTION_TRACER is not set ++ ++ +diff --git a/arch/arm/configs/fragment-08-rt-mp13.config b/arch/arm/configs/fragment-08-rt-mp13.config +new file mode 100644 +index 000000000..c70d7adc6 +--- /dev/null ++++ b/arch/arm/configs/fragment-08-rt-mp13.config +@@ -0,0 +1,2 @@ ++# Disable SMP on MP13 ++# CONFIG_SMP is not set +-- +2.34.1 + diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch new file mode 100644 index 000000000..b2cf5ed68 --- /dev/null +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey/ccmp1/patch-5.15.119-rt65.patch @@ -0,0 +1,10968 @@ +diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst +index dd913eefbf31..33d3c988b951 100644 +--- a/Documentation/admin-guide/cgroup-v1/memory.rst ++++ b/Documentation/admin-guide/cgroup-v1/memory.rst +@@ -64,6 +64,7 @@ Brief summary of control files. + threads + cgroup.procs show list of processes + cgroup.event_control an interface for event_fd() ++ This knob is not available on CONFIG_PREEMPT_RT systems. + memory.usage_in_bytes show current usage for memory + (See 5.5 for details) + memory.memsw.usage_in_bytes show current usage for memory+Swap +@@ -75,6 +76,7 @@ Brief summary of control files. + memory.max_usage_in_bytes show max memory usage recorded + memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded + memory.soft_limit_in_bytes set/show soft limit of memory usage ++ This knob is not available on CONFIG_PREEMPT_RT systems. 
+ memory.stat show various statistics + memory.use_hierarchy set/show hierarchical account enabled + This knob is deprecated and shouldn't be +diff --git a/Documentation/dev-tools/kcov.rst b/Documentation/dev-tools/kcov.rst +index d2c4c27e1702..d83c9ab49427 100644 +--- a/Documentation/dev-tools/kcov.rst ++++ b/Documentation/dev-tools/kcov.rst +@@ -50,6 +50,7 @@ program using kcov: + #include + #include + #include ++ #include + + #define KCOV_INIT_TRACE _IOR('c', 1, unsigned long) + #define KCOV_ENABLE _IO('c', 100) +@@ -177,6 +178,8 @@ Comparison operands collection is similar to coverage collection: + /* Read number of comparisons collected. */ + n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED); + for (i = 0; i < n; i++) { ++ uint64_t ip; ++ + type = cover[i * KCOV_WORDS_PER_CMP + 1]; + /* arg1 and arg2 - operands of the comparison. */ + arg1 = cover[i * KCOV_WORDS_PER_CMP + 2]; +@@ -251,6 +254,8 @@ selectively from different subsystems. + + .. code-block:: c + ++ /* Same includes and defines as above. 
*/ ++ + struct kcov_remote_arg { + __u32 trace_mode; + __u32 area_size; +diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h +index 1d5716bc060b..2526fd3be5fd 100644 +--- a/arch/alpha/include/asm/spinlock_types.h ++++ b/arch/alpha/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ALPHA_SPINLOCK_TYPES_H + #define _ALPHA_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig +index a8ae17f5740d..0e8631b96e0f 100644 +--- a/arch/arm/Kconfig ++++ b/arch/arm/Kconfig +@@ -32,6 +32,7 @@ config ARM + select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_HUGETLBFS if ARM_LPAE ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_MEMTEST +@@ -68,7 +69,7 @@ config ARM + select HARDIRQS_SW_RESEND + select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT + select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 +- select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU ++ select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT + select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL + select HAVE_ARCH_MMAP_RND_BITS if MMU +@@ -109,6 +110,7 @@ config ARM + select HAVE_PERF_EVENTS + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RSEQ +@@ -124,6 +126,7 @@ config ARM + select OLD_SIGSUSPEND3 + select PCI_SYSCALL if PCI + select PERF_USE_VMALLOC ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select RTC_LIB + select SYS_SUPPORTS_APM_EMULATION + select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M +diff --git 
a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h +index 5976958647fe..0c14b36ef101 100644 +--- a/arch/arm/include/asm/spinlock_types.h ++++ b/arch/arm/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h +index b682189a2b5d..e5e2ceb59544 100644 +--- a/arch/arm/include/asm/thread_info.h ++++ b/arch/arm/include/asm/thread_info.h +@@ -52,6 +52,7 @@ struct cpu_context_save { + struct thread_info { + unsigned long flags; /* low level flags */ + int preempt_count; /* 0 => preemptable, <0 => bug */ ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + struct task_struct *task; /* main task structure */ + __u32 cpu; /* cpu */ + __u32 cpu_domain; /* cpu domain */ +@@ -130,6 +131,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ + #define TIF_UPROBE 3 /* breakpointed or singlestepping */ + #define TIF_NOTIFY_SIGNAL 4 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 9 + + #define TIF_USING_IWMMXT 17 + #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ +@@ -149,6 +151,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) + #define _TIF_SECCOMP (1 << TIF_SECCOMP) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) + + /* Checks for any syscall work in entry-common.S */ +@@ -158,7 +161,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, + /* + * Change these and you break ASM code in entry-common.S + */ +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | 
\ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NOTIFY_SIGNAL) + +diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c +index a646a3f6440f..beb09d74684f 100644 +--- a/arch/arm/kernel/asm-offsets.c ++++ b/arch/arm/kernel/asm-offsets.c +@@ -43,6 +43,7 @@ int main(void) + BLANK(); + DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); + DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); ++ DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); + DEFINE(TI_TASK, offsetof(struct thread_info, task)); + DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TI_CPU_DOMAIN, offsetof(struct thread_info, cpu_domain)); +diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S +index 68261a83b7ad..fa7d110ce555 100644 +--- a/arch/arm/kernel/entry-armv.S ++++ b/arch/arm/kernel/entry-armv.S +@@ -206,11 +206,18 @@ __irq_svc: + + #ifdef CONFIG_PREEMPTION + ldr r8, [tsk, #TI_PREEMPT] @ get preempt count +- ldr r0, [tsk, #TI_FLAGS] @ get flags + teq r8, #0 @ if preempt count != 0 ++ bne 1f @ return from exeption ++ ldr r0, [tsk, #TI_FLAGS] @ get flags ++ tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set ++ blne svc_preempt @ preempt! 
++ ++ ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r8, #0 @ if preempt lazy count != 0 + movne r0, #0 @ force flags to 0 +- tst r0, #_TIF_NEED_RESCHED ++ tst r0, #_TIF_NEED_RESCHED_LAZY + blne svc_preempt ++1: + #endif + + svc_exit r5, irq = 1 @ return from exception +@@ -225,8 +232,14 @@ svc_preempt: + 1: bl preempt_schedule_irq @ irq en/disable is done inside + ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS + tst r0, #_TIF_NEED_RESCHED ++ bne 1b ++ tst r0, #_TIF_NEED_RESCHED_LAZY + reteq r8 @ go again +- b 1b ++ ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count ++ teq r0, #0 @ if preempt lazy count != 0 ++ beq 1b ++ ret r8 @ go again ++ + #endif + + __und_fault: +diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c +index 539897ac2828..4655f04ccdcd 100644 +--- a/arch/arm/kernel/signal.c ++++ b/arch/arm/kernel/signal.c +@@ -607,7 +607,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) + */ + trace_hardirqs_off(); + do { +- if (likely(thread_flags & _TIF_NEED_RESCHED)) { ++ if (likely(thread_flags & (_TIF_NEED_RESCHED | ++ _TIF_NEED_RESCHED_LAZY))) { + schedule(); + } else { + if (unlikely(!user_mode(regs))) +diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c +index af5177801fb1..1de016008e2e 100644 +--- a/arch/arm/mm/fault.c ++++ b/arch/arm/mm/fault.c +@@ -400,6 +400,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + if (addr < TASK_SIZE) + return do_page_fault(addr, fsr, regs); + ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + if (user_mode(regs)) + goto bad_area; + +@@ -470,6 +473,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, + static int + do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) + { ++ if (interrupts_enabled(regs)) ++ local_irq_enable(); ++ + do_bad_area(addr, fsr, regs); + return 0; + } +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 9d3cbe786f8d..c86b845d0d79 100644 +--- a/arch/arm64/Kconfig 
++++ b/arch/arm64/Kconfig +@@ -88,6 +88,7 @@ config ARM64 + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 + select ARCH_SUPPORTS_NUMA_BALANCING ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT + select ARCH_WANT_DEFAULT_BPF_JIT + select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT +@@ -191,6 +192,7 @@ config ARM64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP + select HAVE_REGS_AND_STACK_ACCESS_API ++ select HAVE_PREEMPT_LAZY + select HAVE_FUNCTION_ARG_ACCESS_API + select HAVE_FUTEX_CMPXCHG if FUTEX + select MMU_GATHER_RCU_TABLE_FREE +@@ -212,6 +214,7 @@ config ARM64 + select PCI_DOMAINS_GENERIC if PCI + select PCI_ECAM if (ACPI && PCI) + select PCI_SYSCALL if PCI ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select POWER_RESET + select POWER_SUPPLY + select SPARSE_IRQ +diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h +index ed57717cd004..63b39229890b 100644 +--- a/arch/arm64/include/asm/pgtable.h ++++ b/arch/arm64/include/asm/pgtable.h +@@ -1001,7 +1001,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, + */ + static inline bool arch_faults_on_old_pte(void) + { +- WARN_ON(preemptible()); ++ WARN_ON(is_migratable()); + + return !cpu_has_hw_af(); + } +diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h +index e83f0982b99c..2545c17281e1 100644 +--- a/arch/arm64/include/asm/preempt.h ++++ b/arch/arm64/include/asm/preempt.h +@@ -70,13 +70,36 @@ static inline bool __preempt_count_dec_and_test(void) + * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE + * pair. 
+ */ +- return !pc || !READ_ONCE(ti->preempt_count); ++ if (!pc || !READ_ONCE(ti->preempt_count)) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if ((pc & ~PREEMPT_NEED_RESCHED)) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif + } + + static inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u64 pc = READ_ONCE(current_thread_info()->preempt_count); ++ if (pc == preempt_offset) ++ return true; ++ ++ if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) ++ return false; ++ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + return pc == preempt_offset; ++#endif + } + + #ifdef CONFIG_PREEMPTION +diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h +index ef449f5f4ba8..5e535c3e4926 100644 +--- a/arch/arm64/include/asm/signal.h ++++ b/arch/arm64/include/asm/signal.h +@@ -22,4 +22,8 @@ static inline void __user *arch_untagged_si_addr(void __user *addr, + } + #define arch_untagged_si_addr arch_untagged_si_addr + ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ + #endif +diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h +index 18782f0c4721..11ab1c077697 100644 +--- a/arch/arm64/include/asm/spinlock_types.h ++++ b/arch/arm64/include/asm/spinlock_types.h +@@ -5,7 +5,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) ++#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) + # error "please don't include this file directly" + #endif + +diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h +index 6623c99f0984..c55ccec33a5a 100644 +--- 
a/arch/arm64/include/asm/thread_info.h ++++ b/arch/arm64/include/asm/thread_info.h +@@ -26,6 +26,7 @@ struct thread_info { + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ + #endif ++ int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ + union { + u64 preempt_count; /* 0 => preemptible, <0 => bug */ + struct { +@@ -67,6 +68,7 @@ int arch_dup_task_struct(struct task_struct *dst, + #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ + #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ + #define TIF_NOTIFY_SIGNAL 6 /* signal notifications exist */ ++#define TIF_NEED_RESCHED_LAZY 7 + #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ + #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ + #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ +@@ -97,8 +99,10 @@ int arch_dup_task_struct(struct task_struct *dst, + #define _TIF_SVE (1 << TIF_SVE) + #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + +-#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ ++#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_SIGPENDING | \ + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ + _TIF_NOTIFY_SIGNAL) +@@ -107,6 +111,8 @@ int arch_dup_task_struct(struct task_struct *dst, + _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ + _TIF_SYSCALL_EMU) + ++#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++ + #ifdef CONFIG_SHADOW_CALL_STACK + #define INIT_SCS \ + .scs_base = init_shadow_call_stack, \ +diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c +index 551427ae8cc5..96a4f6c9eb78 100644 +--- a/arch/arm64/kernel/asm-offsets.c ++++ b/arch/arm64/kernel/asm-offsets.c +@@ -31,6 +31,7 @@ int main(void) + BLANK(); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, 
thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); ++ DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); + #ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); + #endif +diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c +index 7a3fcf21b18a..5689d2b0c0b6 100644 +--- a/arch/arm64/kernel/fpsimd.c ++++ b/arch/arm64/kernel/fpsimd.c +@@ -179,10 +179,19 @@ static void __get_cpu_fpsimd_context(void) + * + * The double-underscore version must only be called if you know the task + * can't be preempted. ++ * ++ * On RT kernels local_bh_disable() is not sufficient because it only ++ * serializes soft interrupt related sections via a local lock, but stays ++ * preemptible. Disabling preemption is the right choice here as bottom ++ * half processing is always in thread context on RT kernels so it ++ * implicitly prevents bottom half processing as well. 
+ */ + static void get_cpu_fpsimd_context(void) + { +- local_bh_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_disable(); ++ else ++ preempt_disable(); + __get_cpu_fpsimd_context(); + } + +@@ -203,7 +212,10 @@ static void __put_cpu_fpsimd_context(void) + static void put_cpu_fpsimd_context(void) + { + __put_cpu_fpsimd_context(); +- local_bh_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_bh_enable(); ++ else ++ preempt_enable(); + } + + static bool have_cpu_fpsimd_context(void) +@@ -1033,6 +1045,7 @@ void fpsimd_thread_switch(struct task_struct *next) + void fpsimd_flush_thread(void) + { + int vl, supported_vl; ++ void *sve_state = NULL; + + if (!system_supports_fpsimd()) + return; +@@ -1045,7 +1058,10 @@ void fpsimd_flush_thread(void) + + if (system_supports_sve()) { + clear_thread_flag(TIF_SVE); +- sve_free(current); ++ ++ /* Defer kfree() while in atomic context */ ++ sve_state = current->thread.sve_state; ++ current->thread.sve_state = NULL; + + /* + * Reset the task vector length as required. 
+@@ -1079,6 +1095,7 @@ void fpsimd_flush_thread(void) + } + + put_cpu_fpsimd_context(); ++ kfree(sve_state); + } + + /* +diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c +index b3e1beccf458..03183563feb8 100644 +--- a/arch/arm64/kernel/signal.c ++++ b/arch/arm64/kernel/signal.c +@@ -922,7 +922,7 @@ static void do_signal(struct pt_regs *regs) + void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) + { + do { +- if (thread_flags & _TIF_NEED_RESCHED) { ++ if (thread_flags & _TIF_NEED_RESCHED_MASK) { + /* Unmask Debug and SError for the next task */ + local_daif_restore(DAIF_PROCCTX_NOIRQ); + +@@ -930,6 +930,14 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) + } else { + local_daif_restore(DAIF_PROCCTX); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + +diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c +index 3fe816c244ce..ba8c69cda361 100644 +--- a/arch/arm64/kvm/arm.c ++++ b/arch/arm64/kvm/arm.c +@@ -828,7 +828,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + * involves poking the GIC, which must be done in a + * non-preemptible context. 
+ */ +- preempt_disable(); ++ migrate_disable(); + + kvm_pmu_flush_hwstate(vcpu); + +@@ -852,7 +852,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + kvm_timer_sync_user(vcpu); + kvm_vgic_sync_hwstate(vcpu); + local_irq_enable(); +- preempt_enable(); ++ migrate_enable(); + continue; + } + +@@ -921,7 +921,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) + /* Exit types that need handling before we can be preempted */ + handle_exit_early(vcpu, ret); + +- preempt_enable(); ++ migrate_enable(); + + /* + * The ARMv8 architecture doesn't give the hypervisor +diff --git a/arch/csky/include/asm/spinlock_types.h b/arch/csky/include/asm/spinlock_types.h +index 8ff0f6ff3a00..db87a12c3827 100644 +--- a/arch/csky/include/asm/spinlock_types.h ++++ b/arch/csky/include/asm/spinlock_types.h +@@ -3,7 +3,7 @@ + #ifndef __ASM_CSKY_SPINLOCK_TYPES_H + #define __ASM_CSKY_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h +index 19d233497ba5..d5f66495b670 100644 +--- a/arch/hexagon/include/asm/spinlock_types.h ++++ b/arch/hexagon/include/asm/spinlock_types.h +@@ -8,7 +8,7 @@ + #ifndef _ASM_SPINLOCK_TYPES_H + #define _ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h +index 6e345fefcdca..14b8a161c165 100644 +--- a/arch/ia64/include/asm/spinlock_types.h ++++ b/arch/ia64/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_IA64_SPINLOCK_TYPES_H + #define _ASM_IA64_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/Kconfig 
b/arch/powerpc/Kconfig +index 27222b75d2a4..5495225807eb 100644 +--- a/arch/powerpc/Kconfig ++++ b/arch/powerpc/Kconfig +@@ -151,6 +151,7 @@ config PPC + select ARCH_STACKWALK + select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_DEBUG_PAGEALLOC if PPC_BOOK3S || PPC_8xx || 40x ++ select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_CMPXCHG_LOCKREF if PPC64 + select ARCH_USE_MEMTEST +@@ -218,6 +219,7 @@ config PPC + select HAVE_HW_BREAKPOINT if PERF_EVENTS && (PPC_BOOK3S || PPC_8xx) + select HAVE_IOREMAP_PROT + select HAVE_IRQ_TIME_ACCOUNTING ++ select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM + select HAVE_KERNEL_GZIP + select HAVE_KERNEL_LZMA if DEFAULT_UIMAGE + select HAVE_KERNEL_LZO if DEFAULT_UIMAGE +@@ -234,6 +236,7 @@ config PPC + select HAVE_PERF_EVENTS_NMI if PPC64 + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_RELIABLE_STACKTRACE + select HAVE_RSEQ +diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h +index 0f3cdd8faa95..08243338069d 100644 +--- a/arch/powerpc/include/asm/simple_spinlock_types.h ++++ b/arch/powerpc/include/asm/simple_spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h +index 7ef1cd8168a0..f9e63cacd220 100644 +--- a/arch/powerpc/include/asm/smp.h ++++ b/arch/powerpc/include/asm/smp.h +@@ -62,6 +62,7 @@ struct smp_ops_t { + + extern int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); + extern int smp_send_safe_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us); ++extern void smp_send_debugger_break_cpu(unsigned 
int cpu); + extern void smp_send_debugger_break(void); + extern void start_secondary_resume(void); + extern void smp_generic_give_timebase(void); +diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h +index c5d742f18021..d5f8a74ed2e8 100644 +--- a/arch/powerpc/include/asm/spinlock_types.h ++++ b/arch/powerpc/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H + #define _ASM_POWERPC_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h +index 1c8460e23583..b1653c160bab 100644 +--- a/arch/powerpc/include/asm/stackprotector.h ++++ b/arch/powerpc/include/asm/stackprotector.h +@@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) + unsigned long canary; + + /* Try to get a semi random initial value. 
*/ ++#ifdef CONFIG_PREEMPT_RT ++ canary = (unsigned long)&canary; ++#else + canary = get_random_canary(); ++#endif + canary ^= mftb(); + canary ^= LINUX_VERSION_CODE; + canary &= CANARY_MASK; +diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h +index 87013ac2a640..2920ed371188 100644 +--- a/arch/powerpc/include/asm/thread_info.h ++++ b/arch/powerpc/include/asm/thread_info.h +@@ -53,6 +53,8 @@ + struct thread_info { + int preempt_count; /* 0 => preemptable, + <0 => BUG */ ++ int preempt_lazy_count; /* 0 => preemptable, ++ <0 => BUG */ + unsigned long local_flags; /* private flags for thread */ + #ifdef CONFIG_LIVEPATCH + unsigned long *livepatch_sp; +@@ -99,6 +101,7 @@ void arch_setup_new_exec(void); + #define TIF_PATCH_PENDING 6 /* pending live patching update */ + #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ + #define TIF_SINGLESTEP 8 /* singlestepping active */ ++#define TIF_NEED_RESCHED_LAZY 9 /* lazy rescheduling necessary */ + #define TIF_SECCOMP 10 /* secure computing */ + #define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ + #define TIF_NOERROR 12 /* Force successful syscall return */ +@@ -114,6 +117,7 @@ void arch_setup_new_exec(void); + #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ + #define TIF_32BIT 20 /* 32 bit binary */ + ++ + /* as above, but as bit values */ + #define _TIF_SYSCALL_TRACE (1<flags); + while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { + local_irq_enable(); +- if (ti_flags & _TIF_NEED_RESCHED) { ++ if (ti_flags & _TIF_NEED_RESCHED_MASK) { + schedule(); + } else { + /* +@@ -554,11 +554,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) + /* Returning to a kernel context with local irqs enabled. 
*/ + WARN_ON_ONCE(!(regs->msr & MSR_EE)); + again: +- if (IS_ENABLED(CONFIG_PREEMPT)) { ++ if (IS_ENABLED(CONFIG_PREEMPTION)) { + /* Return to preemptible kernel context */ + if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED)) { + if (preempt_count() == 0) + preempt_schedule_irq(); ++ } else if (unlikely(current_thread_info()->flags & _TIF_NEED_RESCHED_LAZY)) { ++ if ((preempt_count() == 0) && ++ (current_thread_info()->preempt_lazy_count == 0)) ++ preempt_schedule_irq(); + } + } + +diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c +index c4f1d6b7d992..02e17a57da83 100644 +--- a/arch/powerpc/kernel/irq.c ++++ b/arch/powerpc/kernel/irq.c +@@ -690,6 +690,7 @@ static inline void check_stack_overflow(void) + } + } + ++#ifndef CONFIG_PREEMPT_RT + static __always_inline void call_do_softirq(const void *sp) + { + /* Temporarily switch r1 to sp, call __do_softirq() then restore r1. */ +@@ -708,6 +709,7 @@ static __always_inline void call_do_softirq(const void *sp) + "r11", "r12" + ); + } ++#endif + + static __always_inline void call_do_irq(struct pt_regs *regs, void *sp) + { +@@ -820,10 +822,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; + void *softirq_ctx[NR_CPUS] __read_mostly; + void *hardirq_ctx[NR_CPUS] __read_mostly; + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + call_do_softirq(softirq_ctx[smp_processor_id()]); + } ++#endif + + irq_hw_number_t virq_to_hw(unsigned int virq) + { +diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c +index bdee7262c080..d57d37497862 100644 +--- a/arch/powerpc/kernel/kgdb.c ++++ b/arch/powerpc/kernel/kgdb.c +@@ -120,11 +120,19 @@ int kgdb_skipexception(int exception, struct pt_regs *regs) + + static int kgdb_debugger_ipi(struct pt_regs *regs) + { +- kgdb_nmicallback(raw_smp_processor_id(), regs); ++ int cpu = raw_smp_processor_id(); ++ ++ if (!kgdb_roundup_delay(cpu)) ++ kgdb_nmicallback(cpu, regs); + return 0; + } + + #ifdef CONFIG_SMP ++void 
kgdb_roundup_cpu(unsigned int cpu) ++{ ++ smp_send_debugger_break_cpu(cpu); ++} ++ + void kgdb_roundup_cpus(void) + { + smp_send_debugger_break(); +diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c +index fb95f92dcfac..308765f2e7a0 100644 +--- a/arch/powerpc/kernel/smp.c ++++ b/arch/powerpc/kernel/smp.c +@@ -590,6 +590,11 @@ static void debugger_ipi_callback(struct pt_regs *regs) + debugger_ipi(regs); + } + ++void smp_send_debugger_break_cpu(unsigned int cpu) ++{ ++ smp_send_nmi_ipi(cpu, debugger_ipi_callback, 1000000); ++} ++ + void smp_send_debugger_break(void) + { + smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, debugger_ipi_callback, 1000000); +diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c +index a08bb7cefdc5..ae34f68eedc1 100644 +--- a/arch/powerpc/kernel/traps.c ++++ b/arch/powerpc/kernel/traps.c +@@ -260,12 +260,17 @@ static char *get_mmu_str(void) + + static int __die(const char *str, struct pt_regs *regs, long err) + { ++ const char *pr = ""; ++ + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + ++ if (IS_ENABLED(CONFIG_PREEMPTION)) ++ pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; ++ + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", + IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", + PAGE_SIZE / 1024, get_mmu_str(), +- IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", ++ pr, + IS_ENABLED(CONFIG_SMP) ? " SMP" : "", + IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", + debug_pagealloc_enabled() ? 
" DEBUG_PAGEALLOC" : "", +diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig +index ff581d70f20c..e5c84d55bdfb 100644 +--- a/arch/powerpc/kvm/Kconfig ++++ b/arch/powerpc/kvm/Kconfig +@@ -178,6 +178,7 @@ config KVM_E500MC + config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 ++ depends on !PREEMPT_RT + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING +diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c +index ec5d84b4958c..62a80ecc6735 100644 +--- a/arch/powerpc/platforms/pseries/iommu.c ++++ b/arch/powerpc/platforms/pseries/iommu.c +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -200,7 +201,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, + return ret; + } + +-static DEFINE_PER_CPU(__be64 *, tce_page); ++struct tce_page { ++ __be64 * page; ++ local_lock_t lock; ++}; ++static DEFINE_PER_CPU(struct tce_page, tce_page) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, +@@ -223,9 +230,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + direction, attrs); + } + +- local_irq_save(flags); /* to protect tcep and the page behind it */ ++ /* to protect tcep and the page behind it */ ++ local_lock_irqsave(&tce_page.lock, flags); + +- tcep = __this_cpu_read(tce_page); ++ tcep = __this_cpu_read(tce_page.page); + + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() +@@ -234,12 +242,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + /* If allocation fails, fall back to the loop implementation */ + if (!tcep) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + return 
tce_build_pSeriesLP(tbl->it_index, tcenum, + tceshift, + npages, uaddr, direction, attrs); + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + rpn = __pa(uaddr) >> tceshift; +@@ -269,7 +277,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + tcenum += limit; + } while (npages > 0 && !rc); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&tce_page.lock, flags); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; +@@ -454,16 +462,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + DMA_BIDIRECTIONAL, 0); + } + +- local_irq_disable(); /* to protect tcep and the page behind it */ +- tcep = __this_cpu_read(tce_page); ++ /* to protect tcep and the page behind it */ ++ local_lock_irq(&tce_page.lock); ++ tcep = __this_cpu_read(tce_page.page); + + if (!tcep) { + tcep = (__be64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return -ENOMEM; + } +- __this_cpu_write(tce_page, tcep); ++ __this_cpu_write(tce_page.page, tcep); + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; +@@ -506,7 +515,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + + /* error cleanup: caller will clear whole range */ + +- local_irq_enable(); ++ local_unlock_irq(&tce_page.lock); + return rc; + } + +diff --git a/arch/riscv/include/asm/spinlock_types.h b/arch/riscv/include/asm/spinlock_types.h +index f398e7638dd6..5a35a49505da 100644 +--- a/arch/riscv/include/asm/spinlock_types.h ++++ b/arch/riscv/include/asm/spinlock_types.h +@@ -6,7 +6,7 @@ + #ifndef _ASM_RISCV_SPINLOCK_TYPES_H + #define _ASM_RISCV_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h +index a2bbfd7df85f..b69695e39957 100644 +--- 
a/arch/s390/include/asm/spinlock_types.h ++++ b/arch/s390/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h +index e82369f286a2..907bda4b1619 100644 +--- a/arch/sh/include/asm/spinlock_types.h ++++ b/arch/sh/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SH_SPINLOCK_TYPES_H + #define __ASM_SH_SPINLOCK_TYPES_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c +index ef0f0827cf57..2d3eca8fee01 100644 +--- a/arch/sh/kernel/irq.c ++++ b/arch/sh/kernel/irq.c +@@ -149,6 +149,7 @@ void irq_ctx_exit(int cpu) + hardirq_ctx[cpu] = NULL; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct thread_info *curctx; +@@ -176,6 +177,7 @@ void do_softirq_own_stack(void) + "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" + ); + } ++#endif + #else + static inline void handle_one_irq(unsigned int irq) + { +diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c +index c8848bb681a1..41fa1be980a3 100644 +--- a/arch/sparc/kernel/irq_64.c ++++ b/arch/sparc/kernel/irq_64.c +@@ -855,6 +855,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) + set_irq_regs(old_regs); + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + void *orig_sp, *sp = softirq_stack[smp_processor_id()]; +@@ -869,6 +870,7 @@ void do_softirq_own_stack(void) + __asm__ __volatile__("mov %0, %%sp" + : : "r" (orig_sp)); + } ++#endif + + #ifdef CONFIG_HOTPLUG_CPU + void fixup_irqs(void) +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index a08ce6360382..4a4498670861 100644 +--- a/arch/x86/Kconfig ++++ 
b/arch/x86/Kconfig +@@ -107,6 +107,7 @@ config X86 + select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 + select ARCH_SUPPORTS_LTO_CLANG + select ARCH_SUPPORTS_LTO_CLANG_THIN ++ select ARCH_SUPPORTS_RT + select ARCH_USE_BUILTIN_BSWAP + select ARCH_USE_MEMTEST + select ARCH_USE_QUEUED_RWLOCKS +@@ -230,6 +231,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP ++ select HAVE_PREEMPT_LAZY + select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/x86/include/asm/irq_stack.h b/arch/x86/include/asm/irq_stack.h +index e087cd7837c3..96cc92f63b06 100644 +--- a/arch/x86/include/asm/irq_stack.h ++++ b/arch/x86/include/asm/irq_stack.h +@@ -202,6 +202,7 @@ + IRQ_CONSTRAINTS, regs, vector); \ + } + ++#ifndef CONFIG_PREEMPT_RT + /* + * Macro to invoke __do_softirq on the irq stack. This is only called from + * task context when bottom halves are about to be reenabled and soft +@@ -215,6 +216,8 @@ + __this_cpu_write(hardirq_stack_inuse, false); \ + } + ++#endif ++ + #else /* CONFIG_X86_64 */ + /* System vector handlers always run on the stack they interrupted. */ + #define run_sysvec_on_irqstack_cond(func, regs) \ +diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h +index fe5efbcba824..ab8cb5fc2329 100644 +--- a/arch/x86/include/asm/preempt.h ++++ b/arch/x86/include/asm/preempt.h +@@ -90,17 +90,48 @@ static __always_inline void __preempt_count_sub(int val) + * a decrement which hits zero means we have no preempt_count and should + * reschedule. 
+ */ +-static __always_inline bool __preempt_count_dec_and_test(void) ++static __always_inline bool ____preempt_count_dec_and_test(void) + { + return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); + } + ++static __always_inline bool __preempt_count_dec_and_test(void) ++{ ++ if (____preempt_count_dec_and_test()) ++ return true; ++#ifdef CONFIG_PREEMPT_LAZY ++ if (preempt_count()) ++ return false; ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else ++ return false; ++#endif ++} ++ + /* + * Returns true when we need to resched and can (barring IRQ state). + */ + static __always_inline bool should_resched(int preempt_offset) + { ++#ifdef CONFIG_PREEMPT_LAZY ++ u32 tmp; ++ tmp = raw_cpu_read_4(__preempt_count); ++ if (tmp == preempt_offset) ++ return true; ++ ++ /* preempt count == 0 ? */ ++ tmp &= ~PREEMPT_NEED_RESCHED; ++ if (tmp != preempt_offset) ++ return false; ++ /* XXX PREEMPT_LOCK_OFFSET */ ++ if (current_thread_info()->preempt_lazy_count) ++ return false; ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++#else + return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); ++#endif + } + + #ifdef CONFIG_PREEMPTION +diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h +index 2dfb5fea13af..fc03f4f7ed84 100644 +--- a/arch/x86/include/asm/signal.h ++++ b/arch/x86/include/asm/signal.h +@@ -28,6 +28,19 @@ typedef struct { + #define SA_IA32_ABI 0x02000000u + #define SA_X32_ABI 0x01000000u + ++/* ++ * Because some traps use the IST stack, we must keep preemption ++ * disabled while calling do_trap(), but do_trap() may call ++ * force_sig_info() which will grab the signal spin_locks for the ++ * task, which in PREEMPT_RT are mutexes. By defining ++ * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set ++ * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the ++ * trap. 
++ */ ++#if defined(CONFIG_PREEMPT_RT) ++#define ARCH_RT_DELAYS_SIGNAL_SEND ++#endif ++ + #ifndef CONFIG_COMPAT + #define compat_sigset_t compat_sigset_t + typedef sigset_t compat_sigset_t; +diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h +index 24a8d6c4fb18..2fc22c27df18 100644 +--- a/arch/x86/include/asm/stackprotector.h ++++ b/arch/x86/include/asm/stackprotector.h +@@ -50,7 +50,7 @@ + */ + static __always_inline void boot_init_stack_canary(void) + { +- u64 canary; ++ u64 canary = 0; + u64 tsc; + + #ifdef CONFIG_X86_64 +@@ -61,8 +61,14 @@ static __always_inline void boot_init_stack_canary(void) + * of randomness. The TSC only matters for very early init, + * there it already has some randomness on most systems. Later + * on during the bootup the random pool has true entropy too. ++ * For preempt-rt we need to weaken the randomness a bit, as ++ * we can't call into the random generator from atomic context ++ * due to locking constraints. We just leave canary ++ * uninitialized and use the TSC based randomness on top of it. 
+ */ ++#ifndef CONFIG_PREEMPT_RT + get_random_bytes(&canary, sizeof(canary)); ++#endif + tsc = rdtsc(); + canary += tsc + (tsc << 32UL); + canary &= CANARY_MASK; +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index cf132663c219..75dc786e6365 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -57,11 +57,14 @@ struct thread_info { + unsigned long flags; /* low level flags */ + unsigned long syscall_work; /* SYSCALL_WORK_ flags */ + u32 status; /* thread synchronous flags */ ++ int preempt_lazy_count; /* 0 => lazy preemptable ++ <0 => BUG */ + }; + + #define INIT_THREAD_INFO(tsk) \ + { \ + .flags = 0, \ ++ .preempt_lazy_count = 0, \ + } + + #else /* !__ASSEMBLY__ */ +@@ -90,6 +93,7 @@ struct thread_info { + #define TIF_NOTSC 16 /* TSC is not accessible in userland */ + #define TIF_NOTIFY_SIGNAL 17 /* signal notifications exist */ + #define TIF_SLD 18 /* Restore split lock detection on context switch */ ++#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ + #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ + #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ + #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ +@@ -114,6 +118,7 @@ struct thread_info { + #define _TIF_NOTSC (1 << TIF_NOTSC) + #define _TIF_NOTIFY_SIGNAL (1 << TIF_NOTIFY_SIGNAL) + #define _TIF_SLD (1 << TIF_SLD) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) + #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) + #define _TIF_SPEC_FORCE_UPDATE (1 << TIF_SPEC_FORCE_UPDATE) +diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c +index 044902d5a3c4..e5dd6da78713 100644 +--- a/arch/x86/kernel/irq_32.c ++++ b/arch/x86/kernel/irq_32.c +@@ -132,6 +132,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) + return 0; + } + ++#ifndef CONFIG_PREEMPT_RT + void do_softirq_own_stack(void) + { + struct irq_stack *irqstk; 
+@@ -148,6 +149,7 @@ void do_softirq_own_stack(void) + + call_on_stack(__do_softirq, isp); + } ++#endif + + void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) + { +diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c +index 3a43a2dee658..37bd37cdf2b6 100644 +--- a/arch/x86/kernel/kgdb.c ++++ b/arch/x86/kernel/kgdb.c +@@ -502,9 +502,12 @@ static int kgdb_nmi_handler(unsigned int cmd, struct pt_regs *regs) + if (atomic_read(&kgdb_active) != -1) { + /* KGDB CPU roundup */ + cpu = raw_smp_processor_id(); +- kgdb_nmicallback(cpu, regs); +- set_bit(cpu, was_in_debug_nmi); +- touch_nmi_watchdog(); ++ ++ if (!kgdb_roundup_delay(cpu)) { ++ kgdb_nmicallback(cpu, regs); ++ set_bit(cpu, was_in_debug_nmi); ++ touch_nmi_watchdog(); ++ } + + return NMI_HANDLED; + } +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 7e1e3bc74562..38639c57b462 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -8686,6 +8686,14 @@ int kvm_arch_init(void *opaque) + goto out; + } + ++#ifdef CONFIG_PREEMPT_RT ++ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { ++ pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); ++ r = -EOPNOTSUPP; ++ goto out; ++ } ++#endif ++ + r = -ENOMEM; + x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), + __alignof__(struct fpu), SLAB_ACCOUNT, +diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h +index 64c9389254f1..797aed7df3dd 100644 +--- a/arch/xtensa/include/asm/spinlock_types.h ++++ b/arch/xtensa/include/asm/spinlock_types.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_SPINLOCK_TYPES_H + #define __ASM_SPINLOCK_TYPES_H + +-#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) ++#if !defined(__LINUX_SPINLOCK_TYPES_RAW_H) && !defined(__ASM_SPINLOCK_H) + # error "please don't include this file directly" + #endif + +diff --git a/block/blk-mq.c b/block/blk-mq.c +index bbbbcd2c1941..0fc928de505d 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -1567,14 +1567,14 @@ static 
void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + return; + + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { +- int cpu = get_cpu(); ++ int cpu = get_cpu_light(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); +- put_cpu(); ++ put_cpu_light(); + return; + } + +- put_cpu(); ++ put_cpu_light(); + } + + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, +diff --git a/crypto/testmgr.c b/crypto/testmgr.c +index 163a1283a866..444183fe847d 100644 +--- a/crypto/testmgr.c ++++ b/crypto/testmgr.c +@@ -1061,14 +1061,14 @@ static void generate_random_testvec_config(struct testvec_config *cfg, + + static void crypto_disable_simd_for_test(void) + { +- preempt_disable(); ++ migrate_disable(); + __this_cpu_write(crypto_simd_disabled_for_test, true); + } + + static void crypto_reenable_simd_for_test(void) + { + __this_cpu_write(crypto_simd_disabled_for_test, false); +- preempt_enable(); ++ migrate_enable(); + } + + /* +diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c +index 6383c81ac5b3..abb695f5f5e4 100644 +--- a/drivers/block/zram/zram_drv.c ++++ b/drivers/block/zram/zram_drv.c +@@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); + static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, + u32 index, int offset, struct bio *bio); + ++#ifdef CONFIG_PREEMPT_RT ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) ++{ ++ size_t index; ++ ++ for (index = 0; index < num_pages; index++) ++ spin_lock_init(&zram->table[index].lock); ++} ++ ++static int zram_slot_trylock(struct zram *zram, u32 index) ++{ ++ int ret; ++ ++ ret = spin_trylock(&zram->table[index].lock); ++ if (ret) ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++ return ret; ++} ++ ++static void zram_slot_lock(struct zram *zram, u32 index) ++{ ++ spin_lock(&zram->table[index].lock); ++ __set_bit(ZRAM_LOCK, &zram->table[index].flags); ++} ++ ++static 
void zram_slot_unlock(struct zram *zram, u32 index) ++{ ++ __clear_bit(ZRAM_LOCK, &zram->table[index].flags); ++ spin_unlock(&zram->table[index].lock); ++} ++ ++#else ++ ++static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } + + static int zram_slot_trylock(struct zram *zram, u32 index) + { +@@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) + { + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); + } ++#endif + + static inline bool init_done(struct zram *zram) + { +@@ -1169,6 +1204,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) + + if (!huge_class_size) + huge_class_size = zs_huge_class_size(zram->mem_pool); ++ zram_meta_init_table_locks(zram, num_pages); + return true; + } + +diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h +index 80c3b43b4828..d8f6d880f915 100644 +--- a/drivers/block/zram/zram_drv.h ++++ b/drivers/block/zram/zram_drv.h +@@ -63,6 +63,7 @@ struct zram_table_entry { + unsigned long element; + }; + unsigned long flags; ++ spinlock_t lock; + #ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; + #endif +diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c +index dfb463ee7ca1..b19c4f745ee3 100644 +--- a/drivers/char/tpm/tpm_tis.c ++++ b/drivers/char/tpm/tpm_tis.c +@@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da + return container_of(data, struct tpm_tis_tcg_phy, priv); + } + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * Flushes previous write operations to chip so that a subsequent ++ * ioread*()s won't stall a cpu. 
++ */ ++static inline void tpm_tis_flush(void __iomem *iobase) ++{ ++ ioread8(iobase + TPM_ACCESS(0)); ++} ++#else ++#define tpm_tis_flush(iobase) do { } while (0) ++#endif ++ ++static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite8(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ ++static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) ++{ ++ iowrite32(b, iobase + addr); ++ tpm_tis_flush(iobase); ++} ++ + static int interrupts = -1; + module_param(interrupts, int, 0444); + MODULE_PARM_DESC(interrupts, "Enable interrupts"); +@@ -186,7 +211,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + + while (len--) +- iowrite8(*value++, phy->iobase + addr); ++ tpm_tis_iowrite8(*value++, phy->iobase, addr); + + return 0; + } +@@ -213,7 +238,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) + { + struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); + +- iowrite32(value, phy->iobase + addr); ++ tpm_tis_iowrite32(value, phy->iobase, addr); + + return 0; + } +diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c +index 332739f3eded..8589df0e8c1f 100644 +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -66,7 +66,7 @@ struct mm_struct efi_mm = { + + struct workqueue_struct *efi_rts_wq; + +-static bool disable_runtime; ++static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); + static int __init setup_noefi(char *arg) + { + disable_runtime = true; +@@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) + if (parse_option_str(str, "noruntime")) + disable_runtime = true; + ++ if (parse_option_str(str, "runtime")) ++ disable_runtime = false; ++ + if (parse_option_str(str, "nosoftreserve")) + set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); + +diff --git a/drivers/gpu/drm/i915/display/intel_crtc.c b/drivers/gpu/drm/i915/display/intel_crtc.c +index 
254e67141a77..7a39029b083f 100644 +--- a/drivers/gpu/drm/i915/display/intel_crtc.c ++++ b/drivers/gpu/drm/i915/display/intel_crtc.c +@@ -425,7 +425,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + */ + intel_psr_wait_for_idle(new_crtc_state); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + + crtc->debug.min_vbl = min; + crtc->debug.max_vbl = max; +@@ -450,11 +451,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + break; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + timeout = schedule_timeout(timeout); + +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + finish_wait(wq, &wait); +@@ -487,7 +490,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) + return; + + irq_disable: +- local_irq_disable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_disable(); + } + + #if IS_ENABLED(CONFIG_DRM_I915_DEBUG_VBLANK_EVADE) +@@ -566,7 +570,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) + new_crtc_state->uapi.event = NULL; + } + +- local_irq_enable(); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ local_irq_enable(); + + /* Send VRR Push to terminate Vblank */ + intel_vrr_send_push(new_crtc_state); +diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +index 209cf265bf74..6e1b9068d944 100644 +--- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c ++++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +@@ -311,10 +311,9 @@ void __intel_breadcrumbs_park(struct intel_breadcrumbs *b) + /* Kick the work once more to drain the signalers, and disarm the irq */ + irq_work_sync(&b->irq_work); + while (READ_ONCE(b->irq_armed) && !atomic_read(&b->active)) { +- local_irq_disable(); +- signal_irq_work(&b->irq_work); +- local_irq_enable(); ++ irq_work_queue(&b->irq_work); + cond_resched(); ++ 
irq_work_sync(&b->irq_work); + } + } + +diff --git a/drivers/gpu/drm/i915/gt/intel_context.h b/drivers/gpu/drm/i915/gt/intel_context.h +index c41098950746..601274ba86e4 100644 +--- a/drivers/gpu/drm/i915/gt/intel_context.h ++++ b/drivers/gpu/drm/i915/gt/intel_context.h +@@ -163,7 +163,8 @@ static inline void intel_context_enter(struct intel_context *ce) + + static inline void intel_context_mark_active(struct intel_context *ce) + { +- lockdep_assert_held(&ce->timeline->mutex); ++ lockdep_assert(lockdep_is_held(&ce->timeline->mutex) || ++ test_bit(CONTEXT_IS_PARKED, &ce->flags)); + ++ce->active_count; + } + +diff --git a/drivers/gpu/drm/i915/gt/intel_context_types.h b/drivers/gpu/drm/i915/gt/intel_context_types.h +index a63631ea0ec4..314457fb9db5 100644 +--- a/drivers/gpu/drm/i915/gt/intel_context_types.h ++++ b/drivers/gpu/drm/i915/gt/intel_context_types.h +@@ -112,6 +112,7 @@ struct intel_context { + #define CONTEXT_FORCE_SINGLE_SUBMISSION 7 + #define CONTEXT_NOPREEMPT 8 + #define CONTEXT_LRCA_DIRTY 9 ++#define CONTEXT_IS_PARKED 10 + + struct { + u64 timeout_us; +diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +index dacd62773735..73e96ca024df 100644 +--- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c ++++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c +@@ -80,39 +80,6 @@ static int __engine_unpark(struct intel_wakeref *wf) + return 0; + } + +-#if IS_ENABLED(CONFIG_LOCKDEP) +- +-static unsigned long __timeline_mark_lock(struct intel_context *ce) +-{ +- unsigned long flags; +- +- local_irq_save(flags); +- mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); +- +- return flags; +-} +- +-static void __timeline_mark_unlock(struct intel_context *ce, +- unsigned long flags) +-{ +- mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); +- local_irq_restore(flags); +-} +- +-#else +- +-static unsigned long __timeline_mark_lock(struct intel_context *ce) +-{ +- return 0; +-} +- +-static void __timeline_mark_unlock(struct 
intel_context *ce, +- unsigned long flags) +-{ +-} +- +-#endif /* !IS_ENABLED(CONFIG_LOCKDEP) */ +- + static void duration(struct dma_fence *fence, struct dma_fence_cb *cb) + { + struct i915_request *rq = to_request(fence); +@@ -159,7 +126,6 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + { + struct intel_context *ce = engine->kernel_context; + struct i915_request *rq; +- unsigned long flags; + bool result = true; + + /* GPU is pointing to the void, as good as in the kernel context. */ +@@ -201,7 +167,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + * engine->wakeref.count, we may see the request completion and retire + * it causing an underflow of the engine->wakeref. + */ +- flags = __timeline_mark_lock(ce); ++ set_bit(CONTEXT_IS_PARKED, &ce->flags); + GEM_BUG_ON(atomic_read(&ce->timeline->active_count) < 0); + + rq = __i915_request_create(ce, GFP_NOWAIT); +@@ -233,7 +199,7 @@ static bool switch_to_kernel_context(struct intel_engine_cs *engine) + + result = false; + out_unlock: +- __timeline_mark_unlock(ce, flags); ++ clear_bit(CONTEXT_IS_PARKED, &ce->flags); + return result; + } + +diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +index 773ff5121833..f330457209d5 100644 +--- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c ++++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +@@ -1286,7 +1286,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + * and context switches) submission. + */ + +- spin_lock(&sched_engine->lock); ++ spin_lock_irq(&sched_engine->lock); + + /* + * If the queue is higher priority than the last +@@ -1386,7 +1386,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + * Even if ELSP[1] is occupied and not worthy + * of timeslices, our queue might be. 
+ */ +- spin_unlock(&sched_engine->lock); ++ spin_unlock_irq(&sched_engine->lock); + return; + } + } +@@ -1412,7 +1412,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + + if (last && !can_merge_rq(last, rq)) { + spin_unlock(&ve->base.sched_engine->lock); +- spin_unlock(&engine->sched_engine->lock); ++ spin_unlock_irq(&engine->sched_engine->lock); + return; /* leave this for another sibling */ + } + +@@ -1574,7 +1574,7 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + */ + sched_engine->queue_priority_hint = queue_prio(sched_engine); + i915_sched_engine_reset_on_empty(sched_engine); +- spin_unlock(&sched_engine->lock); ++ spin_unlock_irq(&sched_engine->lock); + + /* + * We can skip poking the HW if we ended up with exactly the same set +@@ -1600,13 +1600,6 @@ static void execlists_dequeue(struct intel_engine_cs *engine) + } + } + +-static void execlists_dequeue_irq(struct intel_engine_cs *engine) +-{ +- local_irq_disable(); /* Suspend interrupts across request submission */ +- execlists_dequeue(engine); +- local_irq_enable(); /* flush irq_work (e.g. breadcrumb enabling) */ +-} +- + static void clear_ports(struct i915_request **ports, int count) + { + memset_p((void **)ports, NULL, count); +@@ -2442,7 +2435,7 @@ static void execlists_submission_tasklet(struct tasklet_struct *t) + } + + if (!engine->execlists.pending[0]) { +- execlists_dequeue_irq(engine); ++ execlists_dequeue(engine); + start_timeslice(engine); + } + +diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c +index 9bc4f4a8e12e..547347241a47 100644 +--- a/drivers/gpu/drm/i915/i915_irq.c ++++ b/drivers/gpu/drm/i915/i915_irq.c +@@ -886,7 +886,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + */ + spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); + +- /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. 
*/ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + + /* Get optional system timestamp before query. */ + if (stime) +@@ -950,7 +951,8 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, + if (etime) + *etime = ktime_get(); + +- /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + + spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); + +diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c +index 79da5eca60af..b9dd6100c6d1 100644 +--- a/drivers/gpu/drm/i915/i915_request.c ++++ b/drivers/gpu/drm/i915/i915_request.c +@@ -559,7 +559,6 @@ bool __i915_request_submit(struct i915_request *request) + + RQ_TRACE(request, "\n"); + +- GEM_BUG_ON(!irqs_disabled()); + lockdep_assert_held(&engine->sched_engine->lock); + + /* +@@ -668,7 +667,6 @@ void __i915_request_unsubmit(struct i915_request *request) + */ + RQ_TRACE(request, "\n"); + +- GEM_BUG_ON(!irqs_disabled()); + lockdep_assert_held(&engine->sched_engine->lock); + + /* +diff --git a/drivers/gpu/drm/i915/i915_request.h b/drivers/gpu/drm/i915/i915_request.h +index 1bc1349ba3c2..a2f713b4ac2f 100644 +--- a/drivers/gpu/drm/i915/i915_request.h ++++ b/drivers/gpu/drm/i915/i915_request.h +@@ -609,7 +609,8 @@ i915_request_timeline(const struct i915_request *rq) + { + /* Valid only while the request is being constructed (or retired). 
*/ + return rcu_dereference_protected(rq->timeline, +- lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex)); ++ lockdep_is_held(&rcu_access_pointer(rq->timeline)->mutex) || ++ test_bit(CONTEXT_IS_PARKED, &rq->context->flags)); + } + + static inline struct i915_gem_context * +diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h +index 63fec1c3c132..f345a0f12bf6 100644 +--- a/drivers/gpu/drm/i915/i915_trace.h ++++ b/drivers/gpu/drm/i915/i915_trace.h +@@ -2,6 +2,10 @@ + #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) + #define _I915_TRACE_H_ + ++#ifdef CONFIG_PREEMPT_RT ++#define NOTRACE ++#endif ++ + #include + #include + #include +@@ -819,7 +823,7 @@ DEFINE_EVENT(i915_request, i915_request_add, + TP_ARGS(rq) + ); + +-#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) ++#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) + DEFINE_EVENT(i915_request, i915_request_guc_submit, + TP_PROTO(struct i915_request *rq), + TP_ARGS(rq) +diff --git a/drivers/gpu/drm/i915/i915_utils.h b/drivers/gpu/drm/i915/i915_utils.h +index 5259edacde38..b36b27c09049 100644 +--- a/drivers/gpu/drm/i915/i915_utils.h ++++ b/drivers/gpu/drm/i915/i915_utils.h +@@ -343,7 +343,7 @@ wait_remaining_ms_from_jiffies(unsigned long timestamp_jiffies, int to_wait_ms) + #define wait_for(COND, MS) _wait_for((COND), (MS) * 1000, 10, 1000) + + /* If CONFIG_PREEMPT_COUNT is disabled, in_atomic() always reports false. 
*/ +-#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) ++#if defined(CONFIG_DRM_I915_DEBUG) && defined(CONFIG_PREEMPT_COUNT) && !defined(CONFIG_PREEMPT_RT) + # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) WARN_ON_ONCE((ATOMIC) && !in_atomic()) + #else + # define _WAIT_FOR_ATOMIC_CHECK(ATOMIC) do { } while (0) +diff --git a/drivers/i2c/busses/i2c-cht-wc.c b/drivers/i2c/busses/i2c-cht-wc.c +index 1cf68f85b2e1..8ccf0c928bb4 100644 +--- a/drivers/i2c/busses/i2c-cht-wc.c ++++ b/drivers/i2c/busses/i2c-cht-wc.c +@@ -99,15 +99,8 @@ static irqreturn_t cht_wc_i2c_adap_thread_handler(int id, void *data) + * interrupt handler as well, so running the client irq handler from + * this thread will cause things to lock up. + */ +- if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) { +- /* +- * generic_handle_irq expects local IRQs to be disabled +- * as normally it is called from interrupt context. +- */ +- local_irq_disable(); +- generic_handle_irq(adap->client_irq); +- local_irq_enable(); +- } ++ if (reg & CHT_WC_EXTCHGRIRQ_CLIENT_IRQ) ++ generic_handle_irq_safe(adap->client_irq); + + return IRQ_HANDLED; + } +diff --git a/drivers/i2c/i2c-core-base.c b/drivers/i2c/i2c-core-base.c +index 8fb065caf30b..c232535ca8f4 100644 +--- a/drivers/i2c/i2c-core-base.c ++++ b/drivers/i2c/i2c-core-base.c +@@ -1422,7 +1422,7 @@ int i2c_handle_smbus_host_notify(struct i2c_adapter *adap, unsigned short addr) + if (irq <= 0) + return -ENXIO; + +- generic_handle_irq(irq); ++ generic_handle_irq_safe(irq); + + return 0; + } +diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig +index 1f1d57288085..dc6816d36d06 100644 +--- a/drivers/leds/trigger/Kconfig ++++ b/drivers/leds/trigger/Kconfig +@@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT + + config LEDS_TRIGGER_CPU + bool "LED CPU Trigger" ++ depends on !PREEMPT_RT + help + This allows LEDs to be controlled by active CPUs. 
This shows + the active CPUs across an array of LEDs so you can see which +diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c +index c2a42486f985..451a22641b5a 100644 +--- a/drivers/md/raid5.c ++++ b/drivers/md/raid5.c +@@ -2218,8 +2218,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + struct raid5_percpu *percpu; + unsigned long cpu; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + percpu = per_cpu_ptr(conf->percpu, cpu); ++ spin_lock(&percpu->lock); + if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { + ops_run_biofill(sh); + overlap_clear++; +@@ -2278,7 +2279,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) + if (test_and_clear_bit(R5_Overlap, &dev->flags)) + wake_up(&sh->raid_conf->wait_for_overlap); + } +- put_cpu(); ++ spin_unlock(&percpu->lock); ++ put_cpu_light(); + } + + static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) +@@ -7110,6 +7112,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) + __func__, cpu); + return -ENOMEM; + } ++ spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); + return 0; + } + +diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h +index 5c05acf20e1f..665fe138ab4f 100644 +--- a/drivers/md/raid5.h ++++ b/drivers/md/raid5.h +@@ -635,6 +635,7 @@ struct r5conf { + int recovery_disabled; + /* per cpu variables */ + struct raid5_percpu { ++ spinlock_t lock; /* Protection for -RT */ + struct page *spare_page; /* Used when checking P/Q in raid6 */ + void *scribble; /* space for constructing buffer + * lists and performing address +diff --git a/drivers/mfd/ezx-pcap.c b/drivers/mfd/ezx-pcap.c +index 70fa18b04ad2..b14d3f98e1eb 100644 +--- a/drivers/mfd/ezx-pcap.c ++++ b/drivers/mfd/ezx-pcap.c +@@ -193,13 +193,11 @@ static void pcap_isr_work(struct work_struct *work) + ezx_pcap_write(pcap, PCAP_REG_MSR, isr | msr); + ezx_pcap_write(pcap, PCAP_REG_ISR, isr); + +- local_irq_disable(); + service = isr & ~msr; + for (irq = 
pcap->irq_base; service; service >>= 1, irq++) { + if (service & 1) +- generic_handle_irq(irq); ++ generic_handle_irq_safe(irq); + } +- local_irq_enable(); + ezx_pcap_write(pcap, PCAP_REG_MSR, pcap->msr); + } while (gpio_get_value(pdata->gpio)); + } +diff --git a/drivers/misc/hi6421v600-irq.c b/drivers/misc/hi6421v600-irq.c +index 08535e97ff43..0585a5821d05 100644 +--- a/drivers/misc/hi6421v600-irq.c ++++ b/drivers/misc/hi6421v600-irq.c +@@ -118,8 +118,8 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) + * If both powerkey down and up IRQs are received, + * handle them at the right order + */ +- generic_handle_irq(priv->irqs[POWERKEY_DOWN]); +- generic_handle_irq(priv->irqs[POWERKEY_UP]); ++ generic_handle_irq_safe(priv->irqs[POWERKEY_DOWN]); ++ generic_handle_irq_safe(priv->irqs[POWERKEY_UP]); + pending &= ~HISI_IRQ_POWERKEY_UP_DOWN; + } + +@@ -127,7 +127,7 @@ static irqreturn_t hi6421v600_irq_handler(int irq, void *__priv) + continue; + + for_each_set_bit(offset, &pending, BITS_PER_BYTE) { +- generic_handle_irq(priv->irqs[offset + i * BITS_PER_BYTE]); ++ generic_handle_irq_safe(priv->irqs[offset + i * BITS_PER_BYTE]); + } + } + +diff --git a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c +index 2473fb5f75e5..2a5cc64227e9 100644 +--- a/drivers/net/ethernet/netronome/nfp/abm/qdisc.c ++++ b/drivers/net/ethernet/netronome/nfp/abm/qdisc.c +@@ -458,7 +458,7 @@ nfp_abm_qdisc_graft(struct nfp_abm_link *alink, u32 handle, u32 child_handle, + static void + nfp_abm_stats_calculate(struct nfp_alink_stats *new, + struct nfp_alink_stats *old, +- struct gnet_stats_basic_packed *bstats, ++ struct gnet_stats_basic_sync *bstats, + struct gnet_stats_queue *qstats) + { + _bstats_update(bstats, new->tx_bytes - old->tx_bytes, +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index 5700c9d20a3e..be3330a1c922 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -1367,11 +1367,8 
@@ static void lan78xx_status(struct lan78xx_net *dev, struct urb *urb) + netif_dbg(dev, link, dev->net, "PHY INTR: 0x%08x\n", intdata); + lan78xx_defer_kevent(dev, EVENT_LINK_RESET); + +- if (dev->domain_data.phyirq > 0) { +- local_irq_disable(); +- generic_handle_irq(dev->domain_data.phyirq); +- local_irq_enable(); +- } ++ if (dev->domain_data.phyirq > 0) ++ generic_handle_irq_safe(dev->domain_data.phyirq); + } else { + netdev_warn(dev->net, + "unexpected interrupt: 0x%08x\n", intdata); +diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c +index 76dbdae0e987..967431858dcd 100644 +--- a/drivers/scsi/fcoe/fcoe.c ++++ b/drivers/scsi/fcoe/fcoe.c +@@ -1450,11 +1450,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, + static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) + { + struct fcoe_percpu_s *fps; +- int rc; ++ int rc, cpu = get_cpu_light(); + +- fps = &get_cpu_var(fcoe_percpu); ++ fps = &per_cpu(fcoe_percpu, cpu); + rc = fcoe_get_paged_crc_eof(skb, tlen, fps); +- put_cpu_var(fcoe_percpu); ++ put_cpu_light(); + + return rc; + } +@@ -1639,11 +1639,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, + return 0; + } + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + stats->InvalidCRCCount++; + if (stats->InvalidCRCCount < 5) + printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); +- put_cpu(); ++ put_cpu_light(); + return -EINVAL; + } + +@@ -1684,7 +1684,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) + */ + hp = (struct fcoe_hdr *) skb_network_header(skb); + +- stats = per_cpu_ptr(lport->stats, get_cpu()); ++ stats = per_cpu_ptr(lport->stats, get_cpu_light()); + if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { + if (stats->ErrorFrames < 5) + printk(KERN_WARNING "fcoe: FCoE version " +@@ -1716,13 +1716,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) + goto drop; + + if (!fcoe_filter_frames(lport, fp)) { +- put_cpu(); ++ 
put_cpu_light(); + fc_exch_recv(lport, fp); + return; + } + drop: + stats->ErrorFrames++; +- put_cpu(); ++ put_cpu_light(); + kfree_skb(skb); + } + +diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c +index 558f3f4e1859..f08feaa4f398 100644 +--- a/drivers/scsi/fcoe/fcoe_ctlr.c ++++ b/drivers/scsi/fcoe/fcoe_ctlr.c +@@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + + INIT_LIST_HEAD(&del_list); + +- stats = per_cpu_ptr(fip->lp->stats, get_cpu()); ++ stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); + + list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { + deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; +@@ -864,7 +864,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) + sel_time = fcf->time; + } + } +- put_cpu(); ++ put_cpu_light(); + + list_for_each_entry_safe(fcf, next, &del_list, list) { + /* Removes fcf from current list */ +diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c +index aa223db4cf53..0ceb93800704 100644 +--- a/drivers/scsi/libfc/fc_exch.c ++++ b/drivers/scsi/libfc/fc_exch.c +@@ -825,10 +825,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, + } + memset(ep, 0, sizeof(*ep)); + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = per_cpu_ptr(mp->pool, cpu); + spin_lock_bh(&pool->lock); +- put_cpu(); ++ put_cpu_light(); + + /* peek cache of free slot */ + if (pool->left != FC_XID_UNKNOWN) { +diff --git a/drivers/staging/greybus/gpio.c b/drivers/staging/greybus/gpio.c +index 7e6347fe93f9..8a7cf1d0e968 100644 +--- a/drivers/staging/greybus/gpio.c ++++ b/drivers/staging/greybus/gpio.c +@@ -391,10 +391,7 @@ static int gb_gpio_request_handler(struct gb_operation *op) + return -EINVAL; + } + +- local_irq_disable(); +- ret = generic_handle_irq(irq); +- local_irq_enable(); +- ++ ret = generic_handle_irq_safe(irq); + if (ret) + dev_err(dev, "failed to invoke irq handler\n"); + +diff --git a/drivers/tty/serial/8250/8250.h 
b/drivers/tty/serial/8250/8250.h +index bb1a98c97adf..8639210a89c7 100644 +--- a/drivers/tty/serial/8250/8250.h ++++ b/drivers/tty/serial/8250/8250.h +@@ -156,12 +156,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) + up->dl_write(up, value); + } + ++static inline void serial8250_set_IER(struct uart_8250_port *up, ++ unsigned char ier) ++{ ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(flags); ++ ++ serial_out(up, UART_IER, ier); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++} ++ ++static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) ++{ ++ struct uart_port *port = &up->port; ++ unsigned int clearval = 0; ++ unsigned long flags; ++ unsigned int prior; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (up->capabilities & UART_CAP_UUE) ++ clearval = UART_IER_UUE; ++ ++ if (is_console) ++ console_atomic_lock(flags); ++ ++ prior = serial_port_in(port, UART_IER); ++ serial_port_out(port, UART_IER, clearval); ++ ++ if (is_console) ++ console_atomic_unlock(flags); ++ ++ return prior; ++} ++ + static inline bool serial8250_set_THRI(struct uart_8250_port *up) + { + if (up->ier & UART_IER_THRI) + return false; + up->ier |= UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +@@ -170,7 +213,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) + if (!(up->ier & UART_IER_THRI)) + return false; + up->ier &= ~UART_IER_THRI; +- serial_out(up, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + return true; + } + +diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c +index 1890f342f090..16d5d450b32f 100644 +--- a/drivers/tty/serial/8250/8250_core.c ++++ b/drivers/tty/serial/8250/8250_core.c +@@ -265,10 +265,8 @@ static void serial8250_backup_timeout(struct timer_list 
*t) + * Must disable interrupts or else we risk racing with the interrupt + * based handler. + */ +- if (up->port.irq) { +- ier = serial_in(up, UART_IER); +- serial_out(up, UART_IER, 0); +- } ++ if (up->port.irq) ++ ier = serial8250_clear_IER(up); + + iir = serial_in(up, UART_IIR); + +@@ -291,7 +289,7 @@ static void serial8250_backup_timeout(struct timer_list *t) + serial8250_tx_chars(up); + + if (up->port.irq) +- serial_out(up, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + spin_unlock_irqrestore(&up->port.lock, flags); + +@@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) + + #ifdef CONFIG_SERIAL_8250_CONSOLE + ++static void univ8250_console_write_atomic(struct console *co, const char *s, ++ unsigned int count) ++{ ++ struct uart_8250_port *up = &serial8250_ports[co->index]; ++ ++ serial8250_console_write_atomic(up, s, count); ++} ++ + static void univ8250_console_write(struct console *co, const char *s, + unsigned int count) + { +@@ -671,6 +677,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, + + static struct console univ8250_console = { + .name = "ttyS", ++ .write_atomic = univ8250_console_write_atomic, + .write = univ8250_console_write, + .device = uart_console_device, + .setup = univ8250_console_setup, +diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c +index 6a22f3a970f3..a6c02140eff0 100644 +--- a/drivers/tty/serial/8250/8250_fsl.c ++++ b/drivers/tty/serial/8250/8250_fsl.c +@@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) + + /* Stop processing interrupts on input overrun */ + if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { ++ unsigned long flags; + unsigned long delay; ++ bool is_console; + ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(flags); + up->ier = port->serial_in(port, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); ++ + if (up->ier & (UART_IER_RLSI | 
UART_IER_RDI)) { + port->ops->stop_rx(port); + } else { +diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c +index 65402d05eff9..8122645ab05c 100644 +--- a/drivers/tty/serial/8250/8250_ingenic.c ++++ b/drivers/tty/serial/8250/8250_ingenic.c +@@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", + + static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + { ++ unsigned long flags; ++ bool is_console; + int ier; + + switch (offset) { +@@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) + * If we have enabled modem status IRQs we should enable + * modem mode. + */ ++ is_console = uart_console(p); ++ if (is_console) ++ console_atomic_lock(flags); + ier = p->serial_in(p, UART_IER); ++ if (is_console) ++ console_atomic_unlock(flags); + + if (ier & UART_IER_MSI) + value |= UART_MCR_MDCE | UART_MCR_FCM; +diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c +index de48a58460f4..364ee950f21a 100644 +--- a/drivers/tty/serial/8250/8250_mtk.c ++++ b/drivers/tty/serial/8250/8250_mtk.c +@@ -222,12 +222,37 @@ static void mtk8250_shutdown(struct uart_port *port) + + static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier; ++ bool is_console; ++ ++ is_console = uart_console(port); ++ ++ if (is_console) ++ console_atomic_lock(flags); ++ ++ ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier & (~mask)); ++ ++ if (is_console) ++ console_atomic_unlock(flags); + } + + static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) + { +- serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier; ++ ++ if (uart_console(port)) ++ console_atomic_lock(flags); ++ ++ 
ier = serial_in(up, UART_IER); ++ serial_out(up, UART_IER, ier | mask); ++ ++ if (uart_console(port)) ++ console_atomic_unlock(flags); + } + + static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) +diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c +index bfdd9ecc2baf..479b94b3238a 100644 +--- a/drivers/tty/serial/8250/8250_port.c ++++ b/drivers/tty/serial/8250/8250_port.c +@@ -752,7 +752,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) + serial_out(p, UART_EFR, UART_EFR_ECB); + serial_out(p, UART_LCR, 0); + } +- serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); ++ serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); + if (p->capabilities & UART_CAP_EFR) { + serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); + serial_out(p, UART_EFR, efr); +@@ -1427,7 +1427,7 @@ static void serial8250_stop_rx(struct uart_port *port) + + up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); + up->port.read_status_mask &= ~UART_LSR_DR; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + serial8250_rpm_put(up); + } +@@ -1457,7 +1457,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) + serial8250_clear_and_reinit_fifos(p); + + p->ier |= UART_IER_RLSI | UART_IER_RDI; +- serial_port_out(&p->port, UART_IER, p->ier); ++ serial8250_set_IER(p, p->ier); + } + } + EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); +@@ -1693,7 +1693,7 @@ static void serial8250_disable_ms(struct uart_port *port) + mctrl_gpio_disable_ms(up->gpios); + + up->ier &= ~UART_IER_MSI; +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + } + + static void serial8250_enable_ms(struct uart_port *port) +@@ -1709,7 +1709,7 @@ static void serial8250_enable_ms(struct uart_port *port) + up->ier |= UART_IER_MSI; + + serial8250_rpm_get(up); +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + serial8250_rpm_put(up); + } + +@@ -2143,14 +2143,7 @@ static void 
serial8250_put_poll_char(struct uart_port *port, + struct uart_8250_port *up = up_to_u8250p(port); + + serial8250_rpm_get(up); +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + wait_for_xmitr(up, BOTH_EMPTY); + /* +@@ -2163,7 +2156,7 @@ static void serial8250_put_poll_char(struct uart_port *port, + * and restore the IER + */ + wait_for_xmitr(up, BOTH_EMPTY); +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + serial8250_rpm_put(up); + } + +@@ -2468,7 +2461,7 @@ void serial8250_do_shutdown(struct uart_port *port) + */ + spin_lock_irqsave(&port->lock, flags); + up->ier = 0; +- serial_port_out(port, UART_IER, 0); ++ serial8250_set_IER(up, 0); + spin_unlock_irqrestore(&port->lock, flags); + + synchronize_irq(port->irq); +@@ -2850,7 +2843,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, + if (up->capabilities & UART_CAP_RTOIE) + up->ier |= UART_IER_RTOIE; + +- serial_port_out(port, UART_IER, up->ier); ++ serial8250_set_IER(up, up->ier); + + if (up->capabilities & UART_CAP_EFR) { + unsigned char efr = 0; +@@ -3315,7 +3308,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); + + #ifdef CONFIG_SERIAL_8250_CONSOLE + +-static void serial8250_console_putchar(struct uart_port *port, int ch) ++static void serial8250_console_putchar_locked(struct uart_port *port, int ch) + { + struct uart_8250_port *up = up_to_u8250p(port); + +@@ -3323,6 +3316,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) + serial_port_out(port, UART_TX, ch); + } + ++static void serial8250_console_putchar(struct uart_port *port, int ch) ++{ ++ struct uart_8250_port *up = up_to_u8250p(port); ++ unsigned long flags; ++ ++ wait_for_xmitr(up, UART_LSR_THRE); ++ ++ console_atomic_lock(flags); ++ 
serial8250_console_putchar_locked(port, ch); ++ console_atomic_unlock(flags); ++} ++ + /* + * Restore serial console when h/w power-off detected + */ +@@ -3349,6 +3354,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) + serial8250_out_MCR(up, up->mcr | UART_MCR_DTR | UART_MCR_RTS); + } + ++void serial8250_console_write_atomic(struct uart_8250_port *up, ++ const char *s, unsigned int count) ++{ ++ struct uart_port *port = &up->port; ++ unsigned long flags; ++ unsigned int ier; ++ ++ console_atomic_lock(flags); ++ ++ touch_nmi_watchdog(); ++ ++ ier = serial8250_clear_IER(up); ++ ++ if (atomic_fetch_inc(&up->console_printing)) { ++ uart_console_write(port, "\n", 1, ++ serial8250_console_putchar_locked); ++ } ++ uart_console_write(port, s, count, serial8250_console_putchar_locked); ++ atomic_dec(&up->console_printing); ++ ++ wait_for_xmitr(up, BOTH_EMPTY); ++ serial8250_set_IER(up, ier); ++ ++ console_atomic_unlock(flags); ++} ++ + /* + * Print a string to the serial port trying not to disturb + * any possible real use of the port... 
+@@ -3365,24 +3396,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + struct uart_port *port = &up->port; + unsigned long flags; + unsigned int ier; +- int locked = 1; + + touch_nmi_watchdog(); + +- if (oops_in_progress) +- locked = spin_trylock_irqsave(&port->lock, flags); +- else +- spin_lock_irqsave(&port->lock, flags); +- +- /* +- * First save the IER then disable the interrupts +- */ +- ier = serial_port_in(port, UART_IER); ++ spin_lock_irqsave(&port->lock, flags); + +- if (up->capabilities & UART_CAP_UUE) +- serial_port_out(port, UART_IER, UART_IER_UUE); +- else +- serial_port_out(port, UART_IER, 0); ++ ier = serial8250_clear_IER(up); + + /* check scratch reg to see if port powered off during system sleep */ + if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { +@@ -3396,7 +3415,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + mdelay(port->rs485.delay_rts_before_send); + } + ++ atomic_inc(&up->console_printing); + uart_console_write(port, s, count, serial8250_console_putchar); ++ atomic_dec(&up->console_printing); + + /* + * Finally, wait for transmitter to become empty +@@ -3409,8 +3430,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (em485->tx_stopped) + up->rs485_stop_tx(up); + } +- +- serial_port_out(port, UART_IER, ier); ++ serial8250_set_IER(up, ier); + + /* + * The receive handling will happen properly because the +@@ -3422,8 +3442,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, + if (up->msr_saved_flags) + serial8250_modem_status(up); + +- if (locked) +- spin_unlock_irqrestore(&port->lock, flags); ++ spin_unlock_irqrestore(&port->lock, flags); + } + + static unsigned int probe_baud(struct uart_port *port) +@@ -3443,6 +3462,7 @@ static unsigned int probe_baud(struct uart_port *port) + + int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + { ++ struct uart_8250_port *up = 
up_to_u8250p(port); + int baud = 9600; + int bits = 8; + int parity = 'n'; +@@ -3452,6 +3472,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) + if (!port->iobase && !port->membase) + return -ENODEV; + ++ atomic_set(&up->console_printing, 0); ++ + if (options) + uart_parse_options(options, &baud, &parity, &bits, &flow); + else if (probe) +diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c +index b91fe25a64a1..5986658e130b 100644 +--- a/drivers/tty/serial/amba-pl011.c ++++ b/drivers/tty/serial/amba-pl011.c +@@ -2340,18 +2340,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + { + struct uart_amba_port *uap = amba_ports[co->index]; + unsigned int old_cr = 0, new_cr; +- unsigned long flags; ++ unsigned long flags = 0; + int locked = 1; + + clk_enable(uap->clk); + +- local_irq_save(flags); ++ /* ++ * local_irq_save(flags); ++ * ++ * This local_irq_save() is nonsense. If we come in via sysrq ++ * handling then interrupts are already disabled. Aside of ++ * that the port.sysrq check is racy on SMP regardless. 
++ */ + if (uap->port.sysrq) + locked = 0; + else if (oops_in_progress) +- locked = spin_trylock(&uap->port.lock); ++ locked = spin_trylock_irqsave(&uap->port.lock, flags); + else +- spin_lock(&uap->port.lock); ++ spin_lock_irqsave(&uap->port.lock, flags); + + /* + * First save the CR then disable the interrupts +@@ -2377,8 +2383,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) + pl011_write(old_cr, uap, REG_CR); + + if (locked) +- spin_unlock(&uap->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&uap->port.lock, flags); + + clk_disable(uap->clk); + } +diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c +index 0862941862c8..10970632f0e4 100644 +--- a/drivers/tty/serial/omap-serial.c ++++ b/drivers/tty/serial/omap-serial.c +@@ -1255,13 +1255,10 @@ serial_omap_console_write(struct console *co, const char *s, + unsigned int ier; + int locked = 1; + +- local_irq_save(flags); +- if (up->port.sysrq) +- locked = 0; +- else if (oops_in_progress) +- locked = spin_trylock(&up->port.lock); ++ if (up->port.sysrq || oops_in_progress) ++ locked = spin_trylock_irqsave(&up->port.lock, flags); + else +- spin_lock(&up->port.lock); ++ spin_lock_irqsave(&up->port.lock, flags); + + /* + * First save the IER then disable the interrupts +@@ -1288,8 +1285,7 @@ serial_omap_console_write(struct console *co, const char *s, + check_modem_status(up); + + if (locked) +- spin_unlock(&up->port.lock); +- local_irq_restore(flags); ++ spin_unlock_irqrestore(&up->port.lock, flags); + } + + static int __init +diff --git a/drivers/virt/acrn/irqfd.c b/drivers/virt/acrn/irqfd.c +index df5184979b28..d4ad211dce7a 100644 +--- a/drivers/virt/acrn/irqfd.c ++++ b/drivers/virt/acrn/irqfd.c +@@ -17,7 +17,6 @@ + #include "acrn_drv.h" + + static LIST_HEAD(acrn_irqfd_clients); +-static DEFINE_MUTEX(acrn_irqfds_mutex); + + /** + * struct hsm_irqfd - Properties of HSM irqfd +diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c +index 
45cfd50a9521..502b56597f10 100644 +--- a/fs/afs/dir_silly.c ++++ b/fs/afs/dir_silly.c +@@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode) + struct dentry *alias; + int ret; + +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); + +diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c +index 1929e80c09ee..48eb8c30c6db 100644 +--- a/fs/cifs/readdir.c ++++ b/fs/cifs/readdir.c +@@ -69,7 +69,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, + struct inode *inode; + struct super_block *sb = parent->d_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); + +diff --git a/fs/dcache.c b/fs/dcache.c +index cf871a81f4fd..02db80f2817f 100644 +--- a/fs/dcache.c ++++ b/fs/dcache.c +@@ -2537,7 +2537,13 @@ EXPORT_SYMBOL(d_rehash); + + static inline unsigned start_dir_add(struct inode *dir) + { +- ++ /* ++ * The caller has a spinlock_t (dentry::d_lock) acquired which disables ++ * preemption on !PREEMPT_RT. On PREEMPT_RT the lock does not disable ++ * preemption and it has be done explicitly. 
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + for (;;) { + unsigned n = dir->i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) +@@ -2549,25 +2555,30 @@ static inline unsigned start_dir_add(struct inode *dir) + static inline void end_dir_add(struct inode *dir, unsigned n) + { + smp_store_release(&dir->i_dir_seq, n + 2); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + } + + static void d_wait_lookup(struct dentry *dentry) + { +- if (d_in_lookup(dentry)) { +- DECLARE_WAITQUEUE(wait, current); +- add_wait_queue(dentry->d_wait, &wait); +- do { +- set_current_state(TASK_UNINTERRUPTIBLE); +- spin_unlock(&dentry->d_lock); +- schedule(); +- spin_lock(&dentry->d_lock); +- } while (d_in_lookup(dentry)); +- } ++ struct swait_queue __wait; ++ ++ if (!d_in_lookup(dentry)) ++ return; ++ ++ INIT_LIST_HEAD(&__wait.task_list); ++ do { ++ prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); ++ spin_unlock(&dentry->d_lock); ++ schedule(); ++ spin_lock(&dentry->d_lock); ++ } while (d_in_lookup(dentry)); ++ finish_swait(dentry->d_wait, &__wait); + } + + struct dentry *d_alloc_parallel(struct dentry *parent, + const struct qstr *name, +- wait_queue_head_t *wq) ++ struct swait_queue_head *wq) + { + unsigned int hash = name->hash; + struct hlist_bl_head *b = in_lookup_hash(parent, hash); +@@ -2682,7 +2693,7 @@ void __d_lookup_done(struct dentry *dentry) + hlist_bl_lock(b); + dentry->d_flags &= ~DCACHE_PAR_LOOKUP; + __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); +- wake_up_all(dentry->d_wait); ++ swake_up_all(dentry->d_wait); + dentry->d_wait = NULL; + hlist_bl_unlock(b); + INIT_HLIST_NODE(&dentry->d_u.d_alias); +diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h +index c3e4804b8fcb..9edb87e11680 100644 +--- a/fs/fscache/internal.h ++++ b/fs/fscache/internal.h +@@ -81,7 +81,6 @@ extern unsigned fscache_debug; + extern struct kobject *fscache_root; + extern struct workqueue_struct *fscache_object_wq; + 
extern struct workqueue_struct *fscache_op_wq; +-DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + + extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); + +diff --git a/fs/fscache/main.c b/fs/fscache/main.c +index 4207f98e405f..85f8cf3a323d 100644 +--- a/fs/fscache/main.c ++++ b/fs/fscache/main.c +@@ -41,8 +41,6 @@ struct kobject *fscache_root; + struct workqueue_struct *fscache_object_wq; + struct workqueue_struct *fscache_op_wq; + +-DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); +- + /* these values serve as lower bounds, will be adjusted in fscache_init() */ + static unsigned fscache_object_max_active = 4; + static unsigned fscache_op_max_active = 2; +@@ -138,7 +136,6 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) + static int __init fscache_init(void) + { + unsigned int nr_cpus = num_possible_cpus(); +- unsigned int cpu; + int ret; + + fscache_object_max_active = +@@ -161,9 +158,6 @@ static int __init fscache_init(void) + if (!fscache_op_wq) + goto error_op_wq; + +- for_each_possible_cpu(cpu) +- init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); +- + ret = fscache_proc_init(); + if (ret < 0) + goto error_proc; +diff --git a/fs/fscache/object.c b/fs/fscache/object.c +index 6a675652129b..7a972d144b54 100644 +--- a/fs/fscache/object.c ++++ b/fs/fscache/object.c +@@ -798,6 +798,8 @@ void fscache_object_destroy(struct fscache_object *object) + } + EXPORT_SYMBOL(fscache_object_destroy); + ++static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); ++ + /* + * enqueue an object for metadata-type processing + */ +@@ -806,16 +808,12 @@ void fscache_enqueue_object(struct fscache_object *object) + _enter("{OBJ%x}", object->debug_id); + + if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { +- wait_queue_head_t *cong_wq = +- &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) 
+- wake_up(cong_wq); ++ wake_up(&fscache_object_cong_wait); + } else + fscache_put_object(object, fscache_obj_put_queue); +- +- put_cpu_var(fscache_object_cong_wait); + } + } + +@@ -833,16 +831,15 @@ void fscache_enqueue_object(struct fscache_object *object) + */ + bool fscache_object_sleep_till_congested(signed long *timeoutp) + { +- wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + +- add_wait_queue_exclusive(cong_wq, &wait); ++ add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); +- finish_wait(cong_wq, &wait); ++ finish_wait(&fscache_object_cong_wait, &wait); + + return fscache_object_congested(); + } +diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c +index d5294e663df5..ee8846818b34 100644 +--- a/fs/fuse/readdir.c ++++ b/fs/fuse/readdir.c +@@ -160,7 +160,7 @@ static int fuse_direntplus_link(struct file *file, + struct inode *dir = d_inode(parent); + struct fuse_conn *fc; + struct inode *inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (!o->nodeid) { + /* +diff --git a/fs/namei.c b/fs/namei.c +index 02e99606c65b..c1d11a2e7fa3 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -1635,7 +1635,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, + { + struct dentry *dentry, *old; + struct inode *inode = dir->d_inode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + /* Don't go there if it's already dead */ + if (unlikely(IS_DEADDIR(inode))) +@@ -3305,7 +3305,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, + struct dentry *dentry; + int error, create_error = 0; + umode_t mode = op->mode; +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + + if (unlikely(IS_DEADDIR(dir_inode))) + return ERR_PTR(-ENOENT); +diff --git a/fs/namespace.c 
b/fs/namespace.c +index 1a9df6afb90b..373b0e738997 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -344,8 +344,24 @@ int __mnt_want_write(struct vfsmount *m) + * incremented count after it has set MNT_WRITE_HOLD. + */ + smp_mb(); +- while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) +- cpu_relax(); ++ might_lock(&mount_lock.lock); ++ while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ cpu_relax(); ++ } else { ++ /* ++ * This prevents priority inversion, if the task ++ * setting MNT_WRITE_HOLD got preempted on a remote ++ * CPU, and it prevents life lock if the task setting ++ * MNT_WRITE_HOLD has a lower priority and is bound to ++ * the same CPU as the task that is spinning here. ++ */ ++ preempt_enable(); ++ lock_mount_hash(); ++ unlock_mount_hash(); ++ preempt_disable(); ++ } ++ } + /* + * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will + * be set to match its requirements. So we must not load that until +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 32c3d0c454b1..b8ff452317e6 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -637,7 +637,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, + unsigned long dir_verifier) + { + struct qstr filename = QSTR_INIT(entry->name, entry->len); +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct dentry *dentry; + struct dentry *alias; + struct inode *inode; +@@ -1873,7 +1873,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, + struct file *file, unsigned open_flags, + umode_t mode) + { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + struct nfs_open_context *ctx; + struct dentry *res; + struct iattr attr = { .ia_valid = ATTR_OPEN }; +diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c +index d5ccf095b2a7..0944c068f5cb 100644 +--- a/fs/nfs/unlink.c ++++ b/fs/nfs/unlink.c +@@ -13,7 +13,7 @@ + #include + #include + #include +-#include ++#include + 
#include + #include + +@@ -184,7 +184,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) + + data->cred = get_current_cred(); + data->res.dir_attr = &data->dir_attr; +- init_waitqueue_head(&data->wq); ++ init_swait_queue_head(&data->wq); + + status = -EBUSY; + spin_lock(&dentry->d_lock); +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 300d53ee7040..6ab25d4d4037 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -96,6 +96,7 @@ + #include + #include + #include ++#include + #include + #include + #include "internal.h" +@@ -2071,7 +2072,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, + + child = d_hash_and_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + goto end_instantiate; +diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c +index 0b7a00ed6c49..a7828fce675a 100644 +--- a/fs/proc/proc_sysctl.c ++++ b/fs/proc/proc_sysctl.c +@@ -679,7 +679,7 @@ static bool proc_sys_fill_cache(struct file *file, + + child = d_lookup(dir, &qname); + if (!child) { +- DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); ++ DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); + child = d_alloc_parallel(dir, &qname, &wq); + if (IS_ERR(child)) + return false; +diff --git a/include/asm-generic/softirq_stack.h b/include/asm-generic/softirq_stack.h +index eceeecf6a5bd..d3e2d81656e0 100644 +--- a/include/asm-generic/softirq_stack.h ++++ b/include/asm-generic/softirq_stack.h +@@ -2,7 +2,7 @@ + #ifndef __ASM_GENERIC_SOFTIRQ_STACK_H + #define __ASM_GENERIC_SOFTIRQ_STACK_H + +-#ifdef CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK ++#if defined(CONFIG_HAVE_SOFTIRQ_ON_OWN_STACK) && !defined(CONFIG_PREEMPT_RT) + void do_softirq_own_stack(void); + #else + static inline void do_softirq_own_stack(void) +diff --git a/include/linux/console.h b/include/linux/console.h +index a97f277cfdfa..487a4266ab2c 100644 +--- a/include/linux/console.h ++++ 
b/include/linux/console.h +@@ -16,6 +16,13 @@ + + #include + #include ++#include ++#include ++ ++struct latched_seq { ++ seqcount_latch_t latch; ++ u64 val[2]; ++}; + + struct vc_data; + struct console_font_op; +@@ -136,10 +143,12 @@ static inline int con_debug_leave(void) + #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ + #define CON_BRL (32) /* Used for a braille device */ + #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ ++#define CON_HANDOVER (128) /* Device was previously a boot console. */ + + struct console { + char name[16]; + void (*write)(struct console *, const char *, unsigned); ++ void (*write_atomic)(struct console *co, const char *s, unsigned int count); + int (*read)(struct console *, char *, unsigned); + struct tty_driver *(*device)(struct console *, int *); + void (*unblank)(void); +@@ -149,6 +158,16 @@ struct console { + short flags; + short index; + int cflag; ++#ifdef CONFIG_PRINTK ++ char sync_buf[CONSOLE_LOG_MAX]; ++ struct latched_seq printk_seq; ++ struct latched_seq printk_sync_seq; ++#ifdef CONFIG_HAVE_NMI ++ struct latched_seq printk_sync_nmi_seq; ++#endif ++#endif /* CONFIG_PRINTK */ ++ ++ struct task_struct *thread; + uint ispeed; + uint ospeed; + void *data; +diff --git a/include/linux/dcache.h b/include/linux/dcache.h +index 9e23d33bb6f1..9f89d4887e35 100644 +--- a/include/linux/dcache.h ++++ b/include/linux/dcache.h +@@ -108,7 +108,7 @@ struct dentry { + + union { + struct list_head d_lru; /* LRU list */ +- wait_queue_head_t *d_wait; /* in-lookup ones only */ ++ struct swait_queue_head *d_wait; /* in-lookup ones only */ + }; + struct list_head d_child; /* child of parent list */ + struct list_head d_subdirs; /* our children */ +@@ -240,7 +240,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op + extern struct dentry * d_alloc(struct dentry *, const struct qstr *); + extern struct dentry * d_alloc_anon(struct super_block *); + extern struct dentry * 
d_alloc_parallel(struct dentry *, const struct qstr *, +- wait_queue_head_t *); ++ struct swait_queue_head *); + extern struct dentry * d_splice_alias(struct inode *, struct dentry *); + extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); + extern struct dentry * d_exact_alias(struct dentry *, struct inode *); +diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h +index 2e2b8d6140ed..71064a2c2caf 100644 +--- a/include/linux/entry-common.h ++++ b/include/linux/entry-common.h +@@ -57,9 +57,15 @@ + # define ARCH_EXIT_TO_USER_MODE_WORK (0) + #endif + ++#ifdef CONFIG_PREEMPT_LAZY ++# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) ++#else ++# define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED) ++#endif ++ + #define EXIT_TO_USER_MODE_WORK \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ++ _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + ARCH_EXIT_TO_USER_MODE_WORK) + + /** +diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h +index ec2a47a81e42..8cd11a223260 100644 +--- a/include/linux/irq_work.h ++++ b/include/linux/irq_work.h +@@ -3,6 +3,7 @@ + #define _LINUX_IRQ_WORK_H + + #include ++#include + + /* + * An entry can be in one of four states: +@@ -16,11 +17,13 @@ + struct irq_work { + struct __call_single_node node; + void (*func)(struct irq_work *); ++ struct rcuwait irqwait; + }; + + #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ + .node = { .u_flags = (_flags), }, \ + .func = (_func), \ ++ .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ + } + + #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) +@@ -46,6 +49,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) + return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; + } + ++static inline bool irq_work_is_hard(struct irq_work *work) ++{ ++ return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; ++} 
++ + bool irq_work_queue(struct irq_work *work); + bool irq_work_queue_on(struct irq_work *work, int cpu); + +diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h +index 59aea39785bf..d69b819b53e0 100644 +--- a/include/linux/irqdesc.h ++++ b/include/linux/irqdesc.h +@@ -160,6 +160,7 @@ static inline void generic_handle_irq_desc(struct irq_desc *desc) + + int handle_irq_desc(struct irq_desc *desc); + int generic_handle_irq(unsigned int irq); ++int generic_handle_irq_safe(unsigned int irq); + + #ifdef CONFIG_IRQ_DOMAIN + /* +diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h +index 747f40e0c326..5ec0fa71399e 100644 +--- a/include/linux/irqflags.h ++++ b/include/linux/irqflags.h +@@ -71,14 +71,6 @@ do { \ + do { \ + __this_cpu_dec(hardirq_context); \ + } while (0) +-# define lockdep_softirq_enter() \ +-do { \ +- current->softirq_context++; \ +-} while (0) +-# define lockdep_softirq_exit() \ +-do { \ +- current->softirq_context--; \ +-} while (0) + + # define lockdep_hrtimer_enter(__hrtimer) \ + ({ \ +@@ -140,6 +132,21 @@ do { \ + # define lockdep_irq_work_exit(__work) do { } while (0) + #endif + ++#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) ++# define lockdep_softirq_enter() \ ++do { \ ++ current->softirq_context++; \ ++} while (0) ++# define lockdep_softirq_exit() \ ++do { \ ++ current->softirq_context--; \ ++} while (0) ++ ++#else ++# define lockdep_softirq_enter() do { } while (0) ++# define lockdep_softirq_exit() do { } while (0) ++#endif ++ + #if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) + extern void stop_critical_timings(void); +diff --git a/include/linux/kernel.h b/include/linux/kernel.h +index f56cd8879a59..49f1e924b6e6 100644 +--- a/include/linux/kernel.h ++++ b/include/linux/kernel.h +@@ -111,8 +111,8 @@ static __always_inline void might_resched(void) + #endif /* CONFIG_PREEMPT_* */ + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP +-extern void ___might_sleep(const char *file, int line, int 
preempt_offset); +-extern void __might_sleep(const char *file, int line, int preempt_offset); ++extern void __might_resched(const char *file, int line, unsigned int offsets); ++extern void __might_sleep(const char *file, int line); + extern void __cant_sleep(const char *file, int line, int preempt_offset); + extern void __cant_migrate(const char *file, int line); + +@@ -129,7 +129,7 @@ extern void __cant_migrate(const char *file, int line); + * supposed to. + */ + # define might_sleep() \ +- do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) ++ do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0) + /** + * cant_sleep - annotation for functions that cannot sleep + * +@@ -168,10 +168,9 @@ extern void __cant_migrate(const char *file, int line); + */ + # define non_block_end() WARN_ON(current->non_block_count-- == 0) + #else +- static inline void ___might_sleep(const char *file, int line, +- int preempt_offset) { } +- static inline void __might_sleep(const char *file, int line, +- int preempt_offset) { } ++ static inline void __might_resched(const char *file, int line, ++ unsigned int offsets) { } ++static inline void __might_sleep(const char *file, int line) { } + # define might_sleep() do { might_resched(); } while (0) + # define cant_sleep() do { } while (0) + # define cant_migrate() do { } while (0) +diff --git a/include/linux/kgdb.h b/include/linux/kgdb.h +index 258cdde8d356..9bca0d98db5a 100644 +--- a/include/linux/kgdb.h ++++ b/include/linux/kgdb.h +@@ -212,6 +212,8 @@ extern void kgdb_call_nmi_hook(void *ignored); + */ + extern void kgdb_roundup_cpus(void); + ++extern void kgdb_roundup_cpu(unsigned int cpu); ++ + /** + * kgdb_arch_set_pc - Generic call back to the program counter + * @regs: Current &struct pt_regs. 
+@@ -365,5 +367,6 @@ extern void kgdb_free_init_mem(void); + #define dbg_late_init() + static inline void kgdb_panic(const char *msg) {} + static inline void kgdb_free_init_mem(void) { } ++static inline void kgdb_roundup_cpu(unsigned int cpu) {} + #endif /* ! CONFIG_KGDB */ + #endif /* _KGDB_H_ */ +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 7f8ee09c711f..e9672de22cf2 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -572,6 +573,9 @@ struct mm_struct { + bool tlb_flush_batched; + #endif + struct uprobes_state uprobes_state; ++#ifdef CONFIG_PREEMPT_RT ++ struct rcu_head delayed_drop; ++#endif + #ifdef CONFIG_HUGETLB_PAGE + atomic_long_t hugetlb_usage; + #endif +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index c0a4589ab706..0f9dadec3b46 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -1941,7 +1941,6 @@ enum netdev_ml_priv_type { + * @sfp_bus: attached &struct sfp_bus structure. 
+ * + * @qdisc_tx_busylock: lockdep class annotating Qdisc->busylock spinlock +- * @qdisc_running_key: lockdep class annotating Qdisc->running seqcount + * + * @proto_down: protocol port state information can be sent to the + * switch driver and used to set the phys state of the +@@ -2272,7 +2271,6 @@ struct net_device { + struct phy_device *phydev; + struct sfp_bus *sfp_bus; + struct lock_class_key *qdisc_tx_busylock; +- struct lock_class_key *qdisc_running_key; + bool proto_down; + unsigned wol_enabled:1; + unsigned threaded:1; +@@ -2382,13 +2380,11 @@ static inline void netdev_for_each_tx_queue(struct net_device *dev, + #define netdev_lockdep_set_classes(dev) \ + { \ + static struct lock_class_key qdisc_tx_busylock_key; \ +- static struct lock_class_key qdisc_running_key; \ + static struct lock_class_key qdisc_xmit_lock_key; \ + static struct lock_class_key dev_addr_list_lock_key; \ + unsigned int i; \ + \ + (dev)->qdisc_tx_busylock = &qdisc_tx_busylock_key; \ +- (dev)->qdisc_running_key = &qdisc_running_key; \ + lockdep_set_class(&(dev)->addr_list_lock, \ + &dev_addr_list_lock_key); \ + for (i = 0; i < (dev)->num_tx_queues; i++) \ +diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h +index 7fcd56c6ded6..2ccb84f15aa3 100644 +--- a/include/linux/nfs_xdr.h ++++ b/include/linux/nfs_xdr.h +@@ -1692,7 +1692,7 @@ struct nfs_unlinkdata { + struct nfs_removeargs args; + struct nfs_removeres res; + struct dentry *dentry; +- wait_queue_head_t wq; ++ struct swait_queue_head wq; + const struct cred *cred; + struct nfs_fattr dir_attr; + long timeout; +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index 4d244e295e85..3da73c968211 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -122,9 +122,10 @@ + * The preempt_count offset after spin_lock() + */ + #if !defined(CONFIG_PREEMPT_RT) +-#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET ++#define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET + #else +-#define PREEMPT_LOCK_OFFSET 0 
++/* Locks on RT do not disable preemption */ ++#define PREEMPT_LOCK_OFFSET 0 + #endif + + /* +@@ -174,6 +175,20 @@ extern void preempt_count_sub(int val); + #define preempt_count_inc() preempt_count_add(1) + #define preempt_count_dec() preempt_count_sub(1) + ++#ifdef CONFIG_PREEMPT_LAZY ++#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) ++#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) ++#define inc_preempt_lazy_count() add_preempt_lazy_count(1) ++#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) ++#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) ++#else ++#define add_preempt_lazy_count(val) do { } while (0) ++#define sub_preempt_lazy_count(val) do { } while (0) ++#define inc_preempt_lazy_count() do { } while (0) ++#define dec_preempt_lazy_count() do { } while (0) ++#define preempt_lazy_count() (0) ++#endif ++ + #ifdef CONFIG_PREEMPT_COUNT + + #define preempt_disable() \ +@@ -182,13 +197,25 @@ do { \ + barrier(); \ + } while (0) + ++#define preempt_lazy_disable() \ ++do { \ ++ inc_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define sched_preempt_enable_no_resched() \ + do { \ + barrier(); \ + preempt_count_dec(); \ + } while (0) + +-#define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++#ifndef CONFIG_PREEMPT_RT ++# define preempt_enable_no_resched() sched_preempt_enable_no_resched() ++# define preempt_check_resched_rt() barrier(); ++#else ++# define preempt_enable_no_resched() preempt_enable() ++# define preempt_check_resched_rt() preempt_check_resched() ++#endif + + #define preemptible() (preempt_count() == 0 && !irqs_disabled()) + +@@ -213,6 +240,18 @@ do { \ + __preempt_schedule(); \ + } while (0) + ++/* ++ * open code preempt_check_resched() because it is not exported to modules and ++ * used by local_unlock() or bpf_enable_instrumentation(). 
++ */ ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++ if (should_resched(0)) \ ++ __preempt_schedule(); \ ++} while (0) ++ + #else /* !CONFIG_PREEMPTION */ + #define preempt_enable() \ + do { \ +@@ -220,6 +259,12 @@ do { \ + preempt_count_dec(); \ + } while (0) + ++#define preempt_lazy_enable() \ ++do { \ ++ dec_preempt_lazy_count(); \ ++ barrier(); \ ++} while (0) ++ + #define preempt_enable_notrace() \ + do { \ + barrier(); \ +@@ -258,8 +303,12 @@ do { \ + #define preempt_disable_notrace() barrier() + #define preempt_enable_no_resched_notrace() barrier() + #define preempt_enable_notrace() barrier() ++#define preempt_check_resched_rt() barrier() + #define preemptible() 0 + ++#define preempt_lazy_disable() barrier() ++#define preempt_lazy_enable() barrier() ++ + #endif /* CONFIG_PREEMPT_COUNT */ + + #ifdef MODULE +@@ -278,7 +327,7 @@ do { \ + } while (0) + #define preempt_fold_need_resched() \ + do { \ +- if (tif_need_resched()) \ ++ if (tif_need_resched_now()) \ + set_preempt_need_resched(); \ + } while (0) + +@@ -394,8 +443,15 @@ extern void migrate_enable(void); + + #else + +-static inline void migrate_disable(void) { } +-static inline void migrate_enable(void) { } ++static inline void migrate_disable(void) ++{ ++ preempt_lazy_disable(); ++} ++ ++static inline void migrate_enable(void) ++{ ++ preempt_lazy_enable(); ++} + + #endif /* CONFIG_SMP */ + +diff --git a/include/linux/printk.h b/include/linux/printk.h +index 9497f6b98339..eddfc5de6ee7 100644 +--- a/include/linux/printk.h ++++ b/include/linux/printk.h +@@ -47,6 +47,12 @@ static inline const char *printk_skip_headers(const char *buffer) + + #define CONSOLE_EXT_LOG_MAX 8192 + ++/* ++ * The maximum size of a record formatted for console printing ++ * (i.e. with the prefix prepended to every line). ++ */ ++#define CONSOLE_LOG_MAX 1024 ++ + /* printk's without a loglevel use this.. 
*/ + #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT + +@@ -155,6 +161,8 @@ int vprintk(const char *fmt, va_list args); + asmlinkage __printf(1, 2) __cold + int _printk(const char *fmt, ...); + ++bool pr_flush(int timeout_ms, bool reset_on_progress); ++ + /* + * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! + */ +@@ -224,6 +232,11 @@ static inline void printk_deferred_exit(void) + { + } + ++static inline bool pr_flush(int timeout_ms, bool reset_on_progress) ++{ ++ return true; ++} ++ + static inline int printk_ratelimit(void) + { + return 0; +@@ -284,17 +297,30 @@ static inline void printk_trigger_flush(void) + extern int __printk_cpu_trylock(void); + extern void __printk_wait_on_cpu_lock(void); + extern void __printk_cpu_unlock(void); ++extern bool kgdb_roundup_delay(unsigned int cpu); ++ ++#else ++ ++#define __printk_cpu_trylock() 1 ++#define __printk_wait_on_cpu_lock() ++#define __printk_cpu_unlock() ++ ++static inline bool kgdb_roundup_delay(unsigned int cpu) ++{ ++ return false; ++} ++#endif /* CONFIG_SMP */ + + /** +- * printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning +- * lock and disable interrupts. ++ * raw_printk_cpu_lock_irqsave() - Acquire the printk cpu-reentrant spinning ++ * lock and disable interrupts. + * @flags: Stack-allocated storage for saving local interrupt state, +- * to be passed to printk_cpu_unlock_irqrestore(). ++ * to be passed to raw_printk_cpu_unlock_irqrestore(). + * + * If the lock is owned by another CPU, spin until it becomes available. + * Interrupts are restored while spinning. + */ +-#define printk_cpu_lock_irqsave(flags) \ ++#define raw_printk_cpu_lock_irqsave(flags) \ + for (;;) { \ + local_irq_save(flags); \ + if (__printk_cpu_trylock()) \ +@@ -304,22 +330,30 @@ extern void __printk_cpu_unlock(void); + } + + /** +- * printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant spinning +- * lock and restore interrupts. 
+- * @flags: Caller's saved interrupt state, from printk_cpu_lock_irqsave(). ++ * raw_printk_cpu_unlock_irqrestore() - Release the printk cpu-reentrant ++ * spinning lock and restore interrupts. ++ * @flags: Caller's saved interrupt state from raw_printk_cpu_lock_irqsave(). + */ +-#define printk_cpu_unlock_irqrestore(flags) \ ++#define raw_printk_cpu_unlock_irqrestore(flags) \ + do { \ + __printk_cpu_unlock(); \ + local_irq_restore(flags); \ +- } while (0) \ +- +-#else ++ } while (0) + +-#define printk_cpu_lock_irqsave(flags) ((void)flags) +-#define printk_cpu_unlock_irqrestore(flags) ((void)flags) ++/* ++ * Used to synchronize atomic consoles. ++ * ++ * The same as raw_printk_cpu_lock_irqsave() except that hardware interrupts ++ * are _not_ restored while spinning. ++ */ ++#define console_atomic_lock(flags) \ ++ do { \ ++ local_irq_save(flags); \ ++ while (!__printk_cpu_trylock()) \ ++ cpu_relax(); \ ++ } while (0) + +-#endif /* CONFIG_SMP */ ++#define console_atomic_unlock raw_printk_cpu_unlock_irqrestore + + extern int kptr_restrict; + +diff --git a/include/linux/ratelimit_types.h b/include/linux/ratelimit_types.h +index f0e535f199be..002266693e50 100644 +--- a/include/linux/ratelimit_types.h ++++ b/include/linux/ratelimit_types.h +@@ -4,7 +4,7 @@ + + #include + #include +-#include ++#include + + #define DEFAULT_RATELIMIT_INTERVAL (5 * HZ) + #define DEFAULT_RATELIMIT_BURST 10 +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index 13bddb841ceb..e33445348eb0 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -94,6 +94,13 @@ void rcu_init_tasks_generic(void); + static inline void rcu_init_tasks_generic(void) { } + #endif + ++#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_TASKS_RCU_GENERIC) ++void rcu_tasks_initiate_self_tests(void); ++#else ++static inline void rcu_tasks_initiate_self_tests(void) {} ++#endif ++ ++ + #ifdef CONFIG_RCU_STALL_COMMON + void rcu_sysrq_start(void); + void rcu_sysrq_end(void); +diff --git 
a/include/linux/rtmutex.h b/include/linux/rtmutex.h +index 9deedfeec2b1..7d049883a08a 100644 +--- a/include/linux/rtmutex.h ++++ b/include/linux/rtmutex.h +@@ -99,13 +99,22 @@ extern void __rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + extern void rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass); ++extern void _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock); + #define rt_mutex_lock(lock) rt_mutex_lock_nested(lock, 0) ++#define rt_mutex_lock_nest_lock(lock, nest_lock) \ ++ do { \ ++ typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ ++ _rt_mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ ++ } while (0) ++ + #else + extern void rt_mutex_lock(struct rt_mutex *lock); + #define rt_mutex_lock_nested(lock, subclass) rt_mutex_lock(lock) ++#define rt_mutex_lock_nest_lock(lock, nest_lock) rt_mutex_lock(lock) + #endif + + extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); ++extern int rt_mutex_lock_killable(struct rt_mutex *lock); + extern int rt_mutex_trylock(struct rt_mutex *lock); + + extern void rt_mutex_unlock(struct rt_mutex *lock); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 7c17742d359c..2cdeb099d3c9 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -118,12 +118,8 @@ struct task_group; + + #define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING) + +-#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0) +- + #define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0) + +-#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0) +- + /* + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). 
+@@ -1084,6 +1080,10 @@ struct task_struct { + /* Restored if set_restore_sigmask() was used: */ + sigset_t saved_sigmask; + struct sigpending pending; ++#ifdef CONFIG_PREEMPT_RT ++ /* TODO: move me into ->restart_block ? */ ++ struct kernel_siginfo forced_info; ++#endif + unsigned long sas_ss_sp; + size_t sas_ss_size; + unsigned int sas_ss_flags; +@@ -1738,6 +1738,16 @@ static __always_inline bool is_percpu_thread(void) + #endif + } + ++/* Is the current task guaranteed to stay on its current CPU? */ ++static inline bool is_migratable(void) ++{ ++#ifdef CONFIG_SMP ++ return preemptible() && !current->migration_disabled; ++#else ++ return false; ++#endif ++} ++ + /* Per-process atomic flags. */ + #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ + #define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ +@@ -2013,6 +2023,118 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) ++{ ++ return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); ++} ++ ++static inline int need_resched_lazy(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED_LAZY); ++} ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#else ++static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } ++static inline int need_resched_lazy(void) { return 0; } ++ ++static inline int need_resched_now(void) ++{ ++ return test_thread_flag(TIF_NEED_RESCHED); ++} ++ ++#endif ++ ++#ifdef CONFIG_PREEMPT_RT ++static inline bool task_match_saved_state(struct task_struct 
*p, long match_state) ++{ ++ return p->saved_state == match_state; ++} ++ ++static inline bool task_is_traced(struct task_struct *task) ++{ ++ bool traced = false; ++ ++ /* in case the task is sleeping on tasklist_lock */ ++ raw_spin_lock_irq(&task->pi_lock); ++ if (READ_ONCE(task->__state) & __TASK_TRACED) ++ traced = true; ++ else if (task->saved_state & __TASK_TRACED) ++ traced = true; ++ raw_spin_unlock_irq(&task->pi_lock); ++ return traced; ++} ++ ++static inline bool task_is_stopped_or_traced(struct task_struct *task) ++{ ++ bool traced_stopped = false; ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ ++ if (READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) ++ traced_stopped = true; ++ else if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) ++ traced_stopped = true; ++ ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ return traced_stopped; ++} ++ ++#else ++ ++static inline bool task_match_saved_state(struct task_struct *p, long match_state) ++{ ++ return false; ++} ++ ++static inline bool task_is_traced(struct task_struct *task) ++{ ++ return READ_ONCE(task->__state) & __TASK_TRACED; ++} ++ ++static inline bool task_is_stopped_or_traced(struct task_struct *task) ++{ ++ return READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED); ++} ++#endif ++ ++static inline bool task_match_state_or_saved(struct task_struct *p, ++ long match_state) ++{ ++ if (READ_ONCE(p->__state) == match_state) ++ return true; ++ ++ return task_match_saved_state(p, match_state); ++} ++ ++static inline bool task_match_state_lock(struct task_struct *p, ++ long match_state) ++{ ++ bool match; ++ ++ raw_spin_lock_irq(&p->pi_lock); ++ match = task_match_state_or_saved(p, match_state); ++ raw_spin_unlock_irq(&p->pi_lock); ++ ++ return match; ++} ++ + /* + * cond_resched() and cond_resched_lock(): latency reduction via + * explicit rescheduling in places that are safe. 
The return +@@ -2047,7 +2169,7 @@ static inline int _cond_resched(void) { return 0; } + #endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */ + + #define cond_resched() ({ \ +- ___might_sleep(__FILE__, __LINE__, 0); \ ++ __might_resched(__FILE__, __LINE__, 0); \ + _cond_resched(); \ + }) + +@@ -2055,19 +2177,38 @@ extern int __cond_resched_lock(spinlock_t *lock); + extern int __cond_resched_rwlock_read(rwlock_t *lock); + extern int __cond_resched_rwlock_write(rwlock_t *lock); + +-#define cond_resched_lock(lock) ({ \ +- ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ +- __cond_resched_lock(lock); \ ++#define MIGHT_RESCHED_RCU_SHIFT 8 ++#define MIGHT_RESCHED_PREEMPT_MASK ((1U << MIGHT_RESCHED_RCU_SHIFT) - 1) ++ ++#ifndef CONFIG_PREEMPT_RT ++/* ++ * Non RT kernels have an elevated preempt count due to the held lock, ++ * but are not allowed to be inside a RCU read side critical section ++ */ ++# define PREEMPT_LOCK_RESCHED_OFFSETS PREEMPT_LOCK_OFFSET ++#else ++/* ++ * spin/rw_lock() on RT implies rcu_read_lock(). The might_sleep() check in ++ * cond_resched*lock() has to take that into account because it checks for ++ * preempt_count() and rcu_preempt_depth(). 
++ */ ++# define PREEMPT_LOCK_RESCHED_OFFSETS \ ++ (PREEMPT_LOCK_OFFSET + (1U << MIGHT_RESCHED_RCU_SHIFT)) ++#endif ++ ++#define cond_resched_lock(lock) ({ \ ++ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ ++ __cond_resched_lock(lock); \ + }) + +-#define cond_resched_rwlock_read(lock) ({ \ +- __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ +- __cond_resched_rwlock_read(lock); \ ++#define cond_resched_rwlock_read(lock) ({ \ ++ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ ++ __cond_resched_rwlock_read(lock); \ + }) + +-#define cond_resched_rwlock_write(lock) ({ \ +- __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ +- __cond_resched_rwlock_write(lock); \ ++#define cond_resched_rwlock_write(lock) ({ \ ++ __might_resched(__FILE__, __LINE__, PREEMPT_LOCK_RESCHED_OFFSETS); \ ++ __cond_resched_rwlock_write(lock); \ + }) + + static inline void cond_resched_rcu(void) +diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h +index 95fb7aaaec8d..28e9cc60f47e 100644 +--- a/include/linux/sched/mm.h ++++ b/include/linux/sched/mm.h +@@ -49,6 +49,26 @@ static inline void mmdrop(struct mm_struct *mm) + __mmdrop(mm); + } + ++#ifdef CONFIG_PREEMPT_RT ++extern void __mmdrop_delayed(struct rcu_head *rhp); ++ ++/* ++ * Invoked from finish_task_switch(). Delegates the heavy lifting on RT ++ * kernels via RCU. ++ */ ++static inline void mmdrop_sched(struct mm_struct *mm) ++{ ++ /* Provides a full memory barrier. See mmdrop() */ ++ if (atomic_dec_and_test(&mm->mm_count)) ++ call_rcu(&mm->delayed_drop, __mmdrop_delayed); ++} ++#else ++static inline void mmdrop_sched(struct mm_struct *mm) ++{ ++ mmdrop(mm); ++} ++#endif ++ + /** + * mmget() - Pin the address space associated with a &struct mm_struct. + * @mm: The address space to pin. 
+diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h +index 68abc6bdd891..dfe81e08e143 100644 +--- a/include/linux/serial_8250.h ++++ b/include/linux/serial_8250.h +@@ -7,6 +7,7 @@ + #ifndef _LINUX_SERIAL_8250_H + #define _LINUX_SERIAL_8250_H + ++#include + #include + #include + #include +@@ -126,6 +127,8 @@ struct uart_8250_port { + #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA + unsigned char msr_saved_flags; + ++ atomic_t console_printing; ++ + struct uart_8250_dma *dma; + const struct uart_8250_ops *ops; + +@@ -181,6 +184,8 @@ void serial8250_init_port(struct uart_8250_port *up); + void serial8250_set_defaults(struct uart_8250_port *up); + void serial8250_console_write(struct uart_8250_port *up, const char *s, + unsigned int count); ++void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, ++ unsigned int count); + int serial8250_console_setup(struct uart_port *port, char *options, bool probe); + int serial8250_console_exit(struct uart_port *port); + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 7ed1d4472c0c..6ac2df270a97 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -301,6 +301,7 @@ struct sk_buff_head { + + __u32 qlen; + spinlock_t lock; ++ raw_spinlock_t raw_lock; + }; + + struct sk_buff; +@@ -1993,6 +1994,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) + __skb_queue_head_init(list); + } + ++static inline void skb_queue_head_init_raw(struct sk_buff_head *list) ++{ ++ raw_spin_lock_init(&list->raw_lock); ++ __skb_queue_head_init(list); ++} ++ + static inline void skb_queue_head_init_class(struct sk_buff_head *list, + struct lock_class_key *class) + { +diff --git a/include/linux/smp.h b/include/linux/smp.h +index 510519e8a1eb..7ac9fdb5ad09 100644 +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -268,6 +268,9 @@ static inline int get_boot_cpu_id(void) + #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) + #define put_cpu() 
preempt_enable() + ++#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) ++#define put_cpu_light() migrate_enable() ++ + /* + * Callback to arch code if there's nosmp or maxcpus=0 on the + * boot command line: +diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h +index c09b6407ae1b..7f86a2016ac5 100644 +--- a/include/linux/spinlock_types_up.h ++++ b/include/linux/spinlock_types_up.h +@@ -1,7 +1,7 @@ + #ifndef __LINUX_SPINLOCK_TYPES_UP_H + #define __LINUX_SPINLOCK_TYPES_UP_H + +-#ifndef __LINUX_SPINLOCK_TYPES_H ++#ifndef __LINUX_SPINLOCK_TYPES_RAW_H + # error "please don't include this file directly" + #endif + +diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h +index 9a073535c0bd..0536fbba7f69 100644 +--- a/include/linux/thread_info.h ++++ b/include/linux/thread_info.h +@@ -177,7 +177,17 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti + clear_ti_thread_flag(task_thread_info(t), TIF_##fl) + #endif /* !CONFIG_GENERIC_ENTRY */ + +-#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#ifdef CONFIG_PREEMPT_LAZY ++#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ ++ test_thread_flag(TIF_NEED_RESCHED_LAZY)) ++#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) ++#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY) ++ ++#else ++#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) ++#define tif_need_resched_lazy() 0 ++#endif + + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES + static inline int arch_within_stack_frames(const void * const stack, +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index ff137179e0c3..54fe3b1a638d 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -69,6 +69,7 @@ struct trace_entry { + unsigned char flags; + unsigned char preempt_count; + int pid; 
++ unsigned char preempt_lazy_count; + }; + + #define TRACE_EVENT_TYPE_MAX \ +@@ -158,9 +159,10 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, + unsigned int trace_ctx) + { + entry->preempt_count = trace_ctx & 0xff; ++ entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; + entry->pid = current->pid; + entry->type = type; +- entry->flags = trace_ctx >> 16; ++ entry->flags = trace_ctx >> 24; + } + + unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); +@@ -173,6 +175,7 @@ enum trace_flag_type { + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, + }; + + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h +index e81856c0ba13..81dc1f5e181a 100644 +--- a/include/linux/u64_stats_sync.h ++++ b/include/linux/u64_stats_sync.h +@@ -66,7 +66,7 @@ + #include + + struct u64_stats_sync { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + seqcount_t seq; + #endif + }; +@@ -83,6 +83,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) + return local64_read(&p->v); + } + ++static inline void u64_stats_set(u64_stats_t *p, u64 val) ++{ ++ local64_set(&p->v, val); ++} ++ + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) + { + local64_add(val, &p->v); +@@ -104,6 +109,11 @@ static inline u64 u64_stats_read(const u64_stats_t *p) + return p->v; + } + ++static inline void u64_stats_set(u64_stats_t *p, u64 val) ++{ ++ p->v = val; ++} ++ + static inline void u64_stats_add(u64_stats_t *p, unsigned long val) + { + p->v += val; +@@ -115,7 +125,7 @@ static inline void u64_stats_inc(u64_stats_t *p) + } + #endif + +-#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) + #else + 
static inline void u64_stats_init(struct u64_stats_sync *syncp) +@@ -125,15 +135,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) + + static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); + write_seqcount_begin(&syncp->seq); + #endif + } + + static inline void u64_stats_update_end(struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); + #endif + } + +@@ -142,8 +156,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) + { + unsigned long flags = 0; + +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +- local_irq_save(flags); ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_disable(); ++ else ++ local_irq_save(flags); + write_seqcount_begin(&syncp->seq); + #endif + return flags; +@@ -153,15 +170,18 @@ static inline void + u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, + unsigned long flags) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + write_seqcount_end(&syncp->seq); +- local_irq_restore(flags); ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ preempt_enable(); ++ else ++ local_irq_restore(flags); + #endif + } + + static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_begin(&syncp->seq); + #else + return 0; +@@ -170,7 +190,7 @@ static inline unsigned int 
__u64_stats_fetch_begin(const struct u64_stats_sync * + + static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -179,7 +199,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy + static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + return read_seqcount_retry(&syncp->seq, start); + #else + return false; +@@ -189,7 +209,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) + preempt_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +@@ -203,7 +223,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + */ + static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_disable(); ++#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) + local_irq_disable(); + #endif + return __u64_stats_fetch_begin(syncp); +@@ -212,7 +234,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync + static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, + unsigned int start) + { +-#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) ++#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) ++ preempt_enable(); ++#elif BITS_PER_LONG == 32 && 
!defined(CONFIG_SMP) + local_irq_enable(); + #endif + return __u64_stats_fetch_retry(syncp, start); +diff --git a/include/net/act_api.h b/include/net/act_api.h +index f19f7f4a463c..b5b624c7e488 100644 +--- a/include/net/act_api.h ++++ b/include/net/act_api.h +@@ -30,13 +30,13 @@ struct tc_action { + atomic_t tcfa_bindcnt; + int tcfa_action; + struct tcf_t tcfa_tm; +- struct gnet_stats_basic_packed tcfa_bstats; +- struct gnet_stats_basic_packed tcfa_bstats_hw; ++ struct gnet_stats_basic_sync tcfa_bstats; ++ struct gnet_stats_basic_sync tcfa_bstats_hw; + struct gnet_stats_queue tcfa_qstats; + struct net_rate_estimator __rcu *tcfa_rate_est; + spinlock_t tcfa_lock; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats_hw; + struct gnet_stats_queue __percpu *cpu_qstats; + struct tc_cookie __rcu *act_cookie; + struct tcf_chain __rcu *goto_chain; +@@ -206,7 +206,7 @@ static inline void tcf_action_update_bstats(struct tc_action *a, + struct sk_buff *skb) + { + if (likely(a->cpu_bstats)) { +- bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(a->cpu_bstats), skb); + return; + } + spin_lock(&a->tcfa_lock); +diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h +index 1424e02cef90..7aa2b8e1fb29 100644 +--- a/include/net/gen_stats.h ++++ b/include/net/gen_stats.h +@@ -7,14 +7,17 @@ + #include + #include + +-/* Note: this used to be in include/uapi/linux/gen_stats.h */ +-struct gnet_stats_basic_packed { +- __u64 bytes; +- __u64 packets; +-}; +- +-struct gnet_stats_basic_cpu { +- struct gnet_stats_basic_packed bstats; ++/* Throughput stats. ++ * Must be initialized beforehand with gnet_stats_basic_sync_init(). ++ * ++ * If no reads can ever occur parallel to writes (e.g. stack-allocated ++ * bstats), then the internal stat values can be written to and read ++ * from directly. 
Otherwise, use _bstats_set/update() for writes and ++ * gnet_stats_add_basic() for reads. ++ */ ++struct gnet_stats_basic_sync { ++ u64_stats_t bytes; ++ u64_stats_t packets; + struct u64_stats_sync syncp; + } __aligned(2 * sizeof(u64)); + +@@ -34,6 +37,7 @@ struct gnet_dump { + struct tc_stats tc_stats; + }; + ++void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b); + int gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d, int padattr); + +@@ -42,41 +46,38 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, + spinlock_t *lock, struct gnet_dump *d, + int padattr); + +-int gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b); +-void __gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b); +-int gnet_stats_copy_basic_hw(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b); ++int gnet_stats_copy_basic(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running); ++void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running); ++int gnet_stats_copy_basic_hw(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running); + int gnet_stats_copy_rate_est(struct gnet_dump *d, + struct net_rate_estimator __rcu **ptr); + int gnet_stats_copy_queue(struct gnet_dump *d, + struct gnet_stats_queue __percpu *cpu_q, + struct gnet_stats_queue *q, __u32 qlen); +-void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +- const struct gnet_stats_queue __percpu *cpu_q, +- const struct gnet_stats_queue *q, 
__u32 qlen); ++void gnet_stats_add_queue(struct gnet_stats_queue *qstats, ++ const struct gnet_stats_queue __percpu *cpu_q, ++ const struct gnet_stats_queue *q); + int gnet_stats_copy_app(struct gnet_dump *d, void *st, int len); + + int gnet_stats_finish_copy(struct gnet_dump *d); + +-int gen_new_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_new_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ bool running, struct nlattr *opt); + void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); +-int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **ptr, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt); ++ bool running, struct nlattr *opt); + bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); + bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, + struct gnet_stats_rate_est64 *sample); +diff --git a/include/net/netfilter/xt_rateest.h b/include/net/netfilter/xt_rateest.h +index 832ab69efda5..4c3809e141f4 100644 +--- a/include/net/netfilter/xt_rateest.h ++++ b/include/net/netfilter/xt_rateest.h +@@ -6,7 +6,7 @@ + + struct xt_rateest { + /* keep lock and bstats on same cache line to speedup xt_rateest_tg() */ +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + spinlock_t lock; + + +diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h +index 83a6d0792180..4a5833108083 100644 +--- a/include/net/pkt_cls.h ++++ b/include/net/pkt_cls.h +@@ -765,7 +765,7 @@ struct tc_cookie { + }; + + struct tc_qopt_offload_stats { +- struct gnet_stats_basic_packed 
*bstats; ++ struct gnet_stats_basic_sync *bstats; + struct gnet_stats_queue *qstats; + }; + +@@ -885,7 +885,7 @@ struct tc_gred_qopt_offload_params { + }; + + struct tc_gred_qopt_offload_stats { +- struct gnet_stats_basic_packed bstats[MAX_DPs]; ++ struct gnet_stats_basic_sync bstats[MAX_DPs]; + struct gnet_stats_queue qstats[MAX_DPs]; + struct red_stats *xstats[MAX_DPs]; + }; +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index 6906da5c733e..e9fe7a613eba 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -40,6 +40,13 @@ enum qdisc_state_t { + __QDISC_STATE_DRAINING, + }; + ++enum qdisc_state2_t { ++ /* Only for !TCQ_F_NOLOCK qdisc. Never access it directly. ++ * Use qdisc_run_begin/end() or qdisc_is_running() instead. ++ */ ++ __QDISC_STATE2_RUNNING, ++}; ++ + #define QDISC_STATE_MISSED BIT(__QDISC_STATE_MISSED) + #define QDISC_STATE_DRAINING BIT(__QDISC_STATE_DRAINING) + +@@ -97,7 +104,7 @@ struct Qdisc { + struct netdev_queue *dev_queue; + + struct net_rate_estimator __rcu *rate_est; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + int pad; + refcount_t refcnt; +@@ -107,10 +114,10 @@ struct Qdisc { + */ + struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; + struct qdisc_skb_head q; +- struct gnet_stats_basic_packed bstats; +- seqcount_t running; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + unsigned long state; ++ unsigned long state2; /* must be written under qdisc spinlock */ + struct Qdisc *next_sched; + struct sk_buff_head skb_bad_txq; + +@@ -143,11 +150,15 @@ static inline struct Qdisc *qdisc_refcount_inc_nz(struct Qdisc *qdisc) + return NULL; + } + ++/* For !TCQ_F_NOLOCK qdisc: callers must either call this within a qdisc ++ * root_lock section, or provide their own memory barriers -- ordering ++ * against qdisc_run_begin/end() atomic bit operations. 
++ */ + static inline bool qdisc_is_running(struct Qdisc *qdisc) + { + if (qdisc->flags & TCQ_F_NOLOCK) + return spin_is_locked(&qdisc->seqlock); +- return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; ++ return test_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } + + static inline bool nolock_qdisc_is_empty(const struct Qdisc *qdisc) +@@ -167,6 +178,9 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc) + return !READ_ONCE(qdisc->q.qlen); + } + ++/* For !TCQ_F_NOLOCK qdisc, qdisc_run_begin/end() must be invoked with ++ * the qdisc root lock acquired. ++ */ + static inline bool qdisc_run_begin(struct Qdisc *qdisc) + { + if (qdisc->flags & TCQ_F_NOLOCK) { +@@ -186,15 +200,8 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) + * when testing it in qdisc_run_end() + */ + return spin_trylock(&qdisc->seqlock); +- } else if (qdisc_is_running(qdisc)) { +- return false; + } +- /* Variant of write_seqcount_begin() telling lockdep a trylock +- * was attempted. +- */ +- raw_write_seqcount_begin(&qdisc->running); +- seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); +- return true; ++ return !__test_and_set_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } + + static inline void qdisc_run_end(struct Qdisc *qdisc) +@@ -212,7 +219,7 @@ static inline void qdisc_run_end(struct Qdisc *qdisc) + &qdisc->state))) + __netif_schedule(qdisc); + } else { +- write_seqcount_end(&qdisc->running); ++ __clear_bit(__QDISC_STATE2_RUNNING, &qdisc->state2); + } + } + +@@ -576,14 +583,6 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) + return qdisc_lock(root); + } + +-static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) +-{ +- struct Qdisc *root = qdisc_root_sleeping(qdisc); +- +- ASSERT_RTNL(); +- return &root->running; +-} +- + static inline struct net_device *qdisc_dev(const struct Qdisc *qdisc) + { + return qdisc->dev_queue->dev; +@@ -833,14 +832,16 @@ static inline int qdisc_enqueue(struct 
sk_buff *skb, struct Qdisc *sch, + return sch->enqueue(skb, sch, to_free); + } + +-static inline void _bstats_update(struct gnet_stats_basic_packed *bstats, ++static inline void _bstats_update(struct gnet_stats_basic_sync *bstats, + __u64 bytes, __u32 packets) + { +- bstats->bytes += bytes; +- bstats->packets += packets; ++ u64_stats_update_begin(&bstats->syncp); ++ u64_stats_add(&bstats->bytes, bytes); ++ u64_stats_add(&bstats->packets, packets); ++ u64_stats_update_end(&bstats->syncp); + } + +-static inline void bstats_update(struct gnet_stats_basic_packed *bstats, ++static inline void bstats_update(struct gnet_stats_basic_sync *bstats, + const struct sk_buff *skb) + { + _bstats_update(bstats, +@@ -848,26 +849,10 @@ static inline void bstats_update(struct gnet_stats_basic_packed *bstats, + skb_is_gso(skb) ? skb_shinfo(skb)->gso_segs : 1); + } + +-static inline void _bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, +- __u64 bytes, __u32 packets) +-{ +- u64_stats_update_begin(&bstats->syncp); +- _bstats_update(&bstats->bstats, bytes, packets); +- u64_stats_update_end(&bstats->syncp); +-} +- +-static inline void bstats_cpu_update(struct gnet_stats_basic_cpu *bstats, +- const struct sk_buff *skb) +-{ +- u64_stats_update_begin(&bstats->syncp); +- bstats_update(&bstats->bstats, skb); +- u64_stats_update_end(&bstats->syncp); +-} +- + static inline void qdisc_bstats_cpu_update(struct Qdisc *sch, + const struct sk_buff *skb) + { +- bstats_cpu_update(this_cpu_ptr(sch->cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(sch->cpu_bstats), skb); + } + + static inline void qdisc_bstats_update(struct Qdisc *sch, +@@ -956,10 +941,9 @@ static inline void qdisc_qstats_qlen_backlog(struct Qdisc *sch, __u32 *qlen, + __u32 *backlog) + { + struct gnet_stats_queue qstats = { 0 }; +- __u32 len = qdisc_qlen_sum(sch); + +- __gnet_stats_copy_queue(&qstats, sch->cpu_qstats, &sch->qstats, len); +- *qlen = qstats.qlen; ++ gnet_stats_add_queue(&qstats, sch->cpu_qstats, &sch->qstats); ++ 
*qlen = qstats.qlen + qdisc_qlen(sch); + *backlog = qstats.backlog; + } + +@@ -1304,7 +1288,7 @@ void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64); + struct mini_Qdisc { + struct tcf_proto *filter_list; + struct tcf_block *block; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; + struct gnet_stats_queue __percpu *cpu_qstats; + struct rcu_head rcu; + }; +@@ -1312,7 +1296,7 @@ struct mini_Qdisc { + static inline void mini_qdisc_bstats_cpu_update(struct mini_Qdisc *miniq, + const struct sk_buff *skb) + { +- bstats_cpu_update(this_cpu_ptr(miniq->cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(miniq->cpu_bstats), skb); + } + + static inline void mini_qdisc_qstats_cpu_drop(struct mini_Qdisc *miniq) +diff --git a/init/Kconfig b/init/Kconfig +index dafc3ba6fa7a..cd852df4e7d4 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -910,7 +910,7 @@ config NUMA_BALANCING + bool "Memory placement aware NUMA scheduler" + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY +- depends on SMP && NUMA && MIGRATION ++ depends on SMP && NUMA && MIGRATION && !PREEMPT_RT + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -1905,6 +1905,7 @@ choice + + config SLAB + bool "SLAB" ++ depends on !PREEMPT_RT + select HAVE_HARDENED_USERCOPY_ALLOCATOR + help + The regular slab allocator that is established and known to work +@@ -1925,6 +1926,7 @@ config SLUB + config SLOB + depends on EXPERT + bool "SLOB (Simple Allocator)" ++ depends on !PREEMPT_RT + help + SLOB replaces the stock allocator with a drastically simpler + allocator. 
SLOB is generally more space efficient but +diff --git a/init/main.c b/init/main.c +index 649d9e4201a8..ee92d608ffc4 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -1606,6 +1606,7 @@ static noinline void __init kernel_init_freeable(void) + + rcu_init_tasks_generic(); + do_pre_smp_initcalls(); ++ rcu_tasks_initiate_self_tests(); + lockup_detector_init(); + + smp_init(); +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index 5876e30c5740..5df0776264c2 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,5 +1,11 @@ + # SPDX-License-Identifier: GPL-2.0-only + ++config HAVE_PREEMPT_LAZY ++ bool ++ ++config PREEMPT_LAZY ++ def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT ++ + choice + prompt "Preemption Model" + default PREEMPT_NONE +diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c +index 1486768f2318..bb3b805436c4 100644 +--- a/kernel/cgroup/rstat.c ++++ b/kernel/cgroup/rstat.c +@@ -156,8 +156,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; ++ unsigned long flags; + +- raw_spin_lock(cpu_lock); ++ raw_spin_lock_irqsave(cpu_lock, flags); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + +@@ -169,7 +170,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } +- raw_spin_unlock(cpu_lock); ++ raw_spin_unlock_irqrestore(cpu_lock, flags); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || +diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c +index 7beceb447211..28497c00e63b 100644 +--- a/kernel/debug/debug_core.c ++++ b/kernel/debug/debug_core.c +@@ -239,35 +239,42 @@ NOKPROBE_SYMBOL(kgdb_call_nmi_hook); + static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = + CSD_INIT(kgdb_call_nmi_hook, NULL); + 
+-void __weak kgdb_roundup_cpus(void) ++void __weak kgdb_roundup_cpu(unsigned int cpu) + { + call_single_data_t *csd; ++ int ret; ++ ++ csd = &per_cpu(kgdb_roundup_csd, cpu); ++ ++ /* ++ * If it didn't round up last time, don't try again ++ * since smp_call_function_single_async() will block. ++ * ++ * If rounding_up is false then we know that the ++ * previous call must have at least started and that ++ * means smp_call_function_single_async() won't block. ++ */ ++ if (kgdb_info[cpu].rounding_up) ++ return; ++ kgdb_info[cpu].rounding_up = true; ++ ++ ret = smp_call_function_single_async(cpu, csd); ++ if (ret) ++ kgdb_info[cpu].rounding_up = false; ++} ++NOKPROBE_SYMBOL(kgdb_roundup_cpu); ++ ++void __weak kgdb_roundup_cpus(void) ++{ + int this_cpu = raw_smp_processor_id(); + int cpu; +- int ret; + + for_each_online_cpu(cpu) { + /* No need to roundup ourselves */ + if (cpu == this_cpu) + continue; + +- csd = &per_cpu(kgdb_roundup_csd, cpu); +- +- /* +- * If it didn't round up last time, don't try again +- * since smp_call_function_single_async() will block. +- * +- * If rounding_up is false then we know that the +- * previous call must have at least started and that +- * means smp_call_function_single_async() won't block. 
+- */ +- if (kgdb_info[cpu].rounding_up) +- continue; +- kgdb_info[cpu].rounding_up = true; +- +- ret = smp_call_function_single_async(cpu, csd); +- if (ret) +- kgdb_info[cpu].rounding_up = false; ++ kgdb_roundup_cpu(cpu); + } + } + NOKPROBE_SYMBOL(kgdb_roundup_cpus); +diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c +index 6735ac36b718..539a2f0dc89d 100644 +--- a/kernel/debug/kdb/kdb_io.c ++++ b/kernel/debug/kdb/kdb_io.c +@@ -559,23 +559,17 @@ static void kdb_msg_write(const char *msg, int msg_len) + cp++; + } + ++ /* mirror output on atomic consoles */ + for_each_console(c) { + if (!(c->flags & CON_ENABLED)) + continue; + if (c == dbg_io_ops->cons) + continue; +- /* +- * Set oops_in_progress to encourage the console drivers to +- * disregard their internal spin locks: in the current calling +- * context the risk of deadlock is a bigger problem than risks +- * due to re-entering the console driver. We operate directly on +- * oops_in_progress rather than using bust_spinlocks() because +- * the calls bust_spinlocks() makes on exit are not appropriate +- * for this calling context. 
+- */ +- ++oops_in_progress; +- c->write(c, msg, msg_len); +- --oops_in_progress; ++ ++ if (!c->write_atomic) ++ continue; ++ c->write_atomic(c, msg, msg_len); ++ + touch_nmi_watchdog(); + } + } +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index e002bea6b4be..51ddfdacfc1f 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -159,9 +159,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + + local_irq_enable_exit_to_user(ti_work); + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & _TIF_NEED_RESCHED_MASK) + schedule(); + ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (unlikely(current->forced_info.si_signo)) { ++ struct task_struct *t = current; ++ force_sig_info(&t->forced_info); ++ t->forced_info.si_signo = 0; ++ } ++#endif ++ + if (ti_work & _TIF_UPROBE) + uprobe_notify_resume(regs); + +@@ -388,7 +396,7 @@ void irqentry_exit_cond_resched(void) + rcu_irq_exit_check_preempt(); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + WARN_ON_ONCE(!on_thread_stack()); +- if (need_resched()) ++ if (should_resched(0)) + preempt_schedule_irq(); + } + } +diff --git a/kernel/exit.c b/kernel/exit.c +index 80efdfda6662..6ff17e977392 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -64,6 +64,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -215,8 +216,14 @@ static void delayed_put_task_struct(struct rcu_head *rhp) + { + struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); + ++ kprobe_flush_task(tsk); + perf_event_delayed_put(tsk); + trace_sched_process_free(tsk); ++ ++ /* RT enabled kernels delay freeing the VMAP'ed task stack */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ put_task_stack(tsk); ++ + put_task_struct(tsk); + } + +diff --git a/kernel/fork.c b/kernel/fork.c +index 1906230a000e..47f647e6a6c3 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -289,7 +289,10 @@ static inline void free_thread_stack(struct task_struct *tsk) + return; + } + +- vfree_atomic(tsk->stack); ++ if 
(!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ vfree_atomic(tsk->stack); ++ else ++ vfree(tsk->stack); + return; + } + #endif +@@ -709,6 +712,19 @@ void __mmdrop(struct mm_struct *mm) + } + EXPORT_SYMBOL_GPL(__mmdrop); + ++#ifdef CONFIG_PREEMPT_RT ++/* ++ * RCU callback for delayed mm drop. Not strictly RCU, but call_rcu() is ++ * by far the least expensive way to do that. ++ */ ++void __mmdrop_delayed(struct rcu_head *rhp) ++{ ++ struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); ++ ++ __mmdrop(mm); ++} ++#endif ++ + static void mmdrop_async_fn(struct work_struct *work) + { + struct mm_struct *mm; +diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c +index 7a45fd593245..23dc888e0885 100644 +--- a/kernel/irq/irqdesc.c ++++ b/kernel/irq/irqdesc.c +@@ -664,6 +664,29 @@ int generic_handle_irq(unsigned int irq) + } + EXPORT_SYMBOL_GPL(generic_handle_irq); + ++/** ++ * generic_handle_irq_safe - Invoke the handler for a particular irq from any ++ * context. ++ * @irq: The irq number to handle ++ * ++ * Returns: 0 on success, a negative value on error. ++ * ++ * This function can be called from any context (IRQ or process context). It ++ * will report an error if not invoked from IRQ context and the irq has been ++ * marked to enforce IRQ-context only. 
++ */ ++int generic_handle_irq_safe(unsigned int irq) ++{ ++ unsigned long flags; ++ int ret; ++ ++ local_irq_save(flags); ++ ret = handle_irq_desc(irq_to_desc(irq)); ++ local_irq_restore(flags); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(generic_handle_irq_safe); ++ + #ifdef CONFIG_IRQ_DOMAIN + /** + * generic_handle_domain_irq - Invoke the handler for a HW irq belonging +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index 9862372e0f01..78d90ac0528c 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -1301,6 +1301,8 @@ static int irq_thread(void *data) + + irq_thread_set_ready(desc, action); + ++ sched_set_fifo(current); ++ + if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD, + &action->thread_flags)) + handler_fn = irq_forced_thread_fn; +@@ -1466,8 +1468,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) + if (IS_ERR(t)) + return PTR_ERR(t); + +- sched_set_fifo(t); +- + /* + * We keep the reference to the task struct even if + * the thread dies to avoid that the interrupt code +@@ -2861,7 +2861,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); + * This call sets the internal irqchip state of an interrupt, + * depending on the value of @which. + * +- * This function should be called with preemption disabled if the ++ * This function should be called with migration disabled if the + * interrupt controller has per-cpu registers. 
+ */ + int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, +diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c +index c481d8458325..02b2daf07441 100644 +--- a/kernel/irq/spurious.c ++++ b/kernel/irq/spurious.c +@@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); + + static int __init irqfixup_setup(char *str) + { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ pr_warn("irqfixup boot option not supported with PREEMPT_RT\n"); ++ return 1; ++ } + irqfixup = 1; + printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); + printk(KERN_WARNING "This may impact system performance.\n"); +@@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644); + + static int __init irqpoll_setup(char *str) + { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ pr_warn("irqpoll boot option not supported with PREEMPT_RT\n"); ++ return 1; ++ } + irqfixup = 2; + printk(KERN_WARNING "Misrouted IRQ fixup and polling support " + "enabled\n"); +diff --git a/kernel/irq_work.c b/kernel/irq_work.c +index db8c248ebc8c..f7df715ec28e 100644 +--- a/kernel/irq_work.c ++++ b/kernel/irq_work.c +@@ -18,11 +18,36 @@ + #include + #include + #include ++#include + #include + #include + + static DEFINE_PER_CPU(struct llist_head, raised_list); + static DEFINE_PER_CPU(struct llist_head, lazy_list); ++static DEFINE_PER_CPU(struct task_struct *, irq_workd); ++ ++static void wake_irq_workd(void) ++{ ++ struct task_struct *tsk = __this_cpu_read(irq_workd); ++ ++ if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) ++ wake_up_process(tsk); ++} ++ ++#ifdef CONFIG_SMP ++static void irq_work_wake(struct irq_work *entry) ++{ ++ wake_irq_workd(); ++} ++ ++static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = ++ IRQ_WORK_INIT_HARD(irq_work_wake); ++#endif ++ ++static int irq_workd_should_run(unsigned int cpu) ++{ ++ return !llist_empty(this_cpu_ptr(&lazy_list)); ++} + + /* + * Claim the entry so that no one else will poke at it. 
+@@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void) + /* Enqueue on current CPU, work must already be claimed and preempt disabled */ + static void __irq_work_queue_local(struct irq_work *work) + { ++ struct llist_head *list; ++ bool rt_lazy_work = false; ++ bool lazy_work = false; ++ int work_flags; ++ ++ work_flags = atomic_read(&work->node.a_flags); ++ if (work_flags & IRQ_WORK_LAZY) ++ lazy_work = true; ++ else if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(work_flags & IRQ_WORK_HARD_IRQ)) ++ rt_lazy_work = true; ++ ++ if (lazy_work || rt_lazy_work) ++ list = this_cpu_ptr(&lazy_list); ++ else ++ list = this_cpu_ptr(&raised_list); ++ ++ if (!llist_add(&work->node.llist, list)) ++ return; ++ + /* If the work is "lazy", handle it from next tick if any */ +- if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { +- if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && +- tick_nohz_tick_stopped()) +- arch_irq_work_raise(); +- } else { +- if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) +- arch_irq_work_raise(); +- } ++ if (!lazy_work || tick_nohz_tick_stopped()) ++ arch_irq_work_raise(); + } + + /* Enqueue the irq work @work on the current CPU */ +@@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) + if (cpu != smp_processor_id()) { + /* Arch remote IPI send/receive backend aren't NMI safe */ + WARN_ON_ONCE(in_nmi()); ++ ++ /* ++ * On PREEMPT_RT the items which are not marked as ++ * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work ++ * item is used on the remote CPU to wake the thread. 
++ */ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT) && ++ !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { ++ ++ if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) ++ goto out; ++ ++ work = &per_cpu(irq_work_wakeup, cpu); ++ if (!irq_work_claim(work)) ++ goto out; ++ } ++ + __smp_call_single_queue(cpu, &work->node.llist); + } else { + __irq_work_queue_local(work); + } ++out: + preempt_enable(); + + return true; + #endif /* CONFIG_SMP */ + } + +- + bool irq_work_needs_cpu(void) + { + struct llist_head *raised, *lazy; +@@ -160,6 +216,10 @@ void irq_work_single(void *arg) + * else claimed it meanwhile. + */ + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); ++ ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) ++ rcuwait_wake_up(&work->irqwait); + } + + static void irq_work_run_list(struct llist_head *list) +@@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list) + struct irq_work *work, *tmp; + struct llist_node *llnode; + +- BUG_ON(!irqs_disabled()); ++ /* ++ * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed ++ * in a per-CPU thread in preemptible context. Only the items which are ++ * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. 
++ */ ++ BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); + + if (llist_empty(list)) + return; +@@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list) + void irq_work_run(void) + { + irq_work_run_list(this_cpu_ptr(&raised_list)); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + EXPORT_SYMBOL_GPL(irq_work_run); + +@@ -194,7 +262,11 @@ void irq_work_tick(void) + + if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) + irq_work_run_list(raised); +- irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++ else ++ wake_irq_workd(); + } + + /* +@@ -204,8 +276,42 @@ void irq_work_tick(void) + void irq_work_sync(struct irq_work *work) + { + lockdep_assert_irqs_enabled(); ++ might_sleep(); ++ ++ if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || ++ !arch_irq_work_has_interrupt()) { ++ rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), ++ TASK_UNINTERRUPTIBLE); ++ return; ++ } + + while (irq_work_is_busy(work)) + cpu_relax(); + } + EXPORT_SYMBOL_GPL(irq_work_sync); ++ ++static void run_irq_workd(unsigned int cpu) ++{ ++ irq_work_run_list(this_cpu_ptr(&lazy_list)); ++} ++ ++static void irq_workd_setup(unsigned int cpu) ++{ ++ sched_set_fifo_low(current); ++} ++ ++static struct smp_hotplug_thread irqwork_threads = { ++ .store = &irq_workd, ++ .setup = irq_workd_setup, ++ .thread_should_run = irq_workd_should_run, ++ .thread_fn = run_irq_workd, ++ .thread_comm = "irq_work/%u", ++}; ++ ++static __init int irq_work_init_threads(void) ++{ ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); ++ return 0; ++} ++early_initcall(irq_work_init_threads); +diff --git a/kernel/kcov.c b/kernel/kcov.c +index 80bfe71bbe13..36ca640c4f8e 100644 +--- a/kernel/kcov.c ++++ b/kernel/kcov.c +@@ 
-88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); + + struct kcov_percpu_data { + void *irq_area; ++ local_lock_t lock; + + unsigned int saved_mode; + unsigned int saved_size; +@@ -96,7 +97,9 @@ struct kcov_percpu_data { + int saved_sequence; + }; + +-static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); ++static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + /* Must be called with kcov_remote_lock locked. */ + static struct kcov_remote *kcov_remote_find(u64 handle) +@@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle) + if (!in_task() && !in_serving_softirq()) + return; + +- local_irq_save(flags); ++ local_lock_irqsave(&kcov_percpu_data.lock, flags); + + /* + * Check that kcov_remote_start() is not called twice in background +@@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle) + */ + mode = READ_ONCE(t->kcov_mode); + if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + /* +@@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle) + * happened while collecting coverage from a background thread. + */ + if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + + spin_lock(&kcov_remote_lock); + remote = kcov_remote_find(handle); + if (!remote) { +- spin_unlock_irqrestore(&kcov_remote_lock, flags); ++ spin_unlock(&kcov_remote_lock); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + kcov_debug("handle = %llx, context: %s\n", handle, +@@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle) + size = CONFIG_KCOV_IRQ_AREA_SIZE; + area = this_cpu_ptr(&kcov_percpu_data)->irq_area; + } +- spin_unlock_irqrestore(&kcov_remote_lock, flags); ++ spin_unlock(&kcov_remote_lock); + + /* Can only happen when in_task(). 
*/ + if (!area) { ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + area = vmalloc(size * sizeof(unsigned long)); + if (!area) { + kcov_put(kcov); + return; + } ++ local_lock_irqsave(&kcov_percpu_data.lock, flags); + } + +- local_irq_save(flags); +- + /* Reset coverage size. */ + *(u64 *)area = 0; + +@@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle) + } + kcov_start(t, kcov, size, area, mode, sequence); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + + } + EXPORT_SYMBOL(kcov_remote_start); +@@ -965,12 +969,12 @@ void kcov_remote_stop(void) + if (!in_task() && !in_serving_softirq()) + return; + +- local_irq_save(flags); ++ local_lock_irqsave(&kcov_percpu_data.lock, flags); + + mode = READ_ONCE(t->kcov_mode); + barrier(); + if (!kcov_mode_enabled(mode)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + /* +@@ -978,12 +982,12 @@ void kcov_remote_stop(void) + * actually found the remote handle and started collecting coverage. + */ + if (in_serving_softirq() && !t->kcov_softirq) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + /* Make sure that kcov_softirq is only set when in softirq. */ + if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + return; + } + +@@ -1013,7 +1017,7 @@ void kcov_remote_stop(void) + spin_unlock(&kcov_remote_lock); + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&kcov_percpu_data.lock, flags); + + /* Get in kcov_remote_start(). 
*/ + kcov_put(kcov); +@@ -1034,8 +1038,8 @@ static int __init kcov_init(void) + int cpu; + + for_each_possible_cpu(cpu) { +- void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * +- sizeof(unsigned long)); ++ void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE * ++ sizeof(unsigned long), cpu_to_node(cpu)); + if (!area) + return -ENOMEM; + per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; +diff --git a/kernel/kprobes.c b/kernel/kprobes.c +index 7e9fa1b7ff67..d83e818ffbdb 100644 +--- a/kernel/kprobes.c ++++ b/kernel/kprobes.c +@@ -1248,10 +1248,10 @@ void kprobe_busy_end(void) + } + + /* +- * This function is called from finish_task_switch when task tk becomes dead, +- * so that we can recycle any function-return probe instances associated +- * with this task. These left over instances represent probed functions +- * that have been called but will never return. ++ * This function is called from delayed_put_task_struct() when a task is ++ * dead and cleaned up to recycle any function-return probe instances ++ * associated with this task. These left over instances represent probed ++ * functions that have been called but will never return. 
+ */ + void kprobe_flush_task(struct task_struct *tk) + { +diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c +index e20c19e3ba49..777168d58f02 100644 +--- a/kernel/ksysfs.c ++++ b/kernel/ksysfs.c +@@ -143,6 +143,15 @@ KERNEL_ATTR_RO(vmcoreinfo); + + #endif /* CONFIG_CRASH_CORE */ + ++#if defined(CONFIG_PREEMPT_RT) ++static ssize_t realtime_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%d\n", 1); ++} ++KERNEL_ATTR_RO(realtime); ++#endif ++ + /* whether file capabilities are enabled */ + static ssize_t fscaps_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +@@ -233,6 +242,9 @@ static struct attribute * kernel_attrs[] = { + #ifndef CONFIG_TINY_RCU + &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, ++#endif ++#ifdef CONFIG_PREEMPT_RT ++ &realtime_attr.attr, + #endif + NULL + }; +diff --git a/kernel/kthread.c b/kernel/kthread.c +index e319a1b62586..c3870b2a150d 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -270,6 +270,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); + + static int kthread(void *_create) + { ++ static const struct sched_param param = { .sched_priority = 0 }; + /* Copy data: it's on kthread's stack */ + struct kthread_create_info *create = _create; + int (*threadfn)(void *data) = create->threadfn; +@@ -300,6 +301,13 @@ static int kthread(void *_create) + init_completion(&self->parked); + current->vfork_done = &self->exited; + ++ /* ++ * The new thread inherited kthreadd's priority and CPU mask. Reset ++ * back to default in case they have been changed. 
++ */ ++ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); ++ set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); ++ + /* OK, tell user we're spawned, wait for stop or wakeup */ + __set_current_state(TASK_UNINTERRUPTIBLE); + create->result = current; +@@ -397,7 +405,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + } + task = create->result; + if (!IS_ERR(task)) { +- static const struct sched_param param = { .sched_priority = 0 }; + char name[TASK_COMM_LEN]; + + /* +@@ -406,13 +413,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), + */ + vsnprintf(name, sizeof(name), namefmt, args); + set_task_comm(task, name); +- /* +- * root may have changed our (kthreadd's) priority or CPU mask. +- * The kernel thread should not inherit these properties. +- */ +- sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); +- set_cpus_allowed_ptr(task, +- housekeeping_cpumask(HK_FLAG_KTHREAD)); + } + kfree(create); + return task; +diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c +index e6a282bc1665..ce3c8a4a5506 100644 +--- a/kernel/locking/lockdep.c ++++ b/kernel/locking/lockdep.c +@@ -5470,6 +5470,7 @@ static noinstr void check_flags(unsigned long flags) + } + } + ++#ifndef CONFIG_PREEMPT_RT + /* + * We dont accurately track softirq state in e.g. + * hardirq contexts (such as on 4KSTACKS), so only +@@ -5484,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags) + DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); + } + } ++#endif + + if (!debug_locks) + print_irqtrace_events(current); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index b7fa3ee3aa1d..108b963a783b 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1135,8 +1135,26 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock, + * which is wrong, as the other waiter is not in a deadlock + * situation. 
+ */ +- if (owner == task) ++ if (owner == task) { ++#if defined(DEBUG_WW_MUTEXES) && defined(CONFIG_DEBUG_LOCKING_API_SELFTESTS) ++ /* ++ * The lockdep selftest for ww-mutex assumes in a few cases ++ * the ww_ctx->contending_lock assignment via ++ * __ww_mutex_check_kill() which does not happen if the rtmutex ++ * detects the deadlock early. ++ */ ++ if (build_ww_mutex() && ww_ctx) { ++ struct rt_mutex *rtm; ++ ++ /* Check whether the waiter should backout immediately */ ++ rtm = container_of(lock, struct rt_mutex, rtmutex); ++ ++ __ww_mutex_add_waiter(waiter, rtm, ww_ctx); ++ __ww_mutex_check_kill(rtm, waiter, ww_ctx); ++ } ++#endif + return -EDEADLK; ++ } + + raw_spin_lock(&task->pi_lock); + waiter->task = task; +diff --git a/kernel/locking/rtmutex_api.c b/kernel/locking/rtmutex_api.c +index a461be2f873d..cb9fdff76a8a 100644 +--- a/kernel/locking/rtmutex_api.c ++++ b/kernel/locking/rtmutex_api.c +@@ -21,12 +21,13 @@ int max_lock_depth = 1024; + */ + static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock, + unsigned int state, ++ struct lockdep_map *nest_lock, + unsigned int subclass) + { + int ret; + + might_sleep(); +- mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); ++ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_); + ret = __rt_mutex_lock(&lock->rtmutex, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); +@@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init); + */ + void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass) + { +- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass); ++ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + ++void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock) ++{ ++ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0); ++} ++EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock); ++ + #else /* !CONFIG_DEBUG_LOCK_ALLOC */ + + /** +@@ -61,7 
+68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested); + */ + void __sched rt_mutex_lock(struct rt_mutex *lock) + { +- __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0); ++ __rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock); + #endif +@@ -77,10 +84,25 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); + */ + int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) + { +- return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0); ++ return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0); + } + EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); + ++/** ++ * rt_mutex_lock_killable - lock a rt_mutex killable ++ * ++ * @lock: the rt_mutex to be locked ++ * ++ * Returns: ++ * 0 on success ++ * -EINTR when interrupted by a signal ++ */ ++int __sched rt_mutex_lock_killable(struct rt_mutex *lock) ++{ ++ return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0); ++} ++EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); ++ + /** + * rt_mutex_trylock - try to lock a rt_mutex + * +diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c +index d2912e44d61f..9e396a09fe0f 100644 +--- a/kernel/locking/spinlock_rt.c ++++ b/kernel/locking/spinlock_rt.c +@@ -24,6 +24,17 @@ + #define RT_MUTEX_BUILD_SPINLOCKS + #include "rtmutex.c" + ++/* ++ * __might_resched() skips the state check as rtlocks are state ++ * preserving. Take RCU nesting into account as spin/read/write_lock() can ++ * legitimately nest into an RCU read side critical section. 
++ */ ++#define RTLOCK_RESCHED_OFFSETS \ ++ (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) ++ ++#define rtlock_might_resched() \ ++ __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) ++ + static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) + { + if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) +@@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) + + static __always_inline void __rt_spin_lock(spinlock_t *lock) + { +- ___might_sleep(__FILE__, __LINE__, 0); ++ rtlock_might_resched(); + rtlock_lock(&lock->lock); + rcu_read_lock(); + migrate_disable(); +@@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock); + + void __sched rt_read_lock(rwlock_t *rwlock) + { +- ___might_sleep(__FILE__, __LINE__, 0); ++ rtlock_might_resched(); + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); +@@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock); + + void __sched rt_write_lock(rwlock_t *rwlock) + { +- ___might_sleep(__FILE__, __LINE__, 0); ++ rtlock_might_resched(); + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); + rcu_read_lock(); +@@ -246,12 +257,6 @@ void __sched rt_write_unlock(rwlock_t *rwlock) + } + EXPORT_SYMBOL(rt_write_unlock); + +-int __sched rt_rwlock_is_contended(rwlock_t *rwlock) +-{ +- return rw_base_is_contended(&rwlock->rwbase); +-} +-EXPORT_SYMBOL(rt_rwlock_is_contended); +- + #ifdef CONFIG_DEBUG_LOCK_ALLOC + void __rt_rwlock_init(rwlock_t *rwlock, const char *name, + struct lock_class_key *key) +diff --git a/kernel/panic.c b/kernel/panic.c +index 47933d4c769b..ea5269f486cc 100644 +--- a/kernel/panic.c ++++ b/kernel/panic.c +@@ -245,12 +245,27 @@ void check_panic_on_warn(const char *origin) + void panic(const char *fmt, ...) 
+ { + static char buf[1024]; ++ va_list args2; + va_list args; + long i, i_next = 0, len; + int state = 0; + int old_cpu, this_cpu; + bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + ++ console_verbose(); ++ pr_emerg("Kernel panic - not syncing:\n"); ++ va_start(args2, fmt); ++ va_copy(args, args2); ++ vprintk(fmt, args2); ++ va_end(args2); ++#ifdef CONFIG_DEBUG_BUGVERBOSE ++ /* ++ * Avoid nested stack-dumping if a panic occurs during oops processing ++ */ ++ if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) ++ dump_stack(); ++#endif ++ pr_flush(1000, true); + if (panic_on_warn) { + /* + * This thread may hit another WARN() in the panic path. +@@ -291,24 +306,13 @@ void panic(const char *fmt, ...) + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) + panic_smp_self_stop(); + +- console_verbose(); + bust_spinlocks(1); +- va_start(args, fmt); + len = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + if (len && buf[len - 1] == '\n') + buf[len - 1] = '\0'; + +- pr_emerg("Kernel panic - not syncing: %s\n", buf); +-#ifdef CONFIG_DEBUG_BUGVERBOSE +- /* +- * Avoid nested stack-dumping if a panic occurs during oops processing +- */ +- if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) +- dump_stack(); +-#endif +- + /* + * If kgdb is enabled, give it a chance to run before we stop all + * the other CPUs or else we won't be able to debug processes left +@@ -617,9 +621,11 @@ static u64 oops_id; + + static int init_oops_id(void) + { ++#ifndef CONFIG_PREEMPT_RT + if (!oops_id) + get_random_bytes(&oops_id, sizeof(oops_id)); + else ++#endif + oops_id++; + + return 0; +@@ -630,6 +636,7 @@ static void print_oops_end_marker(void) + { + init_oops_id(); + pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); ++ pr_flush(1000, true); + } + + /* +diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c +index 8d856b7c2e5a..7f27cfee283e 100644 +--- a/kernel/printk/printk.c ++++ b/kernel/printk/printk.c +@@ -44,6 +44,9 @@ + 
#include + #include + #include ++#include ++#include ++#include + #include + #include + #include +@@ -268,11 +271,6 @@ static void __up_console_sem(unsigned long ip) + */ + static int console_locked, console_suspended; + +-/* +- * If exclusive_console is non-NULL then only this console is to be printed to. +- */ +-static struct console *exclusive_console; +- + /* + * Array of consoles built from command line options (console=) + */ +@@ -352,10 +350,13 @@ static int console_msg_format = MSG_FORMAT_DEFAULT; + * non-prinatable characters are escaped in the "\xff" notation. + */ + ++#ifdef CONFIG_PRINTK + /* syslog_lock protects syslog_* variables and write access to clear_seq. */ + static DEFINE_MUTEX(syslog_lock); + +-#ifdef CONFIG_PRINTK ++/* Set to enable sync mode. Once set, it is never cleared. */ ++static bool sync_mode; ++ + DECLARE_WAIT_QUEUE_HEAD(log_wait); + /* All 3 protected by @syslog_lock. */ + /* the next printk record to read by syslog(READ) or /proc/kmsg */ +@@ -363,17 +364,6 @@ static u64 syslog_seq; + static size_t syslog_partial; + static bool syslog_time; + +-/* All 3 protected by @console_sem. */ +-/* the next printk record to write to the console */ +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; +- +-struct latched_seq { +- seqcount_latch_t latch; +- u64 val[2]; +-}; +- + /* + * The next printk record to read after the last 'clear' command. There are + * two copies (updated with seqcount_latch) so that reads can locklessly +@@ -391,9 +381,6 @@ static struct latched_seq clear_seq = { + #define PREFIX_MAX 32 + #endif + +-/* the maximum size of a formatted record (i.e. with prefix added per line) */ +-#define CONSOLE_LOG_MAX 1024 +- + /* the maximum size allowed to be reserved for a record */ + #define LOG_LINE_MAX (CONSOLE_LOG_MAX - PREFIX_MAX) + +@@ -437,7 +424,7 @@ bool printk_percpu_data_ready(void) + return __printk_percpu_data_ready; + } + +-/* Must be called under syslog_lock. 
*/ ++/* Must be called under associated write-protection lock. */ + static void latched_seq_write(struct latched_seq *ls, u64 val) + { + raw_write_seqcount_latch(&ls->latch); +@@ -1771,188 +1758,152 @@ SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) + return do_syslog(type, buf, len, SYSLOG_FROM_READER); + } + +-/* +- * Special console_lock variants that help to reduce the risk of soft-lockups. +- * They allow to pass console_lock to another printk() call using a busy wait. +- */ ++int printk_delay_msec __read_mostly; + +-#ifdef CONFIG_LOCKDEP +-static struct lockdep_map console_owner_dep_map = { +- .name = "console_owner" +-}; +-#endif ++static inline void printk_delay(int level) ++{ ++ boot_delay_msec(level); + +-static DEFINE_RAW_SPINLOCK(console_owner_lock); +-static struct task_struct *console_owner; +-static bool console_waiter; ++ if (unlikely(printk_delay_msec)) { ++ int m = printk_delay_msec; + +-/** +- * console_lock_spinning_enable - mark beginning of code where another +- * thread might safely busy wait +- * +- * This basically converts console_lock into a spinlock. This marks +- * the section where the console_lock owner can not sleep, because +- * there may be a waiter spinning (like a spinlock). Also it must be +- * ready to hand over the lock at the end of the section. 
+- */ +-static void console_lock_spinning_enable(void) ++ while (m--) { ++ mdelay(1); ++ touch_nmi_watchdog(); ++ } ++ } ++} ++ ++static bool kernel_sync_mode(void) + { +- raw_spin_lock(&console_owner_lock); +- console_owner = current; +- raw_spin_unlock(&console_owner_lock); ++ return (oops_in_progress || sync_mode); ++} + +- /* The waiter may spin on us after setting console_owner */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); ++static bool console_may_sync(struct console *con) ++{ ++ if (!(con->flags & CON_ENABLED)) ++ return false; ++ if (con->write_atomic && kernel_sync_mode()) ++ return true; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) ++ return true; ++ if (con->write && (con->flags & CON_BOOT) && !con->thread) ++ return true; ++ return false; + } + +-/** +- * console_lock_spinning_disable_and_check - mark end of code where another +- * thread was able to busy wait and check if there is a waiter +- * +- * This is called at the end of the section where spinning is allowed. +- * It has two functions. First, it is a signal that it is no longer +- * safe to start busy waiting for the lock. Second, it checks if +- * there is a busy waiter and passes the lock rights to her. +- * +- * Important: Callers lose the lock if there was a busy waiter. +- * They must not touch items synchronized by console_lock +- * in this case. +- * +- * Return: 1 if the lock rights were passed, 0 otherwise. 
+- */ +-static int console_lock_spinning_disable_and_check(void) ++static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) + { +- int waiter; ++ if (!(con->flags & CON_ENABLED)) ++ return false; + +- raw_spin_lock(&console_owner_lock); +- waiter = READ_ONCE(console_waiter); +- console_owner = NULL; +- raw_spin_unlock(&console_owner_lock); ++ if (con->write_atomic && kernel_sync_mode()) { ++ con->write_atomic(con, text, text_len); ++ return true; ++ } + +- if (!waiter) { +- spin_release(&console_owner_dep_map, _THIS_IP_); +- return 0; ++ if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) { ++ if (console_trylock()) { ++ con->write_atomic(con, text, text_len); ++ console_unlock(); ++ return true; ++ } ++ ++ } else if (con->write && (con->flags & CON_BOOT) && !con->thread) { ++ if (console_trylock()) { ++ con->write(con, text, text_len); ++ console_unlock(); ++ return true; ++ } + } + +- /* The waiter is now free to continue */ +- WRITE_ONCE(console_waiter, false); ++ return false; ++} + +- spin_release(&console_owner_dep_map, _THIS_IP_); ++static bool have_atomic_console(void) ++{ ++ struct console *con; + +- /* +- * Hand off console_lock to waiter. The waiter will perform +- * the up(). After this, the waiter is the console_lock owner. +- */ +- mutex_release(&console_lock_dep_map, _THIS_IP_); +- return 1; ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ if (con->write_atomic) ++ return true; ++ } ++ return false; + } + +-/** +- * console_trylock_spinning - try to get console_lock by busy waiting +- * +- * This allows to busy wait for the console_lock when the current +- * owner is running in specially marked sections. It means that +- * the current owner is running and cannot reschedule until it +- * is ready to lose the lock. 
+- * +- * Return: 1 if we got the lock, 0 othrewise +- */ +-static int console_trylock_spinning(void) ++static bool print_sync(struct console *con, u64 *seq) + { +- struct task_struct *owner = NULL; +- bool waiter; +- bool spin = false; +- unsigned long flags; ++ struct printk_info info; ++ struct printk_record r; ++ size_t text_len; + +- if (console_trylock()) +- return 1; ++ prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); + +- printk_safe_enter_irqsave(flags); ++ if (!prb_read_valid(prb, *seq, &r)) ++ return false; + +- raw_spin_lock(&console_owner_lock); +- owner = READ_ONCE(console_owner); +- waiter = READ_ONCE(console_waiter); +- if (!waiter && owner && owner != current) { +- WRITE_ONCE(console_waiter, true); +- spin = true; +- } +- raw_spin_unlock(&console_owner_lock); ++ text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); + +- /* +- * If there is an active printk() writing to the +- * consoles, instead of having it write our data too, +- * see if we can offload that load from the active +- * printer, and do some printing ourselves. +- * Go into a spin only if there isn't already a waiter +- * spinning, and there is an active printer, and +- * that active printer isn't us (recursive printk?). +- */ +- if (!spin) { +- printk_safe_exit_irqrestore(flags); +- return 0; +- } ++ if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) ++ return false; + +- /* We spin waiting for the owner to release us */ +- spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); +- /* Owner will clear console_waiter on hand off */ +- while (READ_ONCE(console_waiter)) +- cpu_relax(); +- spin_release(&console_owner_dep_map, _THIS_IP_); ++ *seq = r.info->seq; + +- printk_safe_exit_irqrestore(flags); +- /* +- * The owner passed the console lock to us. +- * Since we did not spin on console lock, annotate +- * this as a trylock. Otherwise lockdep will +- * complain. 
+- */ +- mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); ++ touch_softlockup_watchdog_sync(); ++ clocksource_touch_watchdog(); ++ rcu_cpu_stall_reset(); ++ touch_nmi_watchdog(); + +- return 1; ++ if (text_len) ++ printk_delay(r.info->level); ++ ++ return true; + } + +-/* +- * Call the console drivers, asking them to write out +- * log_buf[start] to log_buf[end - 1]. +- * The console_lock must be held. +- */ +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) ++static u64 read_console_seq(struct console *con) + { +- static char dropped_text[64]; +- size_t dropped_len = 0; +- struct console *con; ++ u64 seq2; ++ u64 seq; + +- trace_console_rcuidle(text, len); ++ seq = latched_seq_read_nolock(&con->printk_seq); ++ seq2 = latched_seq_read_nolock(&con->printk_sync_seq); ++ if (seq2 > seq) ++ seq = seq2; ++#ifdef CONFIG_HAVE_NMI ++ seq2 = latched_seq_read_nolock(&con->printk_sync_nmi_seq); ++ if (seq2 > seq) ++ seq = seq2; ++#endif ++ return seq; ++} + +- if (!console_drivers) +- return; ++static void print_sync_until(struct console *con, u64 seq, bool is_locked) ++{ ++ u64 printk_seq; + +- if (console_dropped) { +- dropped_len = snprintf(dropped_text, sizeof(dropped_text), +- "** %lu printk messages dropped **\n", +- console_dropped); +- console_dropped = 0; +- } ++ while (!__printk_cpu_trylock()) ++ cpu_relax(); + +- for_each_console(con) { +- if (exclusive_console && con != exclusive_console) +- continue; +- if (!(con->flags & CON_ENABLED)) +- continue; +- if (!con->write) +- continue; +- if (!cpu_online(smp_processor_id()) && +- !(con->flags & CON_ANYTIME)) +- continue; +- if (con->flags & CON_EXTENDED) +- con->write(con, ext_text, ext_len); +- else { +- if (dropped_len) +- con->write(con, dropped_text, dropped_len); +- con->write(con, text, len); +- } ++ for (;;) { ++ printk_seq = read_console_seq(con); ++ if (printk_seq >= seq) ++ break; ++ if (!print_sync(con, &printk_seq)) ++ break; ++ ++ if (is_locked) 
++ latched_seq_write(&con->printk_seq, printk_seq + 1); ++#ifdef CONFIG_PRINTK_NMI ++ else if (in_nmi()) ++ latched_seq_write(&con->printk_sync_nmi_seq, printk_seq + 1); ++#endif ++ else ++ latched_seq_write(&con->printk_sync_seq, printk_seq + 1); + } ++ ++ __printk_cpu_unlock(); + } + + /* +@@ -2025,20 +1976,6 @@ static u8 *__printk_recursion_counter(void) + local_irq_restore(flags); \ + } while (0) + +-int printk_delay_msec __read_mostly; +- +-static inline void printk_delay(void) +-{ +- if (unlikely(printk_delay_msec)) { +- int m = printk_delay_msec; +- +- while (m--) { +- mdelay(1); +- touch_nmi_watchdog(); +- } +- } +-} +- + static inline u32 printk_caller_id(void) + { + return in_task() ? task_pid_nr(current) : +@@ -2126,6 +2063,7 @@ int vprintk_store(int facility, int level, + const u32 caller_id = printk_caller_id(); + struct prb_reserved_entry e; + enum printk_info_flags flags = 0; ++ bool final_commit = false; + struct printk_record r; + unsigned long irqflags; + u16 trunc_msg_len = 0; +@@ -2136,6 +2074,7 @@ int vprintk_store(int facility, int level, + u16 text_len; + int ret = 0; + u64 ts_nsec; ++ u64 seq; + + /* + * Since the duration of printk() can vary depending on the message +@@ -2174,6 +2113,7 @@ int vprintk_store(int facility, int level, + if (flags & LOG_CONT) { + prb_rec_init_wr(&r, reserve_size); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { ++ seq = r.info->seq; + text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, + facility, &flags, fmt, args); + r.info->text_len += text_len; +@@ -2181,6 +2121,7 @@ int vprintk_store(int facility, int level, + if (flags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); ++ final_commit = true; + } else { + prb_commit(&e); + } +@@ -2204,6 +2145,7 @@ int vprintk_store(int facility, int level, + if (!prb_reserve(&e, prb, &r)) + goto out; + } ++ seq = r.info->seq; + + /* fill message */ + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, 
&flags, fmt, args); +@@ -2219,13 +2161,25 @@ int vprintk_store(int facility, int level, + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + + /* A message without a trailing newline can be continued. */ +- if (!(flags & LOG_NEWLINE)) ++ if (!(flags & LOG_NEWLINE)) { + prb_commit(&e); +- else ++ } else { + prb_final_commit(&e); ++ final_commit = true; ++ } + + ret = text_len + trunc_msg_len; + out: ++ /* only the kernel may perform synchronous printing */ ++ if (facility == 0 && final_commit) { ++ struct console *con; ++ ++ for_each_console(con) { ++ if (console_may_sync(con)) ++ print_sync_until(con, seq + 1, false); ++ } ++ } ++ + printk_exit_irqrestore(recursion_ptr, irqflags); + return ret; + } +@@ -2235,40 +2189,16 @@ asmlinkage int vprintk_emit(int facility, int level, + const char *fmt, va_list args) + { + int printed_len; +- bool in_sched = false; + + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + +- if (level == LOGLEVEL_SCHED) { ++ if (level == LOGLEVEL_SCHED) + level = LOGLEVEL_DEFAULT; +- in_sched = true; +- } +- +- boot_delay_msec(level); +- printk_delay(); + + printed_len = vprintk_store(facility, level, dev_info, fmt, args); + +- /* If called from the scheduler, we can not call up(). */ +- if (!in_sched) { +- /* +- * Disable preemption to avoid being preempted while holding +- * console_sem which would prevent anyone from printing to +- * console +- */ +- preempt_disable(); +- /* +- * Try to acquire and then immediately release the console +- * semaphore. The release will print out buffers and wake up +- * /dev/kmsg and syslog() users. +- */ +- if (console_trylock_spinning()) +- console_unlock(); +- preempt_enable(); +- } +- + wake_up_klogd(); + return printed_len; + } +@@ -2293,37 +2223,162 @@ asmlinkage __visible int _printk(const char *fmt, ...) 
+ } + EXPORT_SYMBOL(_printk); + +-#else /* CONFIG_PRINTK */ ++static int printk_kthread_func(void *data) ++{ ++ struct console *con = data; ++ unsigned long dropped = 0; ++ char *dropped_text = NULL; ++ struct printk_info info; ++ struct printk_record r; ++ char *ext_text = NULL; ++ size_t dropped_len; ++ int ret = -ENOMEM; ++ char *text = NULL; ++ char *write_text; ++ size_t len; ++ int error; ++ u64 seq; ++ ++ if (con->flags & CON_EXTENDED) { ++ ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); ++ if (!ext_text) ++ goto out; ++ } ++ text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); ++ dropped_text = kmalloc(64, GFP_KERNEL); ++ if (!text || !dropped_text) ++ goto out; ++ if (con->flags & CON_EXTENDED) ++ write_text = ext_text; ++ else ++ write_text = text; ++ ++ seq = read_console_seq(con); + +-#define CONSOLE_LOG_MAX 0 +-#define printk_time false ++ prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + +-#define prb_read_valid(rb, seq, r) false +-#define prb_first_valid_seq(rb) 0 ++ for (;;) { ++ error = wait_event_interruptible(log_wait, ++ prb_read_valid(prb, seq, &r) || kthread_should_stop()); + +-static u64 syslog_seq; +-static u64 console_seq; +-static u64 exclusive_console_stop_seq; +-static unsigned long console_dropped; ++ if (kthread_should_stop()) ++ break; ++ ++ if (error) ++ continue; ++ ++ if (seq != r.info->seq) { ++ dropped += r.info->seq - seq; ++ seq = r.info->seq; ++ } ++ ++ seq++; ++ ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ ++ if (suppress_message_printing(r.info->level)) ++ continue; ++ ++ if (con->flags & CON_EXTENDED) { ++ len = info_print_ext_header(ext_text, ++ CONSOLE_EXT_LOG_MAX, ++ r.info); ++ len += msg_print_ext_body(ext_text + len, ++ CONSOLE_EXT_LOG_MAX - len, ++ &r.text_buf[0], r.info->text_len, ++ &r.info->dev_info); ++ } else { ++ len = record_print_text(&r, ++ console_msg_format & MSG_FORMAT_SYSLOG, ++ printk_time); ++ } ++ ++ console_lock(); ++ ++ /* ++ * Even though the printk kthread is always 
preemptible, it is ++ * still not allowed to call cond_resched() from within ++ * console drivers. The task may become non-preemptible in the ++ * console driver call chain. For example, vt_console_print() ++ * takes a spinlock and then can call into fbcon_redraw(), ++ * which can conditionally invoke cond_resched(). ++ */ ++ console_may_schedule = 0; ++ ++ if (kernel_sync_mode() && con->write_atomic) { ++ console_unlock(); ++ break; ++ } ++ ++ if (!(con->flags & CON_EXTENDED) && dropped) { ++ dropped_len = snprintf(dropped_text, 64, ++ "** %lu printk messages dropped **\n", ++ dropped); ++ dropped = 0; ++ ++ con->write(con, dropped_text, dropped_len); ++ printk_delay(r.info->level); ++ } ++ ++ con->write(con, write_text, len); ++ if (len) ++ printk_delay(r.info->level); + +-static size_t record_print_text(const struct printk_record *r, +- bool syslog, bool time) ++ latched_seq_write(&con->printk_seq, seq); ++ ++ console_unlock(); ++ } ++ ret = 0; ++out: ++ kfree(dropped_text); ++ kfree(text); ++ kfree(ext_text); ++ pr_info("%sconsole [%s%d]: printing thread stopped\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return ret; ++} ++ ++/* Must be called within console_lock(). */ ++static void start_printk_kthread(struct console *con) + { +- return 0; ++ con->thread = kthread_run(printk_kthread_func, con, ++ "pr/%s%d", con->name, con->index); ++ if (IS_ERR(con->thread)) { ++ pr_err("%sconsole [%s%d]: unable to start printing thread\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); ++ return; ++ } ++ pr_info("%sconsole [%s%d]: printing thread started\n", ++ (con->flags & CON_BOOT) ? "boot" : "", ++ con->name, con->index); + } +-static ssize_t info_print_ext_header(char *buf, size_t size, +- struct printk_info *info) ++ ++/* protected by console_lock */ ++static bool kthreads_started; ++ ++/* Must be called within console_lock(). 
*/ ++static void console_try_thread(struct console *con) + { +- return 0; ++ if (kthreads_started) { ++ start_printk_kthread(con); ++ return; ++ } ++ ++ /* ++ * The printing threads have not been started yet. If this console ++ * can print synchronously, print all unprinted messages. ++ */ ++ if (console_may_sync(con)) { ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ print_sync_until(con, prb_next_seq(prb), true); ++ local_irq_restore(flags); ++ } + } +-static ssize_t msg_print_ext_body(char *buf, size_t size, +- char *text, size_t text_len, +- struct dev_printk_info *dev_info) { return 0; } +-static void console_lock_spinning_enable(void) { } +-static int console_lock_spinning_disable_and_check(void) { return 0; } +-static void call_console_drivers(const char *ext_text, size_t ext_len, +- const char *text, size_t len) {} +-static bool suppress_message_printing(int level) { return false; } + + #endif /* CONFIG_PRINTK */ + +@@ -2580,34 +2635,6 @@ int is_console_locked(void) + } + EXPORT_SYMBOL(is_console_locked); + +-/* +- * Check if we have any console that is capable of printing while cpu is +- * booting or shutting down. Requires console_sem. +- */ +-static int have_callable_console(void) +-{ +- struct console *con; +- +- for_each_console(con) +- if ((con->flags & CON_ENABLED) && +- (con->flags & CON_ANYTIME)) +- return 1; +- +- return 0; +-} +- +-/* +- * Can we actually use the console at this time on this cpu? +- * +- * Console drivers may assume that per-cpu resources have been allocated. So +- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't +- * call them until this CPU is officially up. 
+- */ +-static inline int can_use_console(void) +-{ +- return cpu_online(raw_smp_processor_id()) || have_callable_console(); +-} +- + /** + * console_unlock - unlock the console system + * +@@ -2624,140 +2651,13 @@ static inline int can_use_console(void) + */ + void console_unlock(void) + { +- static char ext_text[CONSOLE_EXT_LOG_MAX]; +- static char text[CONSOLE_LOG_MAX]; +- unsigned long flags; +- bool do_cond_resched, retry; +- struct printk_info info; +- struct printk_record r; +- u64 __maybe_unused next_seq; +- + if (console_suspended) { + up_console_sem(); + return; + } + +- prb_rec_init_rd(&r, &info, text, sizeof(text)); +- +- /* +- * Console drivers are called with interrupts disabled, so +- * @console_may_schedule should be cleared before; however, we may +- * end up dumping a lot of lines, for example, if called from +- * console registration path, and should invoke cond_resched() +- * between lines if allowable. Not doing so can cause a very long +- * scheduling stall on a slow console leading to RCU stall and +- * softlockup warnings which exacerbate the issue with more +- * messages practically incapacitating the system. +- * +- * console_trylock() is not able to detect the preemptive +- * context reliably. Therefore the value must be stored before +- * and cleared after the "again" goto label. +- */ +- do_cond_resched = console_may_schedule; +-again: +- console_may_schedule = 0; +- +- /* +- * We released the console_sem lock, so we need to recheck if +- * cpu is online and (if not) is there at least one CON_ANYTIME +- * console. 
+- */ +- if (!can_use_console()) { +- console_locked = 0; +- up_console_sem(); +- return; +- } +- +- for (;;) { +- size_t ext_len = 0; +- int handover; +- size_t len; +- +-skip: +- if (!prb_read_valid(prb, console_seq, &r)) +- break; +- +- if (console_seq != r.info->seq) { +- console_dropped += r.info->seq - console_seq; +- console_seq = r.info->seq; +- } +- +- if (suppress_message_printing(r.info->level)) { +- /* +- * Skip record we have buffered and already printed +- * directly to the console when we received it, and +- * record that has level above the console loglevel. +- */ +- console_seq++; +- goto skip; +- } +- +- /* Output to all consoles once old messages replayed. */ +- if (unlikely(exclusive_console && +- console_seq >= exclusive_console_stop_seq)) { +- exclusive_console = NULL; +- } +- +- /* +- * Handle extended console text first because later +- * record_print_text() will modify the record buffer in-place. +- */ +- if (nr_ext_console_drivers) { +- ext_len = info_print_ext_header(ext_text, +- sizeof(ext_text), +- r.info); +- ext_len += msg_print_ext_body(ext_text + ext_len, +- sizeof(ext_text) - ext_len, +- &r.text_buf[0], +- r.info->text_len, +- &r.info->dev_info); +- } +- len = record_print_text(&r, +- console_msg_format & MSG_FORMAT_SYSLOG, +- printk_time); +- console_seq++; +- +- /* +- * While actively printing out messages, if another printk() +- * were to occur on another CPU, it may wait for this one to +- * finish. This task can not be preempted if there is a +- * waiter waiting to take over. +- * +- * Interrupts are disabled because the hand over to a waiter +- * must not be interrupted until the hand over is completed +- * (@console_waiter is cleared). 
+- */ +- printk_safe_enter_irqsave(flags); +- console_lock_spinning_enable(); +- +- stop_critical_timings(); /* don't trace print latency */ +- call_console_drivers(ext_text, ext_len, text, len); +- start_critical_timings(); +- +- handover = console_lock_spinning_disable_and_check(); +- printk_safe_exit_irqrestore(flags); +- if (handover) +- return; +- +- if (do_cond_resched) +- cond_resched(); +- } +- +- /* Get consistent value of the next-to-be-used sequence number. */ +- next_seq = console_seq; +- + console_locked = 0; + up_console_sem(); +- +- /* +- * Someone could have filled up the buffer again, so re-check if there's +- * something to flush. In case we cannot trylock the console_sem again, +- * there's a new owner and the console_unlock() from them will do the +- * flush, no worries. +- */ +- retry = prb_read_valid(prb, next_seq, NULL); +- if (retry && console_trylock()) +- goto again; + } + EXPORT_SYMBOL(console_unlock); + +@@ -2807,18 +2707,20 @@ void console_unblank(void) + */ + void console_flush_on_panic(enum con_flush_mode mode) + { +- /* +- * If someone else is holding the console lock, trylock will fail +- * and may_schedule may be set. Ignore and proceed to unlock so +- * that messages are flushed out. As this can be called from any +- * context and we don't want to get preempted while flushing, +- * ensure may_schedule is cleared. 
+- */ +- console_trylock(); +- console_may_schedule = 0; ++ if (!console_trylock()) ++ return; ++ ++#ifdef CONFIG_PRINTK ++ if (mode == CONSOLE_REPLAY_ALL) { ++ struct console *c; ++ u64 seq; ++ ++ seq = prb_first_valid_seq(prb); ++ for_each_console(c) ++ latched_seq_write(&c->printk_seq, seq); ++ } ++#endif + +- if (mode == CONSOLE_REPLAY_ALL) +- console_seq = prb_first_valid_seq(prb); + console_unlock(); + } + +@@ -2954,6 +2856,7 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) + void register_console(struct console *newcon) + { + struct console *bcon = NULL; ++ u64 __maybe_unused seq = 0; + int err; + + for_each_console(bcon) { +@@ -2976,6 +2879,8 @@ void register_console(struct console *newcon) + } + } + ++ newcon->thread = NULL; ++ + if (console_drivers && console_drivers->flags & CON_BOOT) + bcon = console_drivers; + +@@ -3017,8 +2922,10 @@ void register_console(struct console *newcon) + * the real console are the same physical device, it's annoying to + * see the beginning boot messages twice + */ +- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) ++ if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { + newcon->flags &= ~CON_PRINTBUFFER; ++ newcon->flags |= CON_HANDOVER; ++ } + + /* + * Put this console in the list - keep the +@@ -3040,27 +2947,21 @@ void register_console(struct console *newcon) + if (newcon->flags & CON_EXTENDED) + nr_ext_console_drivers++; + +- if (newcon->flags & CON_PRINTBUFFER) { +- /* +- * console_unlock(); will print out the buffered messages +- * for us. +- * +- * We're about to replay the log buffer. Only do this to the +- * just-registered console to avoid excessive message spam to +- * the already-registered consoles. +- * +- * Set exclusive_console with disabled interrupts to reduce +- * race window with eventual console_flush_on_panic() that +- * ignores console_lock. 
+- */ +- exclusive_console = newcon; +- exclusive_console_stop_seq = console_seq; ++#ifdef CONFIG_PRINTK ++ if (!(newcon->flags & CON_PRINTBUFFER)) ++ seq = prb_next_seq(prb); + +- /* Get a consistent copy of @syslog_seq. */ +- mutex_lock(&syslog_lock); +- console_seq = syslog_seq; +- mutex_unlock(&syslog_lock); +- } ++ seqcount_latch_init(&newcon->printk_seq.latch); ++ latched_seq_write(&newcon->printk_seq, seq); ++ seqcount_latch_init(&newcon->printk_sync_seq.latch); ++ latched_seq_write(&newcon->printk_sync_seq, seq); ++#ifdef CONFIG_HAVE_NMI ++ seqcount_latch_init(&newcon->printk_sync_nmi_seq.latch); ++ latched_seq_write(&newcon->printk_sync_nmi_seq, seq); ++#endif ++ ++ console_try_thread(newcon); ++#endif /* CONFIG_PRINTK */ + console_unlock(); + console_sysfs_notify(); + +@@ -3134,6 +3035,9 @@ int unregister_console(struct console *console) + console_unlock(); + console_sysfs_notify(); + ++ if (console->thread && !IS_ERR(console->thread)) ++ kthread_stop(console->thread); ++ + if (console->exit) + res = console->exit(console); + +@@ -3216,6 +3120,15 @@ static int __init printk_late_init(void) + unregister_console(con); + } + } ++ ++#ifdef CONFIG_PRINTK ++ console_lock(); ++ for_each_console(con) ++ start_printk_kthread(con); ++ kthreads_started = true; ++ console_unlock(); ++#endif ++ + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, + console_cpu_notify); + WARN_ON(ret < 0); +@@ -3239,14 +3152,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) + { + int pending = this_cpu_xchg(printk_pending, 0); + +- if (pending & PRINTK_PENDING_OUTPUT) { +- /* If trylock fails, someone else is doing the printing */ +- if (console_trylock()) +- console_unlock(); +- } +- + if (pending & PRINTK_PENDING_WAKEUP) +- wake_up_interruptible(&log_wait); ++ wake_up_interruptible_all(&log_wait); + } + + static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = +@@ -3293,7 +3200,7 @@ void defer_console_output(void) + + void 
printk_trigger_flush(void) + { +- defer_console_output(); ++ wake_up_klogd(); + } + + int vprintk_deferred(const char *fmt, va_list args) +@@ -3444,6 +3351,24 @@ void kmsg_dump(enum kmsg_dump_reason reason) + { + struct kmsg_dumper *dumper; + ++ if (!oops_in_progress) { ++ /* ++ * If atomic consoles are available, activate kernel sync mode ++ * to make sure any final messages are visible. The trailing ++ * printk message is important to flush any pending messages. ++ */ ++ if (have_atomic_console()) { ++ sync_mode = true; ++ pr_info("enabled sync mode\n"); ++ } ++ ++ /* ++ * Give the printing threads time to flush, allowing up to ++ * 1s of no printing forward progress before giving up. ++ */ ++ pr_flush(1000, true); ++ } ++ + rcu_read_lock(); + list_for_each_entry_rcu(dumper, &dump_list, list) { + enum kmsg_dump_reason max_reason = dumper->max_reason; +@@ -3626,6 +3551,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + #ifdef CONFIG_SMP + static atomic_t printk_cpulock_owner = ATOMIC_INIT(-1); + static atomic_t printk_cpulock_nested = ATOMIC_INIT(0); ++static unsigned int kgdb_cpu = -1; + + /** + * __printk_wait_on_cpu_lock() - Busy wait until the printk cpu-reentrant +@@ -3705,6 +3631,9 @@ EXPORT_SYMBOL(__printk_cpu_trylock); + */ + void __printk_cpu_unlock(void) + { ++ bool trigger_kgdb = false; ++ unsigned int cpu; ++ + if (atomic_read(&printk_cpulock_nested)) { + atomic_dec(&printk_cpulock_nested); + return; +@@ -3715,6 +3644,12 @@ void __printk_cpu_unlock(void) + * LMM(__printk_cpu_unlock:A) + */ + ++ cpu = smp_processor_id(); ++ if (kgdb_cpu == cpu) { ++ trigger_kgdb = true; ++ kgdb_cpu = -1; ++ } ++ + /* + * Guarantee loads and stores from this CPU when it was the + * lock owner are visible to the next lock owner. 
This pairs +@@ -3735,6 +3670,98 @@ void __printk_cpu_unlock(void) + */ + atomic_set_release(&printk_cpulock_owner, + -1); /* LMM(__printk_cpu_unlock:B) */ ++ ++ if (trigger_kgdb) { ++ pr_warn("re-triggering kgdb roundup for CPU#%d\n", cpu); ++ kgdb_roundup_cpu(cpu); ++ } + } + EXPORT_SYMBOL(__printk_cpu_unlock); ++ ++bool kgdb_roundup_delay(unsigned int cpu) ++{ ++ if (cpu != atomic_read(&printk_cpulock_owner)) ++ return false; ++ ++ kgdb_cpu = cpu; ++ return true; ++} ++EXPORT_SYMBOL(kgdb_roundup_delay); + #endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_PRINTK ++static void pr_msleep(bool may_sleep, int ms) ++{ ++ if (may_sleep) { ++ msleep(ms); ++ } else { ++ while (ms--) ++ udelay(1000); ++ } ++} ++ ++/** ++ * pr_flush() - Wait for printing threads to catch up. ++ * ++ * @timeout_ms: The maximum time (in ms) to wait. ++ * @reset_on_progress: Reset the timeout if forward progress is seen. ++ * ++ * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 ++ * represents infinite waiting. ++ * ++ * If @reset_on_progress is true, the timeout will be reset whenever any ++ * printer has been seen to make some forward progress. ++ * ++ * Context: Any context. ++ * Return: true if all enabled printers are caught up. 
++ */ ++bool pr_flush(int timeout_ms, bool reset_on_progress) ++{ ++ int remaining = timeout_ms; ++ struct console *con; ++ u64 last_diff = 0; ++ bool may_sleep; ++ u64 printk_seq; ++ u64 diff; ++ u64 seq; ++ ++ may_sleep = (preemptible() && ++ !in_softirq() && ++ system_state >= SYSTEM_RUNNING); ++ ++ seq = prb_next_seq(prb); ++ ++ for (;;) { ++ diff = 0; ++ ++ for_each_console(con) { ++ if (!(con->flags & CON_ENABLED)) ++ continue; ++ printk_seq = read_console_seq(con); ++ if (printk_seq < seq) ++ diff += seq - printk_seq; ++ } ++ ++ if (diff != last_diff && reset_on_progress) ++ remaining = timeout_ms; ++ ++ if (diff == 0 || remaining == 0) ++ break; ++ ++ if (remaining < 0) { ++ pr_msleep(may_sleep, 100); ++ } else if (remaining < 100) { ++ pr_msleep(may_sleep, remaining); ++ remaining = 0; ++ } else { ++ pr_msleep(may_sleep, 100); ++ remaining -= 100; ++ } ++ ++ last_diff = diff; ++ } ++ ++ return (diff == 0); ++} ++EXPORT_SYMBOL(pr_flush); ++#endif /* CONFIG_PRINTK */ +diff --git a/kernel/ptrace.c b/kernel/ptrace.c +index 0cf547531ddf..0df2de214daa 100644 +--- a/kernel/ptrace.c ++++ b/kernel/ptrace.c +@@ -197,7 +197,18 @@ static bool ptrace_freeze_traced(struct task_struct *task) + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { ++#ifdef CONFIG_PREEMPT_RT ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (READ_ONCE(task->__state) & __TASK_TRACED) ++ WRITE_ONCE(task->__state, __TASK_TRACED); ++ else ++ task->saved_state = __TASK_TRACED; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++#else + WRITE_ONCE(task->__state, __TASK_TRACED); ++#endif + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); +@@ -207,7 +218,11 @@ static bool ptrace_freeze_traced(struct task_struct *task) + + static void ptrace_unfreeze_traced(struct task_struct *task) + { +- if (READ_ONCE(task->__state) != __TASK_TRACED) ++ unsigned long flags; ++ bool 
frozen = true; ++ ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT) && ++ READ_ONCE(task->__state) != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); +@@ -217,12 +232,21 @@ static void ptrace_unfreeze_traced(struct task_struct *task) + * Recheck state under the lock to close this race. + */ + spin_lock_irq(&task->sighand->siglock); +- if (READ_ONCE(task->__state) == __TASK_TRACED) { +- if (__fatal_signal_pending(task)) +- wake_up_state(task, __TASK_TRACED); +- else +- WRITE_ONCE(task->__state, TASK_TRACED); +- } ++ raw_spin_lock_irqsave(&task->pi_lock, flags); ++ if (READ_ONCE(task->__state) == __TASK_TRACED) ++ WRITE_ONCE(task->__state, TASK_TRACED); ++ ++#ifdef CONFIG_PREEMPT_RT ++ else if (task->saved_state == __TASK_TRACED) ++ task->saved_state = TASK_TRACED; ++#endif ++ else ++ frozen = false; ++ raw_spin_unlock_irqrestore(&task->pi_lock, flags); ++ ++ if (frozen && __fatal_signal_pending(task)) ++ wake_up_state(task, __TASK_TRACED); ++ + spin_unlock_irq(&task->sighand->siglock); + } + +diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h +index 94b8ee84bc78..509ea934305c 100644 +--- a/kernel/rcu/tasks.h ++++ b/kernel/rcu/tasks.h +@@ -1362,7 +1362,7 @@ static void test_rcu_tasks_callback(struct rcu_head *rhp) + rttd->notrun = true; + } + +-static void rcu_tasks_initiate_self_tests(void) ++void rcu_tasks_initiate_self_tests(void) + { + pr_info("Running RCU-tasks wait API self tests\n"); + #ifdef CONFIG_TASKS_RCU +@@ -1399,9 +1399,7 @@ static int rcu_tasks_verify_self_tests(void) + return ret; + } + late_initcall(rcu_tasks_verify_self_tests); +-#else /* #ifdef CONFIG_PROVE_RCU */ +-static void rcu_tasks_initiate_self_tests(void) { } +-#endif /* #else #ifdef CONFIG_PROVE_RCU */ ++#endif /* #ifdef CONFIG_PROVE_RCU */ + + void __init rcu_init_tasks_generic(void) + { +@@ -1416,9 +1414,6 @@ void __init rcu_init_tasks_generic(void) + #ifdef CONFIG_TASKS_TRACE_RCU + rcu_spawn_tasks_trace_kthread(); + #endif +- +- // Run the self-tests. 
+- rcu_tasks_initiate_self_tests(); + } + + #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index df016f6d0662..8ea272f7eb18 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2280,13 +2280,13 @@ rcu_report_qs_rdp(struct rcu_data *rdp) + { + unsigned long flags; + unsigned long mask; +- bool needwake = false; +- const bool offloaded = rcu_rdp_is_offloaded(rdp); ++ bool offloaded, needwake = false; + struct rcu_node *rnp; + + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); + rnp = rdp->mynode; + raw_spin_lock_irqsave_rcu_node(rnp, flags); ++ offloaded = rcu_rdp_is_offloaded(rdp); + if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || + rdp->gpwrap) { + +@@ -2448,7 +2448,7 @@ static void rcu_do_batch(struct rcu_data *rdp) + int div; + bool __maybe_unused empty; + unsigned long flags; +- const bool offloaded = rcu_rdp_is_offloaded(rdp); ++ bool offloaded; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + long bl, count = 0; +@@ -2474,6 +2474,7 @@ static void rcu_do_batch(struct rcu_data *rdp) + rcu_nocb_lock(rdp); + WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); + pending = rcu_segcblist_n_cbs(&rdp->cblist); ++ offloaded = rcu_rdp_is_offloaded(rdp); + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d34a56f16d13..cd0983900823 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -75,7 +75,11 @@ __read_mostly int sysctl_resched_latency_warn_once = 1; + * Number of tasks to iterate in a single balance run. + * Limited because this is done with IRQs disabled. + */ ++#ifdef CONFIG_PREEMPT_RT ++const_debug unsigned int sysctl_sched_nr_migrate = 8; ++#else + const_debug unsigned int sysctl_sched_nr_migrate = 32; ++#endif + + /* + * period over which we measure -rt task CPU usage in us. 
+@@ -983,6 +987,46 @@ void resched_curr(struct rq *rq) + trace_sched_wake_idle_without_ipi(cpu); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++ ++static int tsk_is_polling(struct task_struct *p) ++{ ++#ifdef TIF_POLLING_NRFLAG ++ return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); ++#else ++ return 0; ++#endif ++} ++ ++void resched_curr_lazy(struct rq *rq) ++{ ++ struct task_struct *curr = rq->curr; ++ int cpu; ++ ++ if (!sched_feat(PREEMPT_LAZY)) { ++ resched_curr(rq); ++ return; ++ } ++ ++ if (test_tsk_need_resched(curr)) ++ return; ++ ++ if (test_tsk_need_resched_lazy(curr)) ++ return; ++ ++ set_tsk_need_resched_lazy(curr); ++ ++ cpu = cpu_of(rq); ++ if (cpu == smp_processor_id()) ++ return; ++ ++ /* NEED_RESCHED_LAZY must be visible before we test polling */ ++ smp_mb(); ++ if (!tsk_is_polling(curr)) ++ smp_send_reschedule(cpu); ++} ++#endif ++ + void resched_cpu(int cpu) + { + struct rq *rq = cpu_rq(cpu); +@@ -2141,6 +2185,7 @@ void migrate_disable(void) + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; ++ preempt_lazy_disable(); + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_disable); +@@ -2152,6 +2197,8 @@ void migrate_enable(void) + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; ++ } else if (WARN_ON_ONCE(p->migration_disabled == 0)) { ++ return; + } + + /* +@@ -2169,6 +2216,7 @@ void migrate_enable(void) + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; ++ preempt_lazy_enable(); + preempt_enable(); + } + EXPORT_SYMBOL_GPL(migrate_enable); +@@ -3235,7 +3283,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state + * is actually now running somewhere else! 
+ */ + while (task_running(rq, p)) { +- if (match_state && unlikely(READ_ONCE(p->__state) != match_state)) ++ if (match_state && !task_match_state_lock(p, match_state)) + return 0; + cpu_relax(); + } +@@ -3250,7 +3298,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state + running = task_running(rq, p); + queued = task_on_rq_queued(p); + ncsw = 0; +- if (!match_state || READ_ONCE(p->__state) == match_state) ++ if (!match_state || task_match_state_or_saved(p, match_state)) + ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ + task_rq_unlock(rq, p, &rf); + +@@ -3284,7 +3332,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state + ktime_t to = NSEC_PER_SEC / HZ; + + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); + continue; + } + +@@ -4427,6 +4475,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + p->on_cpu = 0; + #endif + init_task_preempt_count(p); ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(p)->preempt_lazy_count = 0; ++#endif + #ifdef CONFIG_SMP + plist_node_init(&p->pushable_tasks, MAX_PRIO); + RB_CLEAR_NODE(&p->pushable_dl_tasks); +@@ -4922,20 +4973,18 @@ static struct rq *finish_task_switch(struct task_struct *prev) + */ + if (mm) { + membarrier_mm_sync_core_before_usermode(mm); +- mmdrop(mm); ++ mmdrop_sched(mm); + } + if (unlikely(prev_state == TASK_DEAD)) { + if (prev->sched_class->task_dead) + prev->sched_class->task_dead(prev); + + /* +- * Remove function-return probe instances associated with this +- * task and put them back on the free list. ++ * Release VMAP'ed task stack immediate for reuse. On RT ++ * enabled kernels this is delayed for latency reasons. + */ +- kprobe_flush_task(prev); +- +- /* Task is done with its stack. 
*/ +- put_task_stack(prev); ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) ++ put_task_stack(prev); + + put_task_struct_rcu_user(prev); + } +@@ -6335,6 +6384,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + next = pick_next_task(rq, prev, &rf); + clear_tsk_need_resched(prev); ++ clear_tsk_need_resched_lazy(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG + rq->last_seen_need_resched_ns = 0; +@@ -6556,6 +6606,30 @@ static void __sched notrace preempt_schedule_common(void) + } while (need_resched()); + } + ++#ifdef CONFIG_PREEMPT_LAZY ++/* ++ * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is ++ * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as ++ * preempt_lazy_count counter >0. ++ */ ++static __always_inline int preemptible_lazy(void) ++{ ++ if (test_thread_flag(TIF_NEED_RESCHED)) ++ return 1; ++ if (current_thread_info()->preempt_lazy_count) ++ return 0; ++ return 1; ++} ++ ++#else ++ ++static inline int preemptible_lazy(void) ++{ ++ return 1; ++} ++ ++#endif ++ + #ifdef CONFIG_PREEMPTION + /* + * This is the entry point to schedule() from in-kernel preemption +@@ -6569,7 +6643,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) + */ + if (likely(!preemptible())) + return; +- ++ if (!preemptible_lazy()) ++ return; + preempt_schedule_common(); + } + NOKPROBE_SYMBOL(preempt_schedule); +@@ -6602,6 +6677,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) + if (likely(!preemptible())) + return; + ++ if (!preemptible_lazy()) ++ return; ++ + do { + /* + * Because the function tracer can trace preempt_count_sub() +@@ -8754,7 +8832,9 @@ void __init init_idle(struct task_struct *idle, int cpu) + + /* Set the preempt count _outside_ the spinlocks! 
*/ + init_idle_preempt_count(idle, cpu); +- ++#ifdef CONFIG_HAVE_PREEMPT_LAZY ++ task_thread_info(idle)->preempt_lazy_count = 0; ++#endif + /* + * The idle tasks have their own, simple scheduling class: + */ +@@ -9555,14 +9635,8 @@ void __init sched_init(void) + } + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP +-static inline int preempt_count_equals(int preempt_offset) +-{ +- int nested = preempt_count() + rcu_preempt_depth(); +- +- return (nested == preempt_offset); +-} + +-void __might_sleep(const char *file, int line, int preempt_offset) ++void __might_sleep(const char *file, int line) + { + unsigned int state = get_current_state(); + /* +@@ -9576,11 +9650,32 @@ void __might_sleep(const char *file, int line, int preempt_offset) + (void *)current->task_state_change, + (void *)current->task_state_change); + +- ___might_sleep(file, line, preempt_offset); ++ __might_resched(file, line, 0); + } + EXPORT_SYMBOL(__might_sleep); + +-void ___might_sleep(const char *file, int line, int preempt_offset) ++static void print_preempt_disable_ip(int preempt_offset, unsigned long ip) ++{ ++ if (!IS_ENABLED(CONFIG_DEBUG_PREEMPT)) ++ return; ++ ++ if (preempt_count() == preempt_offset) ++ return; ++ ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(KERN_ERR, ip); ++} ++ ++static inline bool resched_offsets_ok(unsigned int offsets) ++{ ++ unsigned int nested = preempt_count(); ++ ++ nested += rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT; ++ ++ return nested == offsets; ++} ++ ++void __might_resched(const char *file, int line, unsigned int offsets) + { + /* Ratelimiting timestamp: */ + static unsigned long prev_jiffy; +@@ -9590,7 +9685,7 @@ void ___might_sleep(const char *file, int line, int preempt_offset) + /* WARN_ON_ONCE() by default, no rate limit required: */ + rcu_sleep_check(); + +- if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ if ((resched_offsets_ok(offsets) && !irqs_disabled() && + !is_idle_task(current) && !current->non_block_count) || + system_state 
== SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || + oops_in_progress) +@@ -9603,29 +9698,33 @@ void ___might_sleep(const char *file, int line, int preempt_offset) + /* Save this before calling printk(), since that will clobber it: */ + preempt_disable_ip = get_preempt_disable_ip(current); + +- printk(KERN_ERR +- "BUG: sleeping function called from invalid context at %s:%d\n", +- file, line); +- printk(KERN_ERR +- "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", +- in_atomic(), irqs_disabled(), current->non_block_count, +- current->pid, current->comm); ++ pr_err("BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ pr_err("in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), current->non_block_count, ++ current->pid, current->comm); ++ pr_err("preempt_count: %x, expected: %x\n", preempt_count(), ++ offsets & MIGHT_RESCHED_PREEMPT_MASK); ++ ++ if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { ++ pr_err("RCU nest depth: %d, expected: %u\n", ++ rcu_preempt_depth(), offsets >> MIGHT_RESCHED_RCU_SHIFT); ++ } + + if (task_stack_end_corrupted(current)) +- printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ pr_emerg("Thread overran stack, or stack corrupted\n"); + + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); +- if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) +- && !preempt_count_equals(preempt_offset)) { +- pr_err("Preemption disabled at:"); +- print_ip_sym(KERN_ERR, preempt_disable_ip); +- } ++ ++ print_preempt_disable_ip(offsets & MIGHT_RESCHED_PREEMPT_MASK, ++ preempt_disable_ip); ++ + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); + } +-EXPORT_SYMBOL(___might_sleep); ++EXPORT_SYMBOL(__might_resched); + + void __cant_sleep(const char *file, int line, int preempt_offset) + { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 646a6ae4b250..c02ecc105f0c 100644 +--- a/kernel/sched/fair.c ++++ 
b/kernel/sched/fair.c +@@ -4651,7 +4651,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + ideal_runtime = sched_slice(cfs_rq, curr); + delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; + if (delta_exec > ideal_runtime) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + /* + * The current task ran long enough, ensure it doesn't get + * re-elected due to buddy favours. +@@ -4675,7 +4675,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) + return; + + if (delta > ideal_runtime) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static void +@@ -4821,7 +4821,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + * validating it and just reschedule. + */ + if (queued) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + return; + } + /* +@@ -4961,7 +4961,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) + * hierarchy can be throttled + */ + if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + } + + static __always_inline +@@ -5724,7 +5724,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) + + if (delta < 0) { + if (task_current(rq, p)) +- resched_curr(rq); ++ resched_curr_lazy(rq); + return; + } + hrtick_start(rq, delta); +@@ -7449,7 +7449,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ + return; + + preempt: +- resched_curr(rq); ++ resched_curr_lazy(rq); + /* + * Only set the backward buddy when the current task is still + * on the rq. This can happen when a wakeup gets interleaved +@@ -11508,7 +11508,7 @@ static void task_fork_fair(struct task_struct *p) + * 'current' within the tree based on its new key value. 
+ */ + swap(curr->vruntime, se->vruntime); +- resched_curr(rq); ++ resched_curr_lazy(rq); + } + + se->vruntime -= cfs_rq->min_vruntime; +@@ -11535,7 +11535,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) + */ + if (task_current(rq, p)) { + if (p->prio > oldprio) +- resched_curr(rq); ++ resched_curr_lazy(rq); + } else + check_preempt_curr(rq, p, 0); + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index c4947c1b5edb..e13090e33f3c 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -46,11 +46,19 @@ SCHED_FEAT(DOUBLE_TICK, false) + */ + SCHED_FEAT(NONTASK_CAPACITY, true) + ++#ifdef CONFIG_PREEMPT_RT ++SCHED_FEAT(TTWU_QUEUE, false) ++# ifdef CONFIG_PREEMPT_LAZY ++SCHED_FEAT(PREEMPT_LAZY, true) ++# endif ++#else ++ + /* + * Queue remote wakeups on the target CPU and process them + * using the scheduler IPI. Reduces rq->lock contention/bounces. + */ + SCHED_FEAT(TTWU_QUEUE, true) ++#endif + + /* + * When doing wakeups, attempt to limit superfluous scans of the LLC domain. 
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 6312f1904825..36483b794a00 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2327,6 +2327,15 @@ extern void reweight_task(struct task_struct *p, int prio); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + ++#ifdef CONFIG_PREEMPT_LAZY ++extern void resched_curr_lazy(struct rq *rq); ++#else ++static inline void resched_curr_lazy(struct rq *rq) ++{ ++ resched_curr(rq); ++} ++#endif ++ + extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + +diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c +index e1c655f928c7..f230b1ac7f91 100644 +--- a/kernel/sched/swait.c ++++ b/kernel/sched/swait.c +@@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q) + struct swait_queue *curr; + LIST_HEAD(tmp); + ++ WARN_ON(irqs_disabled()); + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 4e8698e62f07..3d0157bd4e14 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -526,7 +526,7 @@ static int init_rootdomain(struct root_domain *rd) + #ifdef HAVE_RT_PUSH_IPI + rd->rto_cpu = -1; + raw_spin_lock_init(&rd->rto_lock); +- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); ++ rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); + #endif + + rd->visit_gen = 0; +diff --git a/kernel/signal.c b/kernel/signal.c +index c7dbb19219b9..0bbd89fbf240 100644 +--- a/kernel/signal.c ++++ b/kernel/signal.c +@@ -1324,6 +1324,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t, + struct k_sigaction *action; + int sig = info->si_signo; + ++ /* ++ * On some archs, PREEMPT_RT has to delay sending a signal from a trap ++ * since it can not enable preemption, and the signal code's spin_locks ++ * turn into mutexes. 
Instead, it must set TIF_NOTIFY_RESUME which will ++ * send the signal on exit of the trap. ++ */ ++#ifdef ARCH_RT_DELAYS_SIGNAL_SEND ++ if (in_atomic()) { ++ struct task_struct *t = current; ++ ++ if (WARN_ON_ONCE(t->forced_info.si_signo)) ++ return 0; ++ ++ if (is_si_special(info)) { ++ WARN_ON_ONCE(info != SEND_SIG_PRIV); ++ t->forced_info.si_signo = info->si_signo; ++ t->forced_info.si_errno = 0; ++ t->forced_info.si_code = SI_KERNEL; ++ t->forced_info.si_pid = 0; ++ t->forced_info.si_uid = 0; ++ } else { ++ t->forced_info = *info; ++ } ++ ++ set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); ++ return 0; ++ } ++#endif + spin_lock_irqsave(&t->sighand->siglock, flags); + action = &t->sighand->action[sig-1]; + ignored = action->sa.sa_handler == SIG_IGN; +@@ -2308,16 +2336,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t + if (gstop_done && ptrace_reparented(current)) + do_notify_parent_cldstop(current, false, why); + +- /* +- * Don't want to allow preemption here, because +- * sys_ptrace() needs this task to be inactive. +- * +- * XXX: implement read_unlock_no_resched(). 
+- */ +- preempt_disable(); + read_unlock(&tasklist_lock); + cgroup_enter_frozen(); +- preempt_enable_no_resched(); + freezable_schedule(); + cgroup_leave_frozen(true); + } else { +diff --git a/kernel/smp.c b/kernel/smp.c +index 82825345432c..9d3c8c56d904 100644 +--- a/kernel/smp.c ++++ b/kernel/smp.c +@@ -690,10 +690,20 @@ void flush_smp_call_function_from_idle(void) + + cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU, + smp_processor_id(), CFD_SEQ_IDLE); ++ + local_irq_save(flags); + flush_smp_call_function_queue(true); +- if (local_softirq_pending()) +- do_softirq(); ++ ++ if (local_softirq_pending()) { ++ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ do_softirq(); ++ } else { ++ struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); ++ ++ if (ksoftirqd && !task_is_running(ksoftirqd)) ++ wake_up_process(ksoftirqd); ++ } ++ } + + local_irq_restore(flags); + } +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 0202f23ae960..7fc118c87b9d 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2646,7 +2646,13 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; +- return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | ++#ifdef CONFIG_PREEMPT_LAZY ++ if (need_resched_lazy()) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; ++#endif ++ ++ return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) | ++ (preempt_lazy_count() & 0xff) << 16 | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; + } + +@@ -4227,15 +4233,17 @@ unsigned long trace_total_entries(struct trace_array *tr) + + static void print_lat_help_header(struct seq_file *m) + { +- seq_puts(m, "# _------=> CPU# \n" +- "# / _-----=> irqs-off \n" +- "# | / _----=> need-resched \n" +- "# || / _---=> hardirq/softirq \n" +- "# ||| / _--=> preempt-depth \n" +- "# |||| / _-=> migrate-disable \n" +- "# ||||| / delay 
\n" +- "# cmd pid |||||| time | caller \n" +- "# \\ / |||||| \\ | / \n"); ++ seq_puts(m, "# _--------=> CPU# \n" ++ "# / _-------=> irqs-off \n" ++ "# | / _------=> need-resched \n" ++ "# || / _-----=> need-resched-lazy\n" ++ "# ||| / _----=> hardirq/softirq \n" ++ "# |||| / _---=> preempt-depth \n" ++ "# ||||| / _--=> preempt-lazy-depth\n" ++ "# |||||| / _-=> migrate-disable \n" ++ "# ||||||| / delay \n" ++ "# cmd pid |||||||| time | caller \n" ++ "# \\ / |||||||| \\ | / \n"); + } + + static void print_event_info(struct array_buffer *buf, struct seq_file *m) +@@ -4269,14 +4277,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file + + print_event_info(buf, m); + +- seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); +- seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); +- seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); +- seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); +- seq_printf(m, "# %.*s||| / _-=> migrate-disable\n", prec, space); +- seq_printf(m, "# %.*s|||| / delay\n", prec, space); +- seq_printf(m, "# TASK-PID %.*s CPU# ||||| TIMESTAMP FUNCTION\n", prec, " TGID "); +- seq_printf(m, "# | | %.*s | ||||| | |\n", prec, " | "); ++ seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); ++ seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); ++ seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); ++ seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); ++ seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); ++ seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); ++ seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); ++ seq_printf(m, "# %.*s|||||| / delay\n", prec, space); ++ seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); ++ seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); + } + + void +diff --git a/kernel/trace/trace_events.c 
b/kernel/trace/trace_events.c +index 160298d285c0..9ec3c6c38cc3 100644 +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -193,6 +193,7 @@ static int trace_define_common_fields(void) + /* Holds both preempt_count and migrate_disable */ + __common_field(unsigned char, preempt_count); + __common_field(int, pid); ++ __common_field(unsigned char, preempt_lazy_count); + + return ret; + } +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index 6b4d3f3abdae..460bc8245e4a 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -451,6 +451,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + { + char hardsoft_irq; + char need_resched; ++ char need_resched_lazy; + char irqs_off; + int hardirq; + int softirq; +@@ -481,6 +482,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + break; + } + ++ need_resched_lazy = ++ (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; ++ + hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : +@@ -489,14 +493,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + softirq ? 's' : + '.' 
; + +- trace_seq_printf(s, "%c%c%c", +- irqs_off, need_resched, hardsoft_irq); ++ trace_seq_printf(s, "%c%c%c%c", ++ irqs_off, need_resched, need_resched_lazy, ++ hardsoft_irq); + + if (entry->preempt_count & 0xf) + trace_seq_printf(s, "%x", entry->preempt_count & 0xf); + else + trace_seq_putc(s, '.'); + ++ if (entry->preempt_lazy_count) ++ trace_seq_printf(s, "%x", entry->preempt_lazy_count); ++ else ++ trace_seq_putc(s, '.'); ++ + if (entry->preempt_count & 0xf0) + trace_seq_printf(s, "%x", entry->preempt_count >> 4); + else +diff --git a/lib/bug.c b/lib/bug.c +index 45a0584f6541..03a87df69ed2 100644 +--- a/lib/bug.c ++++ b/lib/bug.c +@@ -206,6 +206,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) + else + pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", + (void *)bugaddr); ++ pr_flush(1000, true); + + return BUG_TRAP_TYPE_BUG; + } +diff --git a/lib/dump_stack.c b/lib/dump_stack.c +index 6b7f1bf6715d..6e8ae42c7e27 100644 +--- a/lib/dump_stack.c ++++ b/lib/dump_stack.c +@@ -102,9 +102,9 @@ asmlinkage __visible void dump_stack_lvl(const char *log_lvl) + * Permit this cpu to perform nested stack dumps while serialising + * against other CPUs + */ +- printk_cpu_lock_irqsave(flags); ++ raw_printk_cpu_lock_irqsave(flags); + __dump_stack(log_lvl); +- printk_cpu_unlock_irqrestore(flags); ++ raw_printk_cpu_unlock_irqrestore(flags); + } + EXPORT_SYMBOL(dump_stack_lvl); + +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 2f17b488d58e..2b9f797642f6 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -191,11 +191,13 @@ static int irq_poll_cpu_dead(unsigned int cpu) + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ ++ local_bh_disable(); + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), + this_cpu_ptr(&blk_cpu_iopoll)); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); + local_irq_enable(); ++ local_bh_enable(); + + return 0; + } +diff --git 
a/lib/locking-selftest.c b/lib/locking-selftest.c +index 161108e5d2fe..1266ea3726d7 100644 +--- a/lib/locking-selftest.c ++++ b/lib/locking-selftest.c +@@ -26,6 +26,12 @@ + #include + #include + ++#ifdef CONFIG_PREEMPT_RT ++# define NON_RT(...) ++#else ++# define NON_RT(...) __VA_ARGS__ ++#endif ++ + /* + * Change this to 1 if you want to see the failure printouts: + */ +@@ -139,7 +145,7 @@ static DEFINE_RT_MUTEX(rtmutex_Z2); + + #endif + +-static local_lock_t local_A = INIT_LOCAL_LOCK(local_A); ++static DEFINE_PER_CPU(local_lock_t, local_A); + + /* + * non-inlined runtime initializers, to let separate locks share +@@ -712,12 +718,18 @@ GENERATE_TESTCASE(ABCDBCDA_rtmutex); + + #undef E + ++#ifdef CONFIG_PREEMPT_RT ++# define RT_PREPARE_DBL_UNLOCK() { migrate_disable(); rcu_read_lock(); } ++#else ++# define RT_PREPARE_DBL_UNLOCK() ++#endif + /* + * Double unlock: + */ + #define E() \ + \ + LOCK(A); \ ++ RT_PREPARE_DBL_UNLOCK(); \ + UNLOCK(A); \ + UNLOCK(A); /* fail */ + +@@ -802,6 +814,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_spin) + +@@ -810,10 +823,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) ++#endif + + #undef E1 + #undef E2 + ++#ifndef CONFIG_PREEMPT_RT + /* + * Enabling hardirqs with a softirq-safe lock held: + */ +@@ -846,6 +861,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) + #undef E1 + #undef E2 + ++#endif ++ + /* + * Enabling irqs with an irq-safe lock held: + */ +@@ -875,6 +892,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include 
"locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_spin) + +@@ -883,6 +901,7 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -921,6 +940,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_spin) + +@@ -929,6 +949,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -969,6 +990,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_spin) + +@@ -977,6 +999,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1031,6 +1054,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_rlock) + #include "locking-selftest-wlock-hardirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-spin-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_spin) + +@@ -1039,6 +1063,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_rlock) + + #include "locking-selftest-wlock-softirq.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_inversion_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1206,12 +1231,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_rlock) + #include "locking-selftest-wlock.h" + 
GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-softirq.h" + #include "locking-selftest-rlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_rlock) + + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1252,12 +1279,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_rlock) + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-softirq.h" + #include "locking-selftest-rlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_rlock) + + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion2_soft_wlock) ++#endif + + #undef E1 + #undef E2 +@@ -1306,12 +1335,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_rlock) + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_hard_wlock) + ++#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-softirq.h" + #include "locking-selftest-rlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_rlock) + + #include "locking-selftest-wlock.h" + GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) ++#endif + + #ifdef CONFIG_DEBUG_LOCK_ALLOC + # define I_SPINLOCK(x) lockdep_reset_lock(&lock_##x.dep_map) +@@ -1320,7 +1351,7 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion3_soft_wlock) + # define I_MUTEX(x) lockdep_reset_lock(&mutex_##x.dep_map) + # define I_RWSEM(x) lockdep_reset_lock(&rwsem_##x.dep_map) + # define I_WW(x) lockdep_reset_lock(&x.dep_map) +-# define I_LOCAL_LOCK(x) lockdep_reset_lock(&local_##x.dep_map) ++# define I_LOCAL_LOCK(x) lockdep_reset_lock(this_cpu_ptr(&local_##x.dep_map)) + #ifdef CONFIG_RT_MUTEXES + # define I_RTMUTEX(x) lockdep_reset_lock(&rtmutex_##x.dep_map) + #endif +@@ -1380,7 +1411,7 @@ static void 
reset_locks(void) + init_shared_classes(); + raw_spin_lock_init(&raw_lock_A); + raw_spin_lock_init(&raw_lock_B); +- local_lock_init(&local_A); ++ local_lock_init(this_cpu_ptr(&local_A)); + + ww_mutex_init(&o, &ww_lockdep); ww_mutex_init(&o2, &ww_lockdep); ww_mutex_init(&o3, &ww_lockdep); + memset(&t, 0, sizeof(t)); memset(&t2, 0, sizeof(t2)); +@@ -1398,7 +1429,13 @@ static int unexpected_testcase_failures; + + static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) + { +- unsigned long saved_preempt_count = preempt_count(); ++ int saved_preempt_count = preempt_count(); ++#ifdef CONFIG_PREEMPT_RT ++#ifdef CONFIG_SMP ++ int saved_mgd_count = current->migration_disabled; ++#endif ++ int saved_rcu_count = current->rcu_read_lock_nesting; ++#endif + + WARN_ON(irqs_disabled()); + +@@ -1432,6 +1469,18 @@ static void dotest(void (*testcase_fn)(void), int expected, int lockclass_mask) + * count, so restore it: + */ + preempt_count_set(saved_preempt_count); ++ ++#ifdef CONFIG_PREEMPT_RT ++#ifdef CONFIG_SMP ++ while (current->migration_disabled > saved_mgd_count) ++ migrate_enable(); ++#endif ++ ++ while (current->rcu_read_lock_nesting > saved_rcu_count) ++ rcu_read_unlock(); ++ WARN_ON_ONCE(current->rcu_read_lock_nesting < saved_rcu_count); ++#endif ++ + #ifdef CONFIG_TRACE_IRQFLAGS + if (softirq_count()) + current->softirqs_enabled = 0; +@@ -1499,7 +1548,7 @@ static inline void print_testname(const char *testname) + + #define DO_TESTCASE_2x2RW(desc, name, nr) \ + DO_TESTCASE_2RW("hard-"desc, name##_hard, nr) \ +- DO_TESTCASE_2RW("soft-"desc, name##_soft, nr) \ ++ NON_RT(DO_TESTCASE_2RW("soft-"desc, name##_soft, nr)) \ + + #define DO_TESTCASE_6x2x2RW(desc, name) \ + DO_TESTCASE_2x2RW(desc, name, 123); \ +@@ -1547,19 +1596,19 @@ static inline void print_testname(const char *testname) + + #define DO_TESTCASE_2I(desc, name, nr) \ + DO_TESTCASE_1("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_1("soft-"desc, name##_soft, nr); ++ 
NON_RT(DO_TESTCASE_1("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_2IB(desc, name, nr) \ + DO_TESTCASE_1B("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_1B("soft-"desc, name##_soft, nr); ++ NON_RT(DO_TESTCASE_1B("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_6I(desc, name, nr) \ + DO_TESTCASE_3("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_3("soft-"desc, name##_soft, nr); ++ NON_RT(DO_TESTCASE_3("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_6IRW(desc, name, nr) \ + DO_TESTCASE_3RW("hard-"desc, name##_hard, nr); \ +- DO_TESTCASE_3RW("soft-"desc, name##_soft, nr); ++ NON_RT(DO_TESTCASE_3RW("soft-"desc, name##_soft, nr)); + + #define DO_TESTCASE_2x3(desc, name) \ + DO_TESTCASE_3(desc, name, 12); \ +@@ -1651,6 +1700,20 @@ static void ww_test_fail_acquire(void) + #endif + } + ++#ifdef CONFIG_PREEMPT_RT ++#define ww_mutex_base_lock(b) rt_mutex_lock(b) ++#define ww_mutex_base_lock_nest_lock(b, b2) rt_mutex_lock_nest_lock(b, b2) ++#define ww_mutex_base_lock_interruptible(b) rt_mutex_lock_interruptible(b) ++#define ww_mutex_base_lock_killable(b) rt_mutex_lock_killable(b) ++#define ww_mutex_base_unlock(b) rt_mutex_unlock(b) ++#else ++#define ww_mutex_base_lock(b) mutex_lock(b) ++#define ww_mutex_base_lock_nest_lock(b, b2) mutex_lock_nest_lock(b, b2) ++#define ww_mutex_base_lock_interruptible(b) mutex_lock_interruptible(b) ++#define ww_mutex_base_lock_killable(b) mutex_lock_killable(b) ++#define ww_mutex_base_unlock(b) mutex_unlock(b) ++#endif ++ + static void ww_test_normal(void) + { + int ret; +@@ -1665,50 +1728,50 @@ static void ww_test_normal(void) + + /* mutex_lock (and indirectly, mutex_lock_nested) */ + o.ctx = (void *)~0UL; +- mutex_lock(&o.base); +- mutex_unlock(&o.base); ++ ww_mutex_base_lock(&o.base); ++ ww_mutex_base_unlock(&o.base); + WARN_ON(o.ctx != (void *)~0UL); + + /* mutex_lock_interruptible (and *_nested) */ + o.ctx = (void *)~0UL; +- ret = mutex_lock_interruptible(&o.base); ++ ret = 
ww_mutex_base_lock_interruptible(&o.base); + if (!ret) +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + else + WARN_ON(1); + WARN_ON(o.ctx != (void *)~0UL); + + /* mutex_lock_killable (and *_nested) */ + o.ctx = (void *)~0UL; +- ret = mutex_lock_killable(&o.base); ++ ret = ww_mutex_base_lock_killable(&o.base); + if (!ret) +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + else + WARN_ON(1); + WARN_ON(o.ctx != (void *)~0UL); + + /* trylock, succeeding */ + o.ctx = (void *)~0UL; +- ret = mutex_trylock(&o.base); ++ ret = ww_mutex_base_trylock(&o.base); + WARN_ON(!ret); + if (ret) +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + else + WARN_ON(1); + WARN_ON(o.ctx != (void *)~0UL); + + /* trylock, failing */ + o.ctx = (void *)~0UL; +- mutex_lock(&o.base); +- ret = mutex_trylock(&o.base); ++ ww_mutex_base_lock(&o.base); ++ ret = ww_mutex_base_trylock(&o.base); + WARN_ON(ret); +- mutex_unlock(&o.base); ++ ww_mutex_base_unlock(&o.base); + WARN_ON(o.ctx != (void *)~0UL); + + /* nest_lock */ + o.ctx = (void *)~0UL; +- mutex_lock_nest_lock(&o.base, &t); +- mutex_unlock(&o.base); ++ ww_mutex_base_lock_nest_lock(&o.base, &t); ++ ww_mutex_base_unlock(&o.base); + WARN_ON(o.ctx != (void *)~0UL); + } + +@@ -1721,7 +1784,7 @@ static void ww_test_two_contexts(void) + static void ww_test_diff_class(void) + { + WWAI(&t); +-#ifdef CONFIG_DEBUG_MUTEXES ++#ifdef DEBUG_WW_MUTEXES + t.ww_class = NULL; + #endif + WWL(&o, &t); +@@ -1785,7 +1848,7 @@ static void ww_test_edeadlk_normal(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + o2.ctx = &t2; + mutex_release(&o2.base.dep_map, _THIS_IP_); + +@@ -1801,7 +1864,7 @@ static void ww_test_edeadlk_normal(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + WWU(&o); + + WWL(&o2, &t); +@@ -1811,7 +1874,7 @@ static void ww_test_edeadlk_normal_slow(void) + { + int ret; + +- mutex_lock(&o2.base); 
++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1827,7 +1890,7 @@ static void ww_test_edeadlk_normal_slow(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + WWU(&o); + + ww_mutex_lock_slow(&o2, &t); +@@ -1837,7 +1900,7 @@ static void ww_test_edeadlk_no_unlock(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + o2.ctx = &t2; + mutex_release(&o2.base.dep_map, _THIS_IP_); + +@@ -1853,7 +1916,7 @@ static void ww_test_edeadlk_no_unlock(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + + WWL(&o2, &t); + } +@@ -1862,7 +1925,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1878,7 +1941,7 @@ static void ww_test_edeadlk_no_unlock_slow(void) + + o2.ctx = NULL; + mutex_acquire(&o2.base.dep_map, 0, 1, _THIS_IP_); +- mutex_unlock(&o2.base); ++ ww_mutex_base_unlock(&o2.base); + + ww_mutex_lock_slow(&o2, &t); + } +@@ -1887,7 +1950,7 @@ static void ww_test_edeadlk_acquire_more(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1908,7 +1971,7 @@ static void ww_test_edeadlk_acquire_more_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -1929,11 +1992,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +- mutex_lock(&o3.base); ++ ww_mutex_base_lock(&o3.base); + mutex_release(&o3.base.dep_map, _THIS_IP_); + o3.ctx = &t2; + +@@ 
-1955,11 +2018,11 @@ static void ww_test_edeadlk_acquire_more_edeadlk_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +- mutex_lock(&o3.base); ++ ww_mutex_base_lock(&o3.base); + mutex_release(&o3.base.dep_map, _THIS_IP_); + o3.ctx = &t2; + +@@ -1980,7 +2043,7 @@ static void ww_test_edeadlk_acquire_wrong(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -2005,7 +2068,7 @@ static void ww_test_edeadlk_acquire_wrong_slow(void) + { + int ret; + +- mutex_lock(&o2.base); ++ ww_mutex_base_lock(&o2.base); + mutex_release(&o2.base.dep_map, _THIS_IP_); + o2.ctx = &t2; + +@@ -2646,8 +2709,8 @@ static void wait_context_tests(void) + + static void local_lock_2(void) + { +- local_lock_acquire(&local_A); /* IRQ-ON */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IRQ-ON */ ++ local_unlock(&local_A); + + HARDIRQ_ENTER(); + spin_lock(&lock_A); /* IN-IRQ */ +@@ -2656,18 +2719,18 @@ static void local_lock_2(void) + + HARDIRQ_DISABLE(); + spin_lock(&lock_A); +- local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle, false */ ++ local_unlock(&local_A); + spin_unlock(&lock_A); + HARDIRQ_ENABLE(); + } + + static void local_lock_3A(void) + { +- local_lock_acquire(&local_A); /* IRQ-ON */ ++ local_lock(&local_A); /* IRQ-ON */ + spin_lock(&lock_B); /* IRQ-ON */ + spin_unlock(&lock_B); +- local_lock_release(&local_A); ++ local_unlock(&local_A); + + HARDIRQ_ENTER(); + spin_lock(&lock_A); /* IN-IRQ */ +@@ -2676,18 +2739,18 @@ static void local_lock_3A(void) + + HARDIRQ_DISABLE(); + spin_lock(&lock_A); +- local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only 
if we count local_lock(), false */ ++ local_unlock(&local_A); + spin_unlock(&lock_A); + HARDIRQ_ENABLE(); + } + + static void local_lock_3B(void) + { +- local_lock_acquire(&local_A); /* IRQ-ON */ ++ local_lock(&local_A); /* IRQ-ON */ + spin_lock(&lock_B); /* IRQ-ON */ + spin_unlock(&lock_B); +- local_lock_release(&local_A); ++ local_unlock(&local_A); + + HARDIRQ_ENTER(); + spin_lock(&lock_A); /* IN-IRQ */ +@@ -2696,8 +2759,8 @@ static void local_lock_3B(void) + + HARDIRQ_DISABLE(); + spin_lock(&lock_A); +- local_lock_acquire(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ +- local_lock_release(&local_A); ++ local_lock(&local_A); /* IN-IRQ <-> IRQ-ON cycle only if we count local_lock(), false */ ++ local_unlock(&local_A); + spin_unlock(&lock_A); + HARDIRQ_ENABLE(); + +@@ -2812,7 +2875,7 @@ void locking_selftest(void) + printk("------------------------\n"); + printk("| Locking API testsuite:\n"); + printk("----------------------------------------------------------------------------\n"); +- printk(" | spin |wlock |rlock |mutex | wsem | rsem |\n"); ++ printk(" | spin |wlock |rlock |mutex | wsem | rsem |rtmutex\n"); + printk(" --------------------------------------------------------------------------\n"); + + init_shared_classes(); +@@ -2885,12 +2948,11 @@ void locking_selftest(void) + DO_TESTCASE_6x1RR("rlock W1R2/R2R3/W3W1", W1R2_R2R3_W3W1); + + printk(" --------------------------------------------------------------------------\n"); +- + /* + * irq-context testcases: + */ + DO_TESTCASE_2x6("irqs-on + irq-safe-A", irqsafe1); +- DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A); ++ NON_RT(DO_TESTCASE_2x3("sirq-safe-A => hirqs-on", irqsafe2A)); + DO_TESTCASE_2x6("safe-A + irqs-on", irqsafe2B); + DO_TESTCASE_6x6("safe-A + unsafe-B #1", irqsafe3); + DO_TESTCASE_6x6("safe-A + unsafe-B #2", irqsafe4); +diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c +index 199ab201d501..06410209197a 100644 +--- a/lib/nmi_backtrace.c ++++ 
b/lib/nmi_backtrace.c +@@ -99,7 +99,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) + * Allow nested NMI backtraces while serializing + * against other CPUs. + */ +- printk_cpu_lock_irqsave(flags); ++ raw_printk_cpu_lock_irqsave(flags); + if (!READ_ONCE(backtrace_idle) && regs && cpu_in_idle(instruction_pointer(regs))) { + pr_warn("NMI backtrace for cpu %d skipped: idling at %pS\n", + cpu, (void *)instruction_pointer(regs)); +@@ -110,7 +110,7 @@ bool nmi_cpu_backtrace(struct pt_regs *regs) + else + dump_stack(); + } +- printk_cpu_unlock_irqrestore(flags); ++ raw_printk_cpu_unlock_irqrestore(flags); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return true; + } +diff --git a/lib/scatterlist.c b/lib/scatterlist.c +index abb3432ed744..d5e82e4a57ad 100644 +--- a/lib/scatterlist.c ++++ b/lib/scatterlist.c +@@ -828,8 +828,7 @@ static bool sg_miter_get_next_page(struct sg_mapping_iter *miter) + * stops @miter. + * + * Context: +- * Don't care if @miter is stopped, or not proceeded yet. +- * Otherwise, preemption disabled if the SG_MITER_ATOMIC is set. ++ * Don't care. + * + * Returns: + * true if @miter contains the valid mapping. false if end of sg +@@ -865,8 +864,7 @@ EXPORT_SYMBOL(sg_miter_skip); + * @miter->addr and @miter->length point to the current mapping. + * + * Context: +- * Preemption disabled if SG_MITER_ATOMIC. Preemption must stay disabled +- * till @miter is stopped. May sleep if !SG_MITER_ATOMIC. ++ * May sleep if !SG_MITER_ATOMIC. + * + * Returns: + * true if @miter contains the next mapping. false if end of sg +@@ -906,8 +904,7 @@ EXPORT_SYMBOL(sg_miter_next); + * need to be released during iteration. + * + * Context: +- * Preemption disabled if the SG_MITER_ATOMIC is set. Don't care +- * otherwise. ++ * Don't care otherwise. 
+ */ + void sg_miter_stop(struct sg_mapping_iter *miter) + { +@@ -922,7 +919,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) + flush_dcache_page(miter->page); + + if (miter->__flags & SG_MITER_ATOMIC) { +- WARN_ON_ONCE(preemptible()); ++ WARN_ON_ONCE(!pagefault_disabled()); + kunmap_atomic(miter->addr); + } else + kunmap(miter->page); +diff --git a/localversion-rt b/localversion-rt +new file mode 100644 +index 000000000000..e2eb19782d4c +--- /dev/null ++++ b/localversion-rt +@@ -0,0 +1 @@ ++-rt65 +diff --git a/mm/Kconfig b/mm/Kconfig +index c048dea7e342..88778414465b 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -371,7 +371,7 @@ config NOMMU_INITIAL_TRIM_EXCESS + + config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" +- depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE ++ depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT + select COMPACTION + select XARRAY_MULTI + help +diff --git a/mm/memcontrol.c b/mm/memcontrol.c +index b68b2fe639fd..71b7b7371595 100644 +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -654,6 +654,35 @@ static u64 flush_next_time; + + #define FLUSH_TIME (2UL*HZ) + ++/* ++ * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can ++ * not rely on this as part of an acquired spinlock_t lock. These functions are ++ * never used in hardirq context on PREEMPT_RT and therefore disabling preemption ++ * is sufficient. 
++ */ ++static void memcg_stats_lock(void) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ preempt_disable(); ++#else ++ VM_BUG_ON(!irqs_disabled()); ++#endif ++} ++ ++static void __memcg_stats_lock(void) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ preempt_disable(); ++#endif ++} ++ ++static void memcg_stats_unlock(void) ++{ ++#ifdef CONFIG_PREEMPT_RT ++ preempt_enable(); ++#endif ++} ++ + static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) + { + unsigned int x; +@@ -737,6 +766,27 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + memcg = pn->memcg; + ++ /* ++ * The caller from rmap rely on disabled preemption because they never ++ * update their counter from in-interrupt context. For these two ++ * counters we check that the update is never performed from an ++ * interrupt context while other caller need to have disabled interrupt. ++ */ ++ __memcg_stats_lock(); ++ if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ switch (idx) { ++ case NR_ANON_MAPPED: ++ case NR_FILE_MAPPED: ++ case NR_ANON_THPS: ++ case NR_SHMEM_PMDMAPPED: ++ case NR_FILE_PMDMAPPED: ++ WARN_ON_ONCE(!in_task()); ++ break; ++ default: ++ WARN_ON_ONCE(!irqs_disabled()); ++ } ++ } ++ + /* Update memcg */ + __this_cpu_add(memcg->vmstats_percpu->state[idx], val); + +@@ -744,6 +794,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, + __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); + + memcg_rstat_updated(memcg, val); ++ memcg_stats_unlock(); + } + + /** +@@ -844,8 +895,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, + if (mem_cgroup_disabled()) + return; + ++ memcg_stats_lock(); + __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + memcg_rstat_updated(memcg, count); ++ memcg_stats_unlock(); + } + + static unsigned long memcg_events(struct mem_cgroup *memcg, int event) +@@ -909,6 +962,9 @@ static bool 
mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, + */ + static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) + { ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return; ++ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(mem_cgroup_event_ratelimit(memcg, + MEM_CGROUP_TARGET_THRESH))) { +@@ -2102,39 +2158,37 @@ void unlock_page_memcg(struct page *page) + } + EXPORT_SYMBOL(unlock_page_memcg); + +-struct obj_stock { ++struct memcg_stock_pcp { ++ local_lock_t stock_lock; ++ struct mem_cgroup *cached; /* this never be root cgroup */ ++ unsigned int nr_pages; ++ + #ifdef CONFIG_MEMCG_KMEM + struct obj_cgroup *cached_objcg; + struct pglist_data *cached_pgdat; + unsigned int nr_bytes; + int nr_slab_reclaimable_b; + int nr_slab_unreclaimable_b; +-#else +- int dummy[0]; + #endif +-}; +- +-struct memcg_stock_pcp { +- struct mem_cgroup *cached; /* this never be root cgroup */ +- unsigned int nr_pages; +- struct obj_stock task_obj; +- struct obj_stock irq_obj; + + struct work_struct work; + unsigned long flags; + #define FLUSHING_CACHED_CHARGE 0 + }; +-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); ++static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = { ++ .stock_lock = INIT_LOCAL_LOCK(stock_lock), ++}; + static DEFINE_MUTEX(percpu_charge_mutex); + + #ifdef CONFIG_MEMCG_KMEM +-static void drain_obj_stock(struct obj_stock *stock); ++static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock); + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg); + + #else +-static inline void drain_obj_stock(struct obj_stock *stock) ++static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) + { ++ return NULL; + } + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + struct mem_cgroup *root_memcg) +@@ -2144,41 +2198,6 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + #endif + + /* +- * Most 
kmem_cache_alloc() calls are from user context. The irq disable/enable +- * sequence used in this case to access content from object stock is slow. +- * To optimize for user context access, there are now two object stocks for +- * task context and interrupt context access respectively. +- * +- * The task context object stock can be accessed by disabling preemption only +- * which is cheap in non-preempt kernel. The interrupt context object stock +- * can only be accessed after disabling interrupt. User context code can +- * access interrupt object stock, but not vice versa. +- */ +-static inline struct obj_stock *get_obj_stock(unsigned long *pflags) +-{ +- struct memcg_stock_pcp *stock; +- +- if (likely(in_task())) { +- *pflags = 0UL; +- preempt_disable(); +- stock = this_cpu_ptr(&memcg_stock); +- return &stock->task_obj; +- } +- +- local_irq_save(*pflags); +- stock = this_cpu_ptr(&memcg_stock); +- return &stock->irq_obj; +-} +- +-static inline void put_obj_stock(unsigned long flags) +-{ +- if (likely(in_task())) +- preempt_enable(); +- else +- local_irq_restore(flags); +-} +- +-/** + * consume_stock: Try to consume stocked charge on this cpu. + * @memcg: memcg to consume from. + * @nr_pages: how many pages to charge. 
+@@ -2198,7 +2217,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + if (nr_pages > MEMCG_CHARGE_BATCH) + return ret; + +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); + if (memcg == stock->cached && stock->nr_pages >= nr_pages) { +@@ -2206,7 +2225,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + ret = true; + } + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + return ret; + } +@@ -2235,6 +2254,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) + static void drain_local_stock(struct work_struct *dummy) + { + struct memcg_stock_pcp *stock; ++ struct obj_cgroup *old = NULL; + unsigned long flags; + + /* +@@ -2242,28 +2262,25 @@ static void drain_local_stock(struct work_struct *dummy) + * drain_stock races is that we always operate on local CPU stock + * here with IRQ disabled + */ +- local_irq_save(flags); ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); + + stock = this_cpu_ptr(&memcg_stock); +- drain_obj_stock(&stock->irq_obj); +- if (in_task()) +- drain_obj_stock(&stock->task_obj); ++ old = drain_obj_stock(stock); + drain_stock(stock); + clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); + +- local_irq_restore(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ++ if (old) ++ obj_cgroup_put(old); + } + + /* + * Cache charges(val) to local per_cpu area. + * This will be consumed by consume_stock() function, later. 
+ */ +-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ++static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + { + struct memcg_stock_pcp *stock; +- unsigned long flags; +- +- local_irq_save(flags); + + stock = this_cpu_ptr(&memcg_stock); + if (stock->cached != memcg) { /* reset if necessary */ +@@ -2275,8 +2292,15 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) + + if (stock->nr_pages > MEMCG_CHARGE_BATCH) + drain_stock(stock); ++} + +- local_irq_restore(flags); ++static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ++{ ++ unsigned long flags; ++ ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ __refill_stock(memcg, nr_pages); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + } + + /* +@@ -2296,7 +2320,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + * as well as workers from this path always operate on the local + * per-cpu data. CPU up doesn't touch memcg_stock at all. 
+ */ +- curcpu = get_cpu(); ++ migrate_disable(); ++ curcpu = smp_processor_id(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + struct mem_cgroup *memcg; +@@ -2319,7 +2344,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) + schedule_work_on(cpu, &stock->work); + } + } +- put_cpu(); ++ migrate_enable(); + mutex_unlock(&percpu_charge_mutex); + } + +@@ -3084,17 +3109,21 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) + void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + enum node_stat_item idx, int nr) + { ++ struct memcg_stock_pcp *stock; ++ struct obj_cgroup *old = NULL; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); + int *bytes; + ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ stock = this_cpu_ptr(&memcg_stock); ++ + /* + * Save vmstat data in stock and skip vmstat array update unless + * accumulating over a page of vmstat data or when pgdat or idx + * changes. + */ + if (stock->cached_objcg != objcg) { +- drain_obj_stock(stock); ++ old = drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) + ? 
atomic_xchg(&objcg->nr_charged_bytes, 0) : 0; +@@ -3138,38 +3167,53 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, + if (nr) + mod_objcg_mlstate(objcg, pgdat, idx, nr); + +- put_obj_stock(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ++ if (old) ++ obj_cgroup_put(old); + } + + static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) + { ++ struct memcg_stock_pcp *stock; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); + bool ret = false; + ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ ++ stock = this_cpu_ptr(&memcg_stock); + if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { + stock->nr_bytes -= nr_bytes; + ret = true; + } + +- put_obj_stock(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); + + return ret; + } + +-static void drain_obj_stock(struct obj_stock *stock) ++static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock) + { + struct obj_cgroup *old = stock->cached_objcg; + + if (!old) +- return; ++ return NULL; + + if (stock->nr_bytes) { + unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT; + unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1); + +- if (nr_pages) +- obj_cgroup_uncharge_pages(old, nr_pages); ++ if (nr_pages) { ++ struct mem_cgroup *memcg; ++ ++ memcg = get_mem_cgroup_from_objcg(old); ++ ++ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) ++ page_counter_uncharge(&memcg->kmem, nr_pages); ++ ++ __refill_stock(memcg, nr_pages); ++ ++ css_put(&memcg->css); ++ } + + /* + * The leftover is flushed to the centralized per-memcg value. +@@ -3204,8 +3248,12 @@ static void drain_obj_stock(struct obj_stock *stock) + stock->cached_pgdat = NULL; + } + +- obj_cgroup_put(old); + stock->cached_objcg = NULL; ++ /* ++ * The `old' objects needs to be released by the caller via ++ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock. 
++ */ ++ return old; + } + + static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, +@@ -3213,13 +3261,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + { + struct mem_cgroup *memcg; + +- if (in_task() && stock->task_obj.cached_objcg) { +- memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); +- if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) +- return true; +- } +- if (stock->irq_obj.cached_objcg) { +- memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg); ++ if (stock->cached_objcg) { ++ memcg = obj_cgroup_memcg(stock->cached_objcg); + if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) + return true; + } +@@ -3230,12 +3273,16 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, + static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + bool allow_uncharge) + { ++ struct memcg_stock_pcp *stock; ++ struct obj_cgroup *old = NULL; + unsigned long flags; +- struct obj_stock *stock = get_obj_stock(&flags); + unsigned int nr_pages = 0; + ++ local_lock_irqsave(&memcg_stock.stock_lock, flags); ++ ++ stock = this_cpu_ptr(&memcg_stock); + if (stock->cached_objcg != objcg) { /* reset if necessary */ +- drain_obj_stock(stock); ++ old = drain_obj_stock(stock); + obj_cgroup_get(objcg); + stock->cached_objcg = objcg; + stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes) +@@ -3249,7 +3296,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, + stock->nr_bytes &= (PAGE_SIZE - 1); + } + +- put_obj_stock(flags); ++ local_unlock_irqrestore(&memcg_stock.stock_lock, flags); ++ if (old) ++ obj_cgroup_put(old); + + if (nr_pages) + obj_cgroup_uncharge_pages(objcg, nr_pages); +@@ -3816,8 +3865,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, + } + break; + case RES_SOFT_LIMIT: +- memcg->soft_limit = nr_pages; +- ret = 0; ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { ++ ret = -EOPNOTSUPP; ++ } else { ++ memcg->soft_limit = nr_pages; ++ ret = 0; ++ } 
+ break; + } + return ret ?: nbytes; +@@ -4798,6 +4851,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of, + char *endp; + int ret; + ++ if (IS_ENABLED(CONFIG_PREEMPT_RT)) ++ return -EOPNOTSUPP; ++ + buf = strstrip(buf); + + efd = simple_strtoul(buf, &endp, 10); +@@ -6889,7 +6945,6 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) + unsigned long nr_pages; + struct mem_cgroup *memcg; + struct obj_cgroup *objcg; +- bool use_objcg = PageMemcgKmem(page); + + VM_BUG_ON_PAGE(PageLRU(page), page); + +@@ -6898,7 +6953,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) + * page memcg or objcg at this point, we have fully + * exclusive access to the page. + */ +- if (use_objcg) { ++ if (PageMemcgKmem(page)) { + objcg = __page_objcg(page); + /* + * This get matches the put at the end of the function and +@@ -6926,7 +6981,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug) + + nr_pages = compound_nr(page); + +- if (use_objcg) { ++ if (PageMemcgKmem(page)) { + ug->nr_memory += nr_pages; + ug->nr_kmem += nr_pages; + +@@ -7256,8 +7311,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) + * important here to have the interrupts disabled because it is the + * only synchronisation we have for updating the per-CPU variables. 
+ */ +- VM_BUG_ON(!irqs_disabled()); ++ memcg_stats_lock(); + mem_cgroup_charge_statistics(memcg, page, -nr_entries); ++ memcg_stats_unlock(); + memcg_check_events(memcg, page); + + css_put(&memcg->css); +diff --git a/mm/memory.c b/mm/memory.c +index 8d71a82462dd..e2a9f89bbcf2 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -5305,7 +5305,7 @@ void __might_fault(const char *file, int line) + return; + if (pagefault_disabled()) + return; +- __might_sleep(file, line, 0); ++ __might_sleep(file, line); + #if defined(CONFIG_DEBUG_ATOMIC_SLEEP) + if (current->mm) + might_lock_read(¤t->mm->mmap_lock); +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f320ee2bd34a..33355028122a 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -3149,9 +3149,9 @@ static void drain_local_pages_wq(struct work_struct *work) + * cpu which is alright but we also have to make sure to not move to + * a different one. + */ +- preempt_disable(); ++ migrate_disable(); + drain_local_pages(drain->zone); +- preempt_enable(); ++ migrate_enable(); + } + + /* +diff --git a/mm/vmalloc.c b/mm/vmalloc.c +index 3e482209a1c4..1a59b7b4ff67 100644 +--- a/mm/vmalloc.c ++++ b/mm/vmalloc.c +@@ -1918,11 +1918,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask) + return ERR_PTR(err); + } + +- vbq = &get_cpu_var(vmap_block_queue); ++ get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + spin_lock(&vbq->lock); + list_add_tail_rcu(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); +- put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + + return vaddr; + } +@@ -2001,7 +2002,8 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + order = get_order(size); + + rcu_read_lock(); +- vbq = &get_cpu_var(vmap_block_queue); ++ get_cpu_light(); ++ vbq = this_cpu_ptr(&vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + unsigned long pages_off; + +@@ -2024,7 +2026,7 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) + break; + } + +- 
put_cpu_var(vmap_block_queue); ++ put_cpu_light(); + rcu_read_unlock(); + + /* Allocate new block if nothing was found */ +diff --git a/mm/workingset.c b/mm/workingset.c +index 880d882f3325..2a9ed5aeb6fa 100644 +--- a/mm/workingset.c ++++ b/mm/workingset.c +@@ -433,6 +433,8 @@ static struct list_lru shadow_nodes; + + void workingset_update_node(struct xa_node *node) + { ++ struct address_space *mapping; ++ + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. +@@ -441,7 +443,8 @@ void workingset_update_node(struct xa_node *node) + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by the i_pages lock. + */ +- VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */ ++ mapping = container_of(node->array, struct address_space, i_pages); ++ lockdep_assert_held(&mapping->i_pages.xa_lock); + + if (node->count && node->count == node->nr_values) { + if (list_empty(&node->private_list)) { +diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c +index 439deb8decbc..a66431853394 100644 +--- a/mm/zsmalloc.c ++++ b/mm/zsmalloc.c +@@ -57,6 +57,7 @@ + #include + #include + #include ++#include + + #define ZSPAGE_MAGIC 0x58 + +@@ -77,6 +78,20 @@ + + #define ZS_HANDLE_SIZE (sizeof(unsigned long)) + ++#ifdef CONFIG_PREEMPT_RT ++ ++struct zsmalloc_handle { ++ unsigned long addr; ++ spinlock_t lock; ++}; ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle)) ++ ++#else ++ ++#define ZS_HANDLE_ALLOC_SIZE (sizeof(unsigned long)) ++#endif ++ + /* + * Object location (, ) is encoded as + * a single (unsigned long) handle value. 
+@@ -293,6 +308,7 @@ struct zspage { + }; + + struct mapping_area { ++ local_lock_t lock; + char *vm_buf; /* copy buffer for objects that span pages */ + char *vm_addr; /* address of kmap_atomic()'ed pages */ + enum zs_mapmode vm_mm; /* mapping mode */ +@@ -322,7 +338,7 @@ static void SetZsPageMovable(struct zs_pool *pool, struct zspage *zspage) {} + + static int create_cache(struct zs_pool *pool) + { +- pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_SIZE, ++ pool->handle_cachep = kmem_cache_create("zs_handle", ZS_HANDLE_ALLOC_SIZE, + 0, 0, NULL); + if (!pool->handle_cachep) + return 1; +@@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool *pool) + + static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp) + { +- return (unsigned long)kmem_cache_alloc(pool->handle_cachep, +- gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++ void *p; ++ ++ p = kmem_cache_alloc(pool->handle_cachep, ++ gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE)); ++#ifdef CONFIG_PREEMPT_RT ++ if (p) { ++ struct zsmalloc_handle *zh = p; ++ ++ spin_lock_init(&zh->lock); ++ } ++#endif ++ return (unsigned long)p; + } + ++#ifdef CONFIG_PREEMPT_RT ++static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle) ++{ ++ return (void *)(handle & ~((1 << OBJ_TAG_BITS) - 1)); ++} ++#endif ++ + static void cache_free_handle(struct zs_pool *pool, unsigned long handle) + { + kmem_cache_free(pool->handle_cachep, (void *)handle); +@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_pool *pool, struct zspage *zspage) + + static void record_obj(unsigned long handle, unsigned long obj) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ WRITE_ONCE(zh->addr, obj); ++#else + /* + * lsb of @obj represents handle lock while other bits + * represent object value the handle is pointing so + * updating shouldn't do store tearing. 
+ */ + WRITE_ONCE(*(unsigned long *)handle, obj); ++#endif + } + + /* zpool driver */ +@@ -455,7 +494,9 @@ MODULE_ALIAS("zpool-zsmalloc"); + #endif /* CONFIG_ZPOOL */ + + /* per-cpu VM mapping areas for zspage accesses that cross page boundaries */ +-static DEFINE_PER_CPU(struct mapping_area, zs_map_area); ++static DEFINE_PER_CPU(struct mapping_area, zs_map_area) = { ++ .lock = INIT_LOCAL_LOCK(lock), ++}; + + static bool is_zspage_isolated(struct zspage *zspage) + { +@@ -862,7 +903,13 @@ static unsigned long location_to_obj(struct page *page, unsigned int obj_idx) + + static unsigned long handle_to_obj(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return zh->addr; ++#else + return *(unsigned long *)handle; ++#endif + } + + static unsigned long obj_to_head(struct page *page, void *obj) +@@ -876,22 +923,46 @@ static unsigned long obj_to_head(struct page *page, void *obj) + + static inline int testpin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_is_locked(&zh->lock); ++#else + return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static inline int trypin_tag(unsigned long handle) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_trylock(&zh->lock); ++#else + return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void pin_tag(unsigned long handle) __acquires(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_lock(&zh->lock); ++#else + bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void unpin_tag(unsigned long handle) __releases(bitlock) + { ++#ifdef CONFIG_PREEMPT_RT ++ struct zsmalloc_handle *zh = zs_get_pure_handle(handle); ++ ++ return spin_unlock(&zh->lock); ++#else + 
bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle); ++#endif + } + + static void reset_page(struct page *page) +@@ -1274,7 +1345,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, + class = pool->size_class[class_idx]; + off = (class->size * obj_idx) & ~PAGE_MASK; + +- area = &get_cpu_var(zs_map_area); ++ local_lock(&zs_map_area.lock); ++ area = this_cpu_ptr(&zs_map_area); + area->vm_mm = mm; + if (off + class->size <= PAGE_SIZE) { + /* this object is contained entirely within a page */ +@@ -1328,7 +1400,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) + + __zs_unmap_object(area, pages, off, class->size); + } +- put_cpu_var(zs_map_area); ++ local_unlock(&zs_map_area.lock); + + migrate_read_unlock(zspage); + unpin_tag(handle); +diff --git a/net/Kconfig b/net/Kconfig +index 76a3385943e5..bd7386eede23 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -292,7 +292,7 @@ config CGROUP_NET_CLASSID + + config NET_RX_BUSY_POLL + bool +- default y ++ default y if !PREEMPT_RT + + config BQL + bool +diff --git a/net/core/dev.c b/net/core/dev.c +index 4d698ccf4172..4bed27338ed9 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -225,14 +225,14 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex) + static inline void rps_lock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_lock(&sd->input_pkt_queue.lock); ++ raw_spin_lock(&sd->input_pkt_queue.raw_lock); + #endif + } + + static inline void rps_unlock(struct softnet_data *sd) + { + #ifdef CONFIG_RPS +- spin_unlock(&sd->input_pkt_queue.lock); ++ raw_spin_unlock(&sd->input_pkt_queue.raw_lock); + #endif + } + +@@ -3046,6 +3046,7 @@ static void __netif_reschedule(struct Qdisc *q) + sd->output_queue_tailp = &q->next_sched; + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + + void __netif_schedule(struct Qdisc *q) +@@ -3108,6 +3109,7 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum 
skb_free_reason reason) + __this_cpu_write(softnet_data.completion_queue, skb); + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__dev_kfree_skb_irq); + +@@ -3841,7 +3843,11 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, + * This permits qdisc->running owner to get the lock more + * often and dequeue packets faster. + */ ++#ifdef CONFIG_PREEMPT_RT ++ contended = true; ++#else + contended = qdisc_is_running(q); ++#endif + if (unlikely(contended)) + spin_lock(&q->busylock); + +@@ -4669,6 +4675,7 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu, + rps_unlock(sd); + + local_irq_restore(flags); ++ preempt_check_resched_rt(); + + atomic_long_inc(&skb->dev->rx_dropped); + kfree_skb(skb); +@@ -4909,7 +4916,7 @@ static int netif_rx_internal(struct sk_buff *skb) + struct rps_dev_flow voidflow, *rflow = &voidflow; + int cpu; + +- preempt_disable(); ++ migrate_disable(); + rcu_read_lock(); + + cpu = get_rps_cpu(skb->dev, skb, &rflow); +@@ -4919,14 +4926,14 @@ static int netif_rx_internal(struct sk_buff *skb) + ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail); + + rcu_read_unlock(); +- preempt_enable(); ++ migrate_enable(); + } else + #endif + { + unsigned int qtail; + +- ret = enqueue_to_backlog(skb, get_cpu(), &qtail); +- put_cpu(); ++ ret = enqueue_to_backlog(skb, get_cpu_light(), &qtail); ++ put_cpu_light(); + } + return ret; + } +@@ -4965,11 +4972,9 @@ int netif_rx_ni(struct sk_buff *skb) + + trace_netif_rx_ni_entry(skb); + +- preempt_disable(); ++ local_bh_disable(); + err = netif_rx_internal(skb); +- if (local_softirq_pending()) +- do_softirq(); +- preempt_enable(); ++ local_bh_enable(); + trace_netif_rx_ni_exit(err); + + return err; +@@ -6413,12 +6418,14 @@ static void net_rps_action_and_irq_enable(struct softnet_data *sd) + sd->rps_ipi_list = NULL; + + local_irq_enable(); ++ preempt_check_resched_rt(); + + /* Send pending IPI's to kick RPS processing on 
remote cpus. */ + net_rps_send_ipi(remsd); + } else + #endif + local_irq_enable(); ++ preempt_check_resched_rt(); + } + + static bool sd_has_rps_ipi_waiting(struct softnet_data *sd) +@@ -6496,6 +6503,7 @@ void __napi_schedule(struct napi_struct *n) + local_irq_save(flags); + ____napi_schedule(this_cpu_ptr(&softnet_data), n); + local_irq_restore(flags); ++ preempt_check_resched_rt(); + } + EXPORT_SYMBOL(__napi_schedule); + +@@ -11316,6 +11324,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); ++ preempt_check_resched_rt(); + + #ifdef CONFIG_RPS + remsd = oldsd->rps_ipi_list; +@@ -11329,7 +11338,7 @@ static int dev_cpu_dead(unsigned int oldcpu) + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +- while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) { ++ while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) { + netif_rx_ni(skb); + input_queue_head_incr(oldsd); + } +@@ -11644,7 +11653,7 @@ static int __init net_dev_init(void) + + INIT_WORK(flush, flush_backlog); + +- skb_queue_head_init(&sd->input_pkt_queue); ++ skb_queue_head_init_raw(&sd->input_pkt_queue); + skb_queue_head_init(&sd->process_queue); + #ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c +index 8e582e29a41e..4fcbdd71c59f 100644 +--- a/net/core/gen_estimator.c ++++ b/net/core/gen_estimator.c +@@ -40,10 +40,10 @@ + */ + + struct net_rate_estimator { +- struct gnet_stats_basic_packed *bstats; ++ struct gnet_stats_basic_sync *bstats; + spinlock_t *stats_lock; +- seqcount_t *running; +- struct gnet_stats_basic_cpu __percpu *cpu_bstats; ++ bool running; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats; + u8 ewma_log; + u8 intvl_log; /* period : (250ms << intvl_log) */ + +@@ -60,13 +60,13 @@ struct net_rate_estimator { + }; + + static void est_fetch_counters(struct net_rate_estimator *e, +- struct gnet_stats_basic_packed *b) ++ struct gnet_stats_basic_sync 
*b) + { +- memset(b, 0, sizeof(*b)); ++ gnet_stats_basic_sync_init(b); + if (e->stats_lock) + spin_lock(e->stats_lock); + +- __gnet_stats_copy_basic(e->running, b, e->cpu_bstats, e->bstats); ++ gnet_stats_add_basic(b, e->cpu_bstats, e->bstats, e->running); + + if (e->stats_lock) + spin_unlock(e->stats_lock); +@@ -76,14 +76,18 @@ static void est_fetch_counters(struct net_rate_estimator *e, + static void est_timer(struct timer_list *t) + { + struct net_rate_estimator *est = from_timer(est, t, timer); +- struct gnet_stats_basic_packed b; ++ struct gnet_stats_basic_sync b; ++ u64 b_bytes, b_packets; + u64 rate, brate; + + est_fetch_counters(est, &b); +- brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); ++ b_bytes = u64_stats_read(&b.bytes); ++ b_packets = u64_stats_read(&b.packets); ++ ++ brate = (b_bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); + +- rate = (b.packets - est->last_packets) << (10 - est->intvl_log); ++ rate = (b_packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); + + write_seqcount_begin(&est->seq); +@@ -91,8 +95,8 @@ static void est_timer(struct timer_list *t) + est->avpps += rate; + write_seqcount_end(&est->seq); + +- est->last_bytes = b.bytes; +- est->last_packets = b.packets; ++ est->last_bytes = b_bytes; ++ est->last_packets = b_packets; + + est->next_jiffies += ((HZ/4) << est->intvl_log); + +@@ -109,7 +113,9 @@ static void est_timer(struct timer_list *t) + * @cpu_bstats: bstats per cpu + * @rate_est: rate estimator statistics + * @lock: lock for statistics and control path +- * @running: qdisc running seqcount ++ * @running: true if @bstats represents a running qdisc, thus @bstats' ++ * internal values might change during basic reads. 
Only used ++ * if @bstats_cpu is NULL + * @opt: rate estimator configuration TLV + * + * Creates a new rate estimator with &bstats as source and &rate_est +@@ -121,16 +127,16 @@ static void est_timer(struct timer_list *t) + * Returns 0 on success or a negative error code. + * + */ +-int gen_new_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_new_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, ++ bool running, + struct nlattr *opt) + { + struct gnet_estimator *parm = nla_data(opt); + struct net_rate_estimator *old, *est; +- struct gnet_stats_basic_packed b; ++ struct gnet_stats_basic_sync b; + int intvl_log; + + if (nla_len(opt) < sizeof(*parm)) +@@ -164,8 +170,8 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, + est_fetch_counters(est, &b); + if (lock) + local_bh_enable(); +- est->last_bytes = b.bytes; +- est->last_packets = b.packets; ++ est->last_bytes = u64_stats_read(&b.bytes); ++ est->last_packets = u64_stats_read(&b.packets); + + if (lock) + spin_lock_bh(lock); +@@ -214,7 +220,9 @@ EXPORT_SYMBOL(gen_kill_estimator); + * @cpu_bstats: bstats per cpu + * @rate_est: rate estimator statistics + * @lock: lock for statistics and control path +- * @running: qdisc running seqcount (might be NULL) ++ * @running: true if @bstats represents a running qdisc, thus @bstats' ++ * internal values might change during basic reads. Only used ++ * if @cpu_bstats is NULL + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling +@@ -222,11 +230,11 @@ EXPORT_SYMBOL(gen_kill_estimator); + * + * Returns 0 on success or a negative error code. 
+ */ +-int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu_bstats, ++int gen_replace_estimator(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu_bstats, + struct net_rate_estimator __rcu **rate_est, + spinlock_t *lock, +- seqcount_t *running, struct nlattr *opt) ++ bool running, struct nlattr *opt) + { + return gen_new_estimator(bstats, cpu_bstats, rate_est, + lock, running, opt); +diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c +index e491b083b348..a10335b4ba2d 100644 +--- a/net/core/gen_stats.c ++++ b/net/core/gen_stats.c +@@ -18,7 +18,7 @@ + #include + #include + #include +- ++#include + + static inline int + gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size, int padattr) +@@ -114,63 +114,112 @@ gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + } + EXPORT_SYMBOL(gnet_stats_start_copy); + +-static void +-__gnet_stats_copy_basic_cpu(struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu) ++/* Must not be inlined, due to u64_stats seqcount_t lockdep key */ ++void gnet_stats_basic_sync_init(struct gnet_stats_basic_sync *b) + { ++ u64_stats_set(&b->bytes, 0); ++ u64_stats_set(&b->packets, 0); ++ u64_stats_init(&b->syncp); ++} ++EXPORT_SYMBOL(gnet_stats_basic_sync_init); ++ ++static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu) ++{ ++ u64 t_bytes = 0, t_packets = 0; + int i; + + for_each_possible_cpu(i) { +- struct gnet_stats_basic_cpu *bcpu = per_cpu_ptr(cpu, i); ++ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); + unsigned int start; + u64 bytes, packets; + + do { + start = u64_stats_fetch_begin_irq(&bcpu->syncp); +- bytes = bcpu->bstats.bytes; +- packets = bcpu->bstats.packets; ++ bytes = u64_stats_read(&bcpu->bytes); ++ packets = u64_stats_read(&bcpu->packets); + } while (u64_stats_fetch_retry_irq(&bcpu->syncp, 
start)); + +- bstats->bytes += bytes; +- bstats->packets += packets; ++ t_bytes += bytes; ++ t_packets += packets; ++ } ++ _bstats_update(bstats, t_bytes, t_packets); ++} ++ ++void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running) ++{ ++ unsigned int start; ++ u64 bytes = 0; ++ u64 packets = 0; ++ ++ WARN_ON_ONCE((cpu || running) && in_hardirq()); ++ ++ if (cpu) { ++ gnet_stats_add_basic_cpu(bstats, cpu); ++ return; + } ++ do { ++ if (running) ++ start = u64_stats_fetch_begin_irq(&b->syncp); ++ bytes = u64_stats_read(&b->bytes); ++ packets = u64_stats_read(&b->packets); ++ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); ++ ++ _bstats_update(bstats, bytes, packets); + } ++EXPORT_SYMBOL(gnet_stats_add_basic); + +-void +-__gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_stats_basic_packed *bstats, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b) ++static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, bool running) + { +- unsigned int seq; ++ unsigned int start; + + if (cpu) { +- __gnet_stats_copy_basic_cpu(bstats, cpu); ++ u64 t_bytes = 0, t_packets = 0; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct gnet_stats_basic_sync *bcpu = per_cpu_ptr(cpu, i); ++ unsigned int start; ++ u64 bytes, packets; ++ ++ do { ++ start = u64_stats_fetch_begin_irq(&bcpu->syncp); ++ bytes = u64_stats_read(&bcpu->bytes); ++ packets = u64_stats_read(&bcpu->packets); ++ } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); ++ ++ t_bytes += bytes; ++ t_packets += packets; ++ } ++ *ret_bytes = t_bytes; ++ *ret_packets = t_packets; + return; + } + do { + if (running) +- seq = read_seqcount_begin(running); +- bstats->bytes = b->bytes; +- bstats->packets = b->packets; +- } while (running && read_seqcount_retry(running, 
seq)); ++ start = u64_stats_fetch_begin_irq(&b->syncp); ++ *ret_bytes = u64_stats_read(&b->bytes); ++ *ret_packets = u64_stats_read(&b->packets); ++ } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + } +-EXPORT_SYMBOL(__gnet_stats_copy_basic); + + static int +-___gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b, +- int type) ++___gnet_stats_copy_basic(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, ++ int type, bool running) + { +- struct gnet_stats_basic_packed bstats = {0}; ++ u64 bstats_bytes, bstats_packets; + +- __gnet_stats_copy_basic(running, &bstats, cpu, b); ++ gnet_stats_read_basic(&bstats_bytes, &bstats_packets, cpu, b, running); + + if (d->compat_tc_stats && type == TCA_STATS_BASIC) { +- d->tc_stats.bytes = bstats.bytes; +- d->tc_stats.packets = bstats.packets; ++ d->tc_stats.bytes = bstats_bytes; ++ d->tc_stats.packets = bstats_packets; + } + + if (d->tail) { +@@ -178,24 +227,28 @@ ___gnet_stats_copy_basic(const seqcount_t *running, + int res; + + memset(&sb, 0, sizeof(sb)); +- sb.bytes = bstats.bytes; +- sb.packets = bstats.packets; ++ sb.bytes = bstats_bytes; ++ sb.packets = bstats_packets; + res = gnet_stats_copy(d, type, &sb, sizeof(sb), TCA_STATS_PAD); +- if (res < 0 || sb.packets == bstats.packets) ++ if (res < 0 || sb.packets == bstats_packets) + return res; + /* emit 64bit stats only if needed */ +- return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats.packets, +- sizeof(bstats.packets), TCA_STATS_PAD); ++ return gnet_stats_copy(d, TCA_STATS_PKT64, &bstats_packets, ++ sizeof(bstats_packets), TCA_STATS_PAD); + } + return 0; + } + + /** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV +- * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics ++ * @running: true if @b represents a running qdisc, 
thus @b's ++ * internal values might change during basic reads. ++ * Only used if @cpu is NULL ++ * ++ * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). +@@ -204,22 +257,25 @@ ___gnet_stats_copy_basic(const seqcount_t *running, + * if the room in the socket buffer was not sufficient. + */ + int +-gnet_stats_copy_basic(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b) ++gnet_stats_copy_basic(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, ++ bool running) + { +- return ___gnet_stats_copy_basic(running, d, cpu, b, +- TCA_STATS_BASIC); ++ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC, running); + } + EXPORT_SYMBOL(gnet_stats_copy_basic); + + /** + * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV +- * @running: seqcount_t pointer + * @d: dumping handle + * @cpu: copy statistic per cpu + * @b: basic statistics ++ * @running: true if @b represents a running qdisc, thus @b's ++ * internal values might change during basic reads. ++ * Only used if @cpu is NULL ++ * ++ * Context: task; must not be run from IRQ or BH contexts + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). +@@ -228,13 +284,12 @@ EXPORT_SYMBOL(gnet_stats_copy_basic); + * if the room in the socket buffer was not sufficient. 
+ */ + int +-gnet_stats_copy_basic_hw(const seqcount_t *running, +- struct gnet_dump *d, +- struct gnet_stats_basic_cpu __percpu *cpu, +- struct gnet_stats_basic_packed *b) ++gnet_stats_copy_basic_hw(struct gnet_dump *d, ++ struct gnet_stats_basic_sync __percpu *cpu, ++ struct gnet_stats_basic_sync *b, ++ bool running) + { +- return ___gnet_stats_copy_basic(running, d, cpu, b, +- TCA_STATS_BASIC_HW); ++ return ___gnet_stats_copy_basic(d, cpu, b, TCA_STATS_BASIC_HW, running); + } + EXPORT_SYMBOL(gnet_stats_copy_basic_hw); + +@@ -282,16 +337,15 @@ gnet_stats_copy_rate_est(struct gnet_dump *d, + } + EXPORT_SYMBOL(gnet_stats_copy_rate_est); + +-static void +-__gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, +- const struct gnet_stats_queue __percpu *q) ++static void gnet_stats_add_queue_cpu(struct gnet_stats_queue *qstats, ++ const struct gnet_stats_queue __percpu *q) + { + int i; + + for_each_possible_cpu(i) { + const struct gnet_stats_queue *qcpu = per_cpu_ptr(q, i); + +- qstats->qlen = 0; ++ qstats->qlen += qcpu->backlog; + qstats->backlog += qcpu->backlog; + qstats->drops += qcpu->drops; + qstats->requeues += qcpu->requeues; +@@ -299,24 +353,21 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, + } + } + +-void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, +- const struct gnet_stats_queue __percpu *cpu, +- const struct gnet_stats_queue *q, +- __u32 qlen) ++void gnet_stats_add_queue(struct gnet_stats_queue *qstats, ++ const struct gnet_stats_queue __percpu *cpu, ++ const struct gnet_stats_queue *q) + { + if (cpu) { +- __gnet_stats_copy_queue_cpu(qstats, cpu); ++ gnet_stats_add_queue_cpu(qstats, cpu); + } else { +- qstats->qlen = q->qlen; +- qstats->backlog = q->backlog; +- qstats->drops = q->drops; +- qstats->requeues = q->requeues; +- qstats->overlimits = q->overlimits; ++ qstats->qlen += q->qlen; ++ qstats->backlog += q->backlog; ++ qstats->drops += q->drops; ++ qstats->requeues += q->requeues; ++ qstats->overlimits += 
q->overlimits; + } +- +- qstats->qlen = qlen; + } +-EXPORT_SYMBOL(__gnet_stats_copy_queue); ++EXPORT_SYMBOL(gnet_stats_add_queue); + + /** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV +@@ -339,7 +390,8 @@ gnet_stats_copy_queue(struct gnet_dump *d, + { + struct gnet_stats_queue qstats = {0}; + +- __gnet_stats_copy_queue(&qstats, cpu_q, q, qlen); ++ gnet_stats_add_queue(&qstats, cpu_q, q); ++ qstats.qlen = qlen; + + if (d->compat_tc_stats) { + d->tc_stats.drops = qstats.drops; +diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c +index 0d5c422f8745..8aec1b529364 100644 +--- a/net/netfilter/xt_RATEEST.c ++++ b/net/netfilter/xt_RATEEST.c +@@ -94,11 +94,11 @@ static unsigned int + xt_rateest_tg(struct sk_buff *skb, const struct xt_action_param *par) + { + const struct xt_rateest_target_info *info = par->targinfo; +- struct gnet_stats_basic_packed *stats = &info->est->bstats; ++ struct gnet_stats_basic_sync *stats = &info->est->bstats; + + spin_lock_bh(&info->est->lock); +- stats->bytes += skb->len; +- stats->packets++; ++ u64_stats_add(&stats->bytes, skb->len); ++ u64_stats_inc(&stats->packets); + spin_unlock_bh(&info->est->lock); + + return XT_CONTINUE; +@@ -143,6 +143,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) + if (!est) + goto err1; + ++ gnet_stats_basic_sync_init(&est->bstats); + strlcpy(est->name, info->name, sizeof(est->name)); + spin_lock_init(&est->lock); + est->refcnt = 1; +diff --git a/net/sched/act_api.c b/net/sched/act_api.c +index d775676956bf..94c05713ecf8 100644 +--- a/net/sched/act_api.c ++++ b/net/sched/act_api.c +@@ -486,16 +486,18 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + atomic_set(&p->tcfa_bindcnt, 1); + + if (cpustats) { +- p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); ++ p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); + if (!p->cpu_bstats) + goto err1; +- p->cpu_bstats_hw = 
netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); ++ p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); + if (!p->cpu_bstats_hw) + goto err2; + p->cpu_qstats = alloc_percpu(struct gnet_stats_queue); + if (!p->cpu_qstats) + goto err3; + } ++ gnet_stats_basic_sync_init(&p->tcfa_bstats); ++ gnet_stats_basic_sync_init(&p->tcfa_bstats_hw); + spin_lock_init(&p->tcfa_lock); + p->tcfa_index = index; + p->tcfa_tm.install = jiffies; +@@ -505,7 +507,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est, + if (est) { + err = gen_new_estimator(&p->tcfa_bstats, p->cpu_bstats, + &p->tcfa_rate_est, +- &p->tcfa_lock, NULL, est); ++ &p->tcfa_lock, false, est); + if (err) + goto err4; + } +@@ -1141,13 +1143,13 @@ void tcf_action_update_stats(struct tc_action *a, u64 bytes, u64 packets, + u64 drops, bool hw) + { + if (a->cpu_bstats) { +- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); ++ _bstats_update(this_cpu_ptr(a->cpu_bstats), bytes, packets); + + this_cpu_ptr(a->cpu_qstats)->drops += drops; + + if (hw) +- _bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw), +- bytes, packets); ++ _bstats_update(this_cpu_ptr(a->cpu_bstats_hw), ++ bytes, packets); + return; + } + +@@ -1186,9 +1188,10 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p, + if (err < 0) + goto errout; + +- if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 || +- gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw, +- &p->tcfa_bstats_hw) < 0 || ++ if (gnet_stats_copy_basic(&d, p->cpu_bstats, ++ &p->tcfa_bstats, false) < 0 || ++ gnet_stats_copy_basic_hw(&d, p->cpu_bstats_hw, ++ &p->tcfa_bstats_hw, false) < 0 || + gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 || + gnet_stats_copy_queue(&d, p->cpu_qstats, + &p->tcfa_qstats, +diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c +index 2a05bad56ef3..a77d8908e737 100644 +--- a/net/sched/act_bpf.c ++++ b/net/sched/act_bpf.c +@@ -41,7 +41,7 @@ static int 
tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, + int action, filter_res; + + tcf_lastuse_update(&prog->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(prog->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(prog->common.cpu_bstats), skb); + + filter = rcu_dereference(prog->filter); + if (at_ingress) { +diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c +index ec987ec75807..41ba55e60b1b 100644 +--- a/net/sched/act_ife.c ++++ b/net/sched/act_ife.c +@@ -718,7 +718,7 @@ static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a, + u8 *tlv_data; + u16 metalen; + +- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + tcf_lastuse_update(&ife->tcf_tm); + + if (skb_at_tc_ingress(skb)) +@@ -806,7 +806,7 @@ static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a, + exceed_mtu = true; + } + +- bstats_cpu_update(this_cpu_ptr(ife->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(ife->common.cpu_bstats), skb); + tcf_lastuse_update(&ife->tcf_tm); + + if (!metalen) { /* no metadata to send */ +diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c +index d010c5b8e83b..d39b74331c26 100644 +--- a/net/sched/act_mpls.c ++++ b/net/sched/act_mpls.c +@@ -59,7 +59,7 @@ static int tcf_mpls_act(struct sk_buff *skb, const struct tc_action *a, + int ret, mac_len; + + tcf_lastuse_update(&m->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(m->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(m->common.cpu_bstats), skb); + + /* Ensure 'data' points at mac_header prior calling mpls manipulating + * functions. 
+diff --git a/net/sched/act_police.c b/net/sched/act_police.c +index db1d021c16be..d4ac56e4579c 100644 +--- a/net/sched/act_police.c ++++ b/net/sched/act_police.c +@@ -125,7 +125,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, + police->common.cpu_bstats, + &police->tcf_rate_est, + &police->tcf_lock, +- NULL, est); ++ false, est); + if (err) + goto failure; + } else if (tb[TCA_POLICE_AVRATE] && +@@ -262,7 +262,7 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a, + int ret; + + tcf_lastuse_update(&police->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(police->common.cpu_bstats), skb); + + ret = READ_ONCE(police->tcf_action); + p = rcu_dereference_bh(police->params); +diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c +index ca67d9644917..ef35df94182f 100644 +--- a/net/sched/act_sample.c ++++ b/net/sched/act_sample.c +@@ -170,7 +170,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, + int retval; + + tcf_lastuse_update(&s->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(s->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(s->common.cpu_bstats), skb); + retval = READ_ONCE(s->tcf_action); + + psample_group = rcu_dereference_bh(s->psample_group); +diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c +index 788527154025..8c1d60bde93e 100644 +--- a/net/sched/act_simple.c ++++ b/net/sched/act_simple.c +@@ -36,7 +36,8 @@ static int tcf_simp_act(struct sk_buff *skb, const struct tc_action *a, + * then it would look like "hello_3" (without quotes) + */ + pr_info("simple: %s_%llu\n", +- (char *)d->tcfd_defdata, d->tcf_bstats.packets); ++ (char *)d->tcfd_defdata, ++ u64_stats_read(&d->tcf_bstats.packets)); + spin_unlock(&d->tcf_lock); + return d->tcf_action; + } +diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c +index 6088ceaf582e..f6df717b9f17 100644 +--- a/net/sched/act_skbedit.c ++++ b/net/sched/act_skbedit.c 
+@@ -31,7 +31,7 @@ static int tcf_skbedit_act(struct sk_buff *skb, const struct tc_action *a, + int action; + + tcf_lastuse_update(&d->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + params = rcu_dereference_bh(d->params); + action = READ_ONCE(d->tcf_action); +diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c +index ee9cc0abf9e1..2083612d8780 100644 +--- a/net/sched/act_skbmod.c ++++ b/net/sched/act_skbmod.c +@@ -31,7 +31,7 @@ static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, + u64 flags; + + tcf_lastuse_update(&d->tcf_tm); +- bstats_cpu_update(this_cpu_ptr(d->common.cpu_bstats), skb); ++ bstats_update(this_cpu_ptr(d->common.cpu_bstats), skb); + + action = READ_ONCE(d->tcf_action); + if (unlikely(action == TC_ACT_SHOT)) +diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c +index 328db5e1b0ea..c910046bbe4f 100644 +--- a/net/sched/sch_api.c ++++ b/net/sched/sch_api.c +@@ -884,7 +884,7 @@ static void qdisc_offload_graft_root(struct net_device *dev, + static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + u32 portid, u32 seq, u16 flags, int event) + { +- struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL; ++ struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL; + struct gnet_stats_queue __percpu *cpu_qstats = NULL; + struct tcmsg *tcm; + struct nlmsghdr *nlh; +@@ -942,8 +942,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + cpu_qstats = q->cpu_qstats; + } + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q), +- &d, cpu_bstats, &q->bstats) < 0 || ++ if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 || + gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || + gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0) + goto nla_put_failure; +@@ -1275,26 +1274,17 @@ static struct Qdisc *qdisc_create(struct net_device *dev, + rcu_assign_pointer(sch->stab, stab); + } + if 
(tca[TCA_RATE]) { +- seqcount_t *running; +- + err = -EOPNOTSUPP; + if (sch->flags & TCQ_F_MQROOT) { + NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc"); + goto err_out4; + } + +- if (sch->parent != TC_H_ROOT && +- !(sch->flags & TCQ_F_INGRESS) && +- (!p || !(p->flags & TCQ_F_MQROOT))) +- running = qdisc_root_sleeping_running(sch); +- else +- running = &sch->running; +- + err = gen_new_estimator(&sch->bstats, + sch->cpu_bstats, + &sch->rate_est, + NULL, +- running, ++ true, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to generate new estimator"); +@@ -1370,7 +1360,7 @@ static int qdisc_change(struct Qdisc *sch, struct nlattr **tca, + sch->cpu_bstats, + &sch->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + } + out: +diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c +index 33737169cc2d..28e1897e0da7 100644 +--- a/net/sched/sch_atm.c ++++ b/net/sched/sch_atm.c +@@ -52,7 +52,7 @@ struct atm_flow_data { + struct atm_qdisc_data *parent; /* parent qdisc */ + struct socket *sock; /* for closing */ + int ref; /* reference count */ +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct list_head list; + struct atm_flow_data *excess; /* flow for excess traffic; +@@ -551,6 +551,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt, + pr_debug("atm_tc_init(sch %p,[qdisc %p],opt %p)\n", sch, p, opt); + INIT_LIST_HEAD(&p->flows); + INIT_LIST_HEAD(&p->link.list); ++ gnet_stats_basic_sync_init(&p->link.bstats); + list_add(&p->link.list, &p->flows); + p->link.q = qdisc_create_dflt(sch->dev_queue, + &pfifo_qdisc_ops, sch->handle, extack); +@@ -654,8 +655,7 @@ atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + { + struct atm_flow_data *flow = (struct atm_flow_data *)arg; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &flow->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, 
NULL, &flow->bstats, true) < 0 || + gnet_stats_copy_queue(d, NULL, &flow->qstats, flow->q->q.qlen) < 0) + return -1; + +diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c +index 46b3dd71777d..c3a74a2266b0 100644 +--- a/net/sched/sch_cbq.c ++++ b/net/sched/sch_cbq.c +@@ -116,7 +116,7 @@ struct cbq_class { + long avgidle; + long deficit; /* Saved deficit for WRR */ + psched_time_t penalized; +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct tc_cbq_xstats xstats; +@@ -565,8 +565,7 @@ cbq_update(struct cbq_sched_data *q) + long avgidle = cl->avgidle; + long idle; + +- cl->bstats.packets++; +- cl->bstats.bytes += len; ++ _bstats_update(&cl->bstats, len, 1); + + /* + * (now - last) is total time between packet right edges. +@@ -1383,8 +1382,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + if (cl->undertime != PSCHED_PASTPERFECT) + cl->xstats.undertime = cl->undertime - q->now; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) + return -1; +@@ -1518,7 +1516,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator"); +@@ -1610,6 +1608,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t + if (cl == NULL) + goto failure; + ++ gnet_stats_basic_sync_init(&cl->bstats); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); + if (err) { + kfree(cl); +@@ -1618,9 +1617,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, 
struct nlattr **t + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), +- tca[TCA_RATE]); ++ NULL, true, tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Couldn't create new estimator"); + tcf_block_put(cl->block); +diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c +index 80a88e208d2b..4e5b1cf11b85 100644 +--- a/net/sched/sch_drr.c ++++ b/net/sched/sch_drr.c +@@ -19,7 +19,7 @@ struct drr_class { + struct Qdisc_class_common common; + unsigned int filter_cnt; + +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct list_head alist; +@@ -85,8 +85,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), ++ NULL, true, + tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace estimator"); +@@ -106,6 +105,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + if (cl == NULL) + return -ENOBUFS; + ++ gnet_stats_basic_sync_init(&cl->bstats); + cl->common.classid = classid; + cl->quantum = quantum; + cl->qdisc = qdisc_create_dflt(sch->dev_queue, +@@ -118,9 +118,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + + if (tca[TCA_RATE]) { + err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), +- tca[TCA_RATE]); ++ NULL, true, tca[TCA_RATE]); + if (err) { + NL_SET_ERR_MSG(extack, "Failed to replace estimator"); + qdisc_put(cl->qdisc); +@@ -267,8 +265,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg, + if (qlen) + xstats.deficit = cl->deficit; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, 
&cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, cl_q->cpu_qstats, &cl_q->qstats, qlen) < 0) + return -1; +diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c +index 175e07b3d25c..8de4365886e8 100644 +--- a/net/sched/sch_ets.c ++++ b/net/sched/sch_ets.c +@@ -41,7 +41,7 @@ struct ets_class { + struct Qdisc *qdisc; + u32 quantum; + u32 deficit; +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + }; + +@@ -325,8 +325,7 @@ static int ets_class_dump_stats(struct Qdisc *sch, unsigned long arg, + struct ets_class *cl = ets_class_from_arg(sch, arg); + struct Qdisc *cl_q = cl->qdisc; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl_q->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl_q->bstats, true) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + +@@ -661,7 +660,6 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + + q->nbands = nbands; + for (i = nstrict; i < q->nstrict; i++) { +- INIT_LIST_HEAD(&q->classes[i].alist); + if (q->classes[i].qdisc->q.qlen) { + list_add_tail(&q->classes[i].alist, &q->active); + q->classes[i].deficit = quanta[i]; +@@ -689,7 +687,11 @@ static int ets_qdisc_change(struct Qdisc *sch, struct nlattr *opt, + ets_offload_change(sch); + for (i = q->nbands; i < oldbands; i++) { + qdisc_put(q->classes[i].qdisc); +- memset(&q->classes[i], 0, sizeof(q->classes[i])); ++ q->classes[i].qdisc = NULL; ++ q->classes[i].quantum = 0; ++ q->classes[i].deficit = 0; ++ gnet_stats_basic_sync_init(&q->classes[i].bstats); ++ memset(&q->classes[i].qstats, 0, sizeof(q->classes[i].qstats)); + } + return 0; + } +@@ -698,7 +700,7 @@ static int ets_qdisc_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) + { + struct ets_sched *q = qdisc_priv(sch); +- int err; ++ int err, i; + + if (!opt) + return -EINVAL; +@@ -708,6 +710,9 @@ static int ets_qdisc_init(struct 
Qdisc *sch, struct nlattr *opt, + return err; + + INIT_LIST_HEAD(&q->active); ++ for (i = 0; i < TCQ_ETS_MAX_BANDS; i++) ++ INIT_LIST_HEAD(&q->classes[i].alist); ++ + return ets_qdisc_change(sch, opt, extack); + } + +diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c +index 02299785209c..b979ae2f551c 100644 +--- a/net/sched/sch_generic.c ++++ b/net/sched/sch_generic.c +@@ -304,8 +304,8 @@ static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate, + + /* + * Transmit possibly several skbs, and handle the return status as +- * required. Owning running seqcount bit guarantees that +- * only one CPU can execute this function. ++ * required. Owning qdisc running bit guarantees that only one CPU ++ * can execute this function. + * + * Returns to the caller: + * false - hardware queue frozen backoff +@@ -606,7 +606,6 @@ struct Qdisc noop_qdisc = { + .ops = &noop_qdisc_ops, + .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), + .dev_queue = &noop_netdev_queue, +- .running = SEQCNT_ZERO(noop_qdisc.running), + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), + .gso_skb = { + .next = (struct sk_buff *)&noop_qdisc.gso_skb, +@@ -867,7 +866,6 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = { + EXPORT_SYMBOL(pfifo_fast_ops); + + static struct lock_class_key qdisc_tx_busylock; +-static struct lock_class_key qdisc_running_key; + + struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + const struct Qdisc_ops *ops, +@@ -892,11 +890,12 @@ struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue, + __skb_queue_head_init(&sch->gso_skb); + __skb_queue_head_init(&sch->skb_bad_txq); + qdisc_skb_head_init(&sch->q); ++ gnet_stats_basic_sync_init(&sch->bstats); + spin_lock_init(&sch->q.lock); + + if (ops->static_flags & TCQ_F_CPUSTATS) { + sch->cpu_bstats = +- netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu); ++ netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync); + if (!sch->cpu_bstats) + goto errout1; + +@@ -916,10 +915,6 @@ struct Qdisc 
*qdisc_alloc(struct netdev_queue *dev_queue, + lockdep_set_class(&sch->seqlock, + dev->qdisc_tx_busylock ?: &qdisc_tx_busylock); + +- seqcount_init(&sch->running); +- lockdep_set_class(&sch->running, +- dev->qdisc_running_key ?: &qdisc_running_key); +- + sch->ops = ops; + sch->flags = ops->static_flags; + sch->enqueue = ops->enqueue; +diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c +index 621dc6afde8f..1073c76d05c4 100644 +--- a/net/sched/sch_gred.c ++++ b/net/sched/sch_gred.c +@@ -56,6 +56,7 @@ struct gred_sched { + u32 DPs; + u32 def; + struct red_vars wred_set; ++ struct tc_gred_qopt_offload *opt; + }; + + static inline int gred_wred_mode(struct gred_sched *table) +@@ -311,48 +312,50 @@ static void gred_offload(struct Qdisc *sch, enum tc_gred_command command) + { + struct gred_sched *table = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); +- struct tc_gred_qopt_offload opt = { +- .command = command, +- .handle = sch->handle, +- .parent = sch->parent, +- }; ++ struct tc_gred_qopt_offload *opt = table->opt; + + if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc) + return; + ++ memset(opt, 0, sizeof(*opt)); ++ opt->command = command; ++ opt->handle = sch->handle; ++ opt->parent = sch->parent; ++ + if (command == TC_GRED_REPLACE) { + unsigned int i; + +- opt.set.grio_on = gred_rio_mode(table); +- opt.set.wred_on = gred_wred_mode(table); +- opt.set.dp_cnt = table->DPs; +- opt.set.dp_def = table->def; ++ opt->set.grio_on = gred_rio_mode(table); ++ opt->set.wred_on = gred_wred_mode(table); ++ opt->set.dp_cnt = table->DPs; ++ opt->set.dp_def = table->def; + + for (i = 0; i < table->DPs; i++) { + struct gred_sched_data *q = table->tab[i]; + + if (!q) + continue; +- opt.set.tab[i].present = true; +- opt.set.tab[i].limit = q->limit; +- opt.set.tab[i].prio = q->prio; +- opt.set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; +- opt.set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; +- opt.set.tab[i].is_ecn = gred_use_ecn(q); +- 
opt.set.tab[i].is_harddrop = gred_use_harddrop(q); +- opt.set.tab[i].probability = q->parms.max_P; +- opt.set.tab[i].backlog = &q->backlog; ++ opt->set.tab[i].present = true; ++ opt->set.tab[i].limit = q->limit; ++ opt->set.tab[i].prio = q->prio; ++ opt->set.tab[i].min = q->parms.qth_min >> q->parms.Wlog; ++ opt->set.tab[i].max = q->parms.qth_max >> q->parms.Wlog; ++ opt->set.tab[i].is_ecn = gred_use_ecn(q); ++ opt->set.tab[i].is_harddrop = gred_use_harddrop(q); ++ opt->set.tab[i].probability = q->parms.max_P; ++ opt->set.tab[i].backlog = &q->backlog; + } +- opt.set.qstats = &sch->qstats; ++ opt->set.qstats = &sch->qstats; + } + +- dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, &opt); ++ dev->netdev_ops->ndo_setup_tc(dev, TC_SETUP_QDISC_GRED, opt); + } + + static int gred_offload_dump_stats(struct Qdisc *sch) + { + struct gred_sched *table = qdisc_priv(sch); + struct tc_gred_qopt_offload *hw_stats; ++ u64 bytes = 0, packets = 0; + unsigned int i; + int ret; + +@@ -364,9 +367,11 @@ static int gred_offload_dump_stats(struct Qdisc *sch) + hw_stats->handle = sch->handle; + hw_stats->parent = sch->parent; + +- for (i = 0; i < MAX_DPs; i++) ++ for (i = 0; i < MAX_DPs; i++) { ++ gnet_stats_basic_sync_init(&hw_stats->stats.bstats[i]); + if (table->tab[i]) + hw_stats->stats.xstats[i] = &table->tab[i]->stats; ++ } + + ret = qdisc_offload_dump_helper(sch, TC_SETUP_QDISC_GRED, hw_stats); + /* Even if driver returns failure adjust the stats - in case offload +@@ -375,19 +380,19 @@ static int gred_offload_dump_stats(struct Qdisc *sch) + for (i = 0; i < MAX_DPs; i++) { + if (!table->tab[i]) + continue; +- table->tab[i]->packetsin += hw_stats->stats.bstats[i].packets; +- table->tab[i]->bytesin += hw_stats->stats.bstats[i].bytes; ++ table->tab[i]->packetsin += u64_stats_read(&hw_stats->stats.bstats[i].packets); ++ table->tab[i]->bytesin += u64_stats_read(&hw_stats->stats.bstats[i].bytes); + table->tab[i]->backlog += hw_stats->stats.qstats[i].backlog; + +- 
_bstats_update(&sch->bstats, +- hw_stats->stats.bstats[i].bytes, +- hw_stats->stats.bstats[i].packets); ++ bytes += u64_stats_read(&hw_stats->stats.bstats[i].bytes); ++ packets += u64_stats_read(&hw_stats->stats.bstats[i].packets); + sch->qstats.qlen += hw_stats->stats.qstats[i].qlen; + sch->qstats.backlog += hw_stats->stats.qstats[i].backlog; + sch->qstats.drops += hw_stats->stats.qstats[i].drops; + sch->qstats.requeues += hw_stats->stats.qstats[i].requeues; + sch->qstats.overlimits += hw_stats->stats.qstats[i].overlimits; + } ++ _bstats_update(&sch->bstats, bytes, packets); + + kfree(hw_stats); + return ret; +@@ -728,6 +733,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, + static int gred_init(struct Qdisc *sch, struct nlattr *opt, + struct netlink_ext_ack *extack) + { ++ struct gred_sched *table = qdisc_priv(sch); + struct nlattr *tb[TCA_GRED_MAX + 1]; + int err; + +@@ -751,6 +757,12 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt, + sch->limit = qdisc_dev(sch)->tx_queue_len + * psched_mtu(qdisc_dev(sch)); + ++ if (qdisc_dev(sch)->netdev_ops->ndo_setup_tc) { ++ table->opt = kzalloc(sizeof(*table->opt), GFP_KERNEL); ++ if (!table->opt) ++ return -ENOMEM; ++ } ++ + return gred_change_table_def(sch, tb[TCA_GRED_DPS], extack); + } + +@@ -907,6 +919,7 @@ static void gred_destroy(struct Qdisc *sch) + gred_destroy_vq(table->tab[i]); + } + gred_offload(sch, TC_GRED_DESTROY); ++ kfree(table->opt); + } + + static struct Qdisc_ops gred_qdisc_ops __read_mostly = { +diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c +index c802a027b4f3..03efc40e42fc 100644 +--- a/net/sched/sch_hfsc.c ++++ b/net/sched/sch_hfsc.c +@@ -111,7 +111,7 @@ enum hfsc_class_flags { + struct hfsc_class { + struct Qdisc_class_common cl_common; + +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct tcf_proto __rcu *filter_list; /* filter list */ +@@ 
-965,7 +965,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + return err; +@@ -1033,9 +1033,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + + if (tca[TCA_RATE]) { + err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est, +- NULL, +- qdisc_root_sleeping_running(sch), +- tca[TCA_RATE]); ++ NULL, true, tca[TCA_RATE]); + if (err) { + tcf_block_put(cl->block); + kfree(cl); +@@ -1328,7 +1326,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + xstats.work = cl->cl_total; + xstats.rtwork = cl->cl_cumul; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0) + return -1; +@@ -1406,6 +1404,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt, + if (err) + return err; + ++ gnet_stats_basic_sync_init(&q->root.bstats); + q->root.cl_common.classid = sch->handle; + q->root.sched = q; + q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, +diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c +index 8ce999e4ca32..a90e4fffdfd9 100644 +--- a/net/sched/sch_htb.c ++++ b/net/sched/sch_htb.c +@@ -113,8 +113,8 @@ struct htb_class { + /* + * Written often fields + */ +- struct gnet_stats_basic_packed bstats; +- struct gnet_stats_basic_packed bstats_bias; ++ struct gnet_stats_basic_sync bstats; ++ struct gnet_stats_basic_sync bstats_bias; + struct tc_htb_xstats xstats; /* our special stats */ + + /* token bucket parameters */ +@@ -1309,10 +1309,11 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg, + static void htb_offload_aggregate_stats(struct htb_sched *q, + struct htb_class *cl) + { ++ u64 bytes = 0, packets = 0; + struct htb_class 
*c; + unsigned int i; + +- memset(&cl->bstats, 0, sizeof(cl->bstats)); ++ gnet_stats_basic_sync_init(&cl->bstats); + + for (i = 0; i < q->clhash.hashsize; i++) { + hlist_for_each_entry(c, &q->clhash.hash[i], common.hnode) { +@@ -1324,14 +1325,15 @@ static void htb_offload_aggregate_stats(struct htb_sched *q, + if (p != cl) + continue; + +- cl->bstats.bytes += c->bstats_bias.bytes; +- cl->bstats.packets += c->bstats_bias.packets; ++ bytes += u64_stats_read(&c->bstats_bias.bytes); ++ packets += u64_stats_read(&c->bstats_bias.packets); + if (c->level == 0) { +- cl->bstats.bytes += c->leaf.q->bstats.bytes; +- cl->bstats.packets += c->leaf.q->bstats.packets; ++ bytes += u64_stats_read(&c->leaf.q->bstats.bytes); ++ packets += u64_stats_read(&c->leaf.q->bstats.packets); + } + } + } ++ _bstats_update(&cl->bstats, bytes, packets); + } + + static int +@@ -1358,16 +1360,16 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d) + if (cl->leaf.q) + cl->bstats = cl->leaf.q->bstats; + else +- memset(&cl->bstats, 0, sizeof(cl->bstats)); +- cl->bstats.bytes += cl->bstats_bias.bytes; +- cl->bstats.packets += cl->bstats_bias.packets; ++ gnet_stats_basic_sync_init(&cl->bstats); ++ _bstats_update(&cl->bstats, ++ u64_stats_read(&cl->bstats_bias.bytes), ++ u64_stats_read(&cl->bstats_bias.packets)); + } else { + htb_offload_aggregate_stats(q, cl); + } + } + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0) + return -1; +@@ -1582,8 +1584,9 @@ static int htb_destroy_class_offload(struct Qdisc *sch, struct htb_class *cl, + } + + if (cl->parent) { +- cl->parent->bstats_bias.bytes += q->bstats.bytes; +- cl->parent->bstats_bias.packets += q->bstats.packets; ++ _bstats_update(&cl->parent->bstats_bias, ++ u64_stats_read(&q->bstats.bytes), ++ 
u64_stats_read(&q->bstats.packets)); + } + + offload_opt = (struct tc_htb_qopt_offload) { +@@ -1875,6 +1878,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + if (!cl) + goto failure; + ++ gnet_stats_basic_sync_init(&cl->bstats); ++ gnet_stats_basic_sync_init(&cl->bstats_bias); ++ + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); + if (err) { + kfree(cl); +@@ -1884,7 +1890,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE] ? : &est.nla); + if (err) + goto err_block_put; +@@ -1948,8 +1954,9 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + htb_graft_helper(dev_queue, old_q); + goto err_kill_estimator; + } +- parent->bstats_bias.bytes += old_q->bstats.bytes; +- parent->bstats_bias.packets += old_q->bstats.packets; ++ _bstats_update(&parent->bstats_bias, ++ u64_stats_read(&old_q->bstats.bytes), ++ u64_stats_read(&old_q->bstats.packets)); + qdisc_put(old_q); + } + new_q = qdisc_create_dflt(dev_queue, &pfifo_qdisc_ops, +@@ -2009,7 +2016,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + return err; +diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c +index db18d8a860f9..24c5d97d88dd 100644 +--- a/net/sched/sch_mq.c ++++ b/net/sched/sch_mq.c +@@ -153,10 +153,9 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) + struct net_device *dev = qdisc_dev(sch); + struct Qdisc *qdisc; + unsigned int ntx; +- __u32 qlen = 0; + + sch->q.qlen = 0; +- memset(&sch->bstats, 0, sizeof(sch->bstats)); ++ gnet_stats_basic_sync_init(&sch->bstats); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + /* MQ supports lockless qdiscs. 
However, statistics accounting needs +@@ -168,25 +167,11 @@ static int mq_dump(struct Qdisc *sch, struct sk_buff *skb) + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + +- if (qdisc_is_percpu_stats(qdisc)) { +- qlen = qdisc_qlen_sum(qdisc); +- __gnet_stats_copy_basic(NULL, &sch->bstats, +- qdisc->cpu_bstats, +- &qdisc->bstats); +- __gnet_stats_copy_queue(&sch->qstats, +- qdisc->cpu_qstats, +- &qdisc->qstats, qlen); +- sch->q.qlen += qlen; +- } else { +- sch->q.qlen += qdisc->q.qlen; +- sch->bstats.bytes += qdisc->bstats.bytes; +- sch->bstats.packets += qdisc->bstats.packets; +- sch->qstats.qlen += qdisc->qstats.qlen; +- sch->qstats.backlog += qdisc->qstats.backlog; +- sch->qstats.drops += qdisc->qstats.drops; +- sch->qstats.requeues += qdisc->qstats.requeues; +- sch->qstats.overlimits += qdisc->qstats.overlimits; +- } ++ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, ++ &qdisc->bstats, false); ++ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, ++ &qdisc->qstats); ++ sch->q.qlen += qdisc_qlen(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + } +@@ -269,8 +254,7 @@ static int mq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct netdev_queue *dev_queue = mq_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; +- if (gnet_stats_copy_basic(&sch->running, d, sch->cpu_bstats, +- &sch->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, sch->cpu_bstats, &sch->bstats, true) < 0 || + qdisc_qstats_copy(d, sch) < 0) + return -1; + return 0; +diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c +index 50e15add6068..42d4101e4f3d 100644 +--- a/net/sched/sch_mqprio.c ++++ b/net/sched/sch_mqprio.c +@@ -412,7 +412,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) + unsigned int ntx, tc; + + sch->q.qlen = 0; +- memset(&sch->bstats, 0, sizeof(sch->bstats)); ++ gnet_stats_basic_sync_init(&sch->bstats); + memset(&sch->qstats, 0, sizeof(sch->qstats)); + + /* MQ supports lockless qdiscs. 
However, statistics accounting needs +@@ -424,25 +424,11 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) + qdisc = netdev_get_tx_queue(dev, ntx)->qdisc_sleeping; + spin_lock_bh(qdisc_lock(qdisc)); + +- if (qdisc_is_percpu_stats(qdisc)) { +- __u32 qlen = qdisc_qlen_sum(qdisc); +- +- __gnet_stats_copy_basic(NULL, &sch->bstats, +- qdisc->cpu_bstats, +- &qdisc->bstats); +- __gnet_stats_copy_queue(&sch->qstats, +- qdisc->cpu_qstats, +- &qdisc->qstats, qlen); +- sch->q.qlen += qlen; +- } else { +- sch->q.qlen += qdisc->q.qlen; +- sch->bstats.bytes += qdisc->bstats.bytes; +- sch->bstats.packets += qdisc->bstats.packets; +- sch->qstats.backlog += qdisc->qstats.backlog; +- sch->qstats.drops += qdisc->qstats.drops; +- sch->qstats.requeues += qdisc->qstats.requeues; +- sch->qstats.overlimits += qdisc->qstats.overlimits; +- } ++ gnet_stats_add_basic(&sch->bstats, qdisc->cpu_bstats, ++ &qdisc->bstats, false); ++ gnet_stats_add_queue(&sch->qstats, qdisc->cpu_qstats, ++ &qdisc->qstats); ++ sch->q.qlen += qdisc_qlen(qdisc); + + spin_unlock_bh(qdisc_lock(qdisc)); + } +@@ -534,12 +520,13 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + { + if (cl >= TC_H_MIN_PRIORITY) { + int i; +- __u32 qlen = 0; ++ __u32 qlen; + struct gnet_stats_queue qstats = {0}; +- struct gnet_stats_basic_packed bstats = {0}; ++ struct gnet_stats_basic_sync bstats; + struct net_device *dev = qdisc_dev(sch); + struct netdev_tc_txq tc = dev->tc_to_txq[cl & TC_BITMASK]; + ++ gnet_stats_basic_sync_init(&bstats); + /* Drop lock here it will be reclaimed before touching + * statistics this is required because the d->lock we + * hold here is the look on dev_queue->qdisc_sleeping +@@ -554,40 +541,28 @@ static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + + spin_lock_bh(qdisc_lock(qdisc)); + +- if (qdisc_is_percpu_stats(qdisc)) { +- qlen = qdisc_qlen_sum(qdisc); +- +- __gnet_stats_copy_basic(NULL, &bstats, +- qdisc->cpu_bstats, +- &qdisc->bstats); +- 
__gnet_stats_copy_queue(&qstats, +- qdisc->cpu_qstats, +- &qdisc->qstats, +- qlen); +- } else { +- qlen += qdisc->q.qlen; +- bstats.bytes += qdisc->bstats.bytes; +- bstats.packets += qdisc->bstats.packets; +- qstats.backlog += qdisc->qstats.backlog; +- qstats.drops += qdisc->qstats.drops; +- qstats.requeues += qdisc->qstats.requeues; +- qstats.overlimits += qdisc->qstats.overlimits; +- } ++ gnet_stats_add_basic(&bstats, qdisc->cpu_bstats, ++ &qdisc->bstats, false); ++ gnet_stats_add_queue(&qstats, qdisc->cpu_qstats, ++ &qdisc->qstats); ++ sch->q.qlen += qdisc_qlen(qdisc); ++ + spin_unlock_bh(qdisc_lock(qdisc)); + } ++ qlen = qdisc_qlen(sch) + qstats.qlen; + + /* Reclaim root sleeping lock before completing stats */ + if (d->lock) + spin_lock_bh(d->lock); +- if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &bstats, false) < 0 || + gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0) + return -1; + } else { + struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, +- sch->cpu_bstats, &sch->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, sch->cpu_bstats, ++ &sch->bstats, true) < 0 || + qdisc_qstats_copy(d, sch) < 0) + return -1; + } +diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c +index 8b99f07aa3a7..f28050c7f12d 100644 +--- a/net/sched/sch_multiq.c ++++ b/net/sched/sch_multiq.c +@@ -337,8 +337,7 @@ static int multiq_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, &cl_q->bstats, true) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + +diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c +index 2e0b1e7f5466..c03a11dd990f 100644 +--- a/net/sched/sch_prio.c ++++ b/net/sched/sch_prio.c 
+@@ -359,8 +359,8 @@ static int prio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct Qdisc *cl_q; + + cl_q = q->queues[cl - 1]; +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, cl_q->cpu_bstats, &cl_q->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, cl_q->cpu_bstats, ++ &cl_q->bstats, true) < 0 || + qdisc_qstats_copy(d, cl_q) < 0) + return -1; + +diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c +index 4c51aeb78f14..e591c3547b12 100644 +--- a/net/sched/sch_qfq.c ++++ b/net/sched/sch_qfq.c +@@ -131,7 +131,7 @@ struct qfq_class { + + unsigned int filter_cnt; + +- struct gnet_stats_basic_packed bstats; ++ struct gnet_stats_basic_sync bstats; + struct gnet_stats_queue qstats; + struct net_rate_estimator __rcu *rate_est; + struct Qdisc *qdisc; +@@ -452,7 +452,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + err = gen_replace_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + return err; +@@ -466,6 +466,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + if (cl == NULL) + return -ENOBUFS; + ++ gnet_stats_basic_sync_init(&cl->bstats); + cl->common.classid = classid; + cl->deficit = lmax; + +@@ -478,7 +479,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + err = gen_new_estimator(&cl->bstats, NULL, + &cl->rate_est, + NULL, +- qdisc_root_sleeping_running(sch), ++ true, + tca[TCA_RATE]); + if (err) + goto destroy_class; +@@ -640,8 +641,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + xstats.weight = cl->agg->class_weight; + xstats.lmax = cl->agg->lmax; + +- if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), +- d, NULL, &cl->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &cl->bstats, true) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + qdisc_qstats_copy(d, cl->qdisc) < 0) + return -1; +@@ -1235,8 +1235,7 @@ static 
int qfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, + return err; + } + +- cl->bstats.bytes += len; +- cl->bstats.packets += gso_segs; ++ _bstats_update(&cl->bstats, len, gso_segs); + sch->qstats.backlog += len; + ++sch->q.qlen; + +diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c +index e203deacc953..30497d446af5 100644 +--- a/net/sched/sch_taprio.c ++++ b/net/sched/sch_taprio.c +@@ -1987,7 +1987,7 @@ static int taprio_dump_class_stats(struct Qdisc *sch, unsigned long cl, + struct netdev_queue *dev_queue = taprio_queue_get(sch, cl); + + sch = dev_queue->qdisc_sleeping; +- if (gnet_stats_copy_basic(&sch->running, d, NULL, &sch->bstats) < 0 || ++ if (gnet_stats_copy_basic(d, NULL, &sch->bstats, true) < 0 || + qdisc_qstats_copy(d, sch) < 0) + return -1; + return 0; +diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c +index 5ff8f902f14d..2ea5c3f18fd4 100644 +--- a/net/sunrpc/svc_xprt.c ++++ b/net/sunrpc/svc_xprt.c +@@ -441,7 +441,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + if (test_and_set_bit(XPT_BUSY, &xprt->xpt_flags)) + return; + +- cpu = get_cpu(); ++ cpu = get_cpu_light(); + pool = svc_pool_for_cpu(xprt->xpt_server, cpu); + + atomic_long_inc(&pool->sp_stats.packets); +@@ -465,7 +465,7 @@ void svc_xprt_do_enqueue(struct svc_xprt *xprt) + rqstp = NULL; + out_unlock: + rcu_read_unlock(); +- put_cpu(); ++ put_cpu_light(); + trace_svc_xprt_do_enqueue(xprt, rqstp); + } + EXPORT_SYMBOL_GPL(svc_xprt_do_enqueue); +diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c +index 5a90aa527877..642d0748c169 100644 +--- a/samples/kfifo/bytestream-example.c ++++ b/samples/kfifo/bytestream-example.c +@@ -22,10 +22,10 @@ + #define PROC_FIFO "bytestream-fifo" + + /* lock for procfs read access */ +-static DEFINE_MUTEX(read_lock); ++static DEFINE_MUTEX(read_access); + + /* lock for procfs write access */ +-static DEFINE_MUTEX(write_lock); ++static DEFINE_MUTEX(write_access); + + /* + * define DYNAMIC in this example for 
a dynamically allocated fifo. +@@ -116,12 +116,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&write_lock)) ++ if (mutex_lock_interruptible(&write_access)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + +- mutex_unlock(&write_lock); ++ mutex_unlock(&write_access); + if (ret) + return ret; + +@@ -134,12 +134,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&read_lock)) ++ if (mutex_lock_interruptible(&read_access)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + +- mutex_unlock(&read_lock); ++ mutex_unlock(&read_access); + if (ret) + return ret; + +diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c +index e5403d8c971a..c61482ba94f4 100644 +--- a/samples/kfifo/inttype-example.c ++++ b/samples/kfifo/inttype-example.c +@@ -22,10 +22,10 @@ + #define PROC_FIFO "int-fifo" + + /* lock for procfs read access */ +-static DEFINE_MUTEX(read_lock); ++static DEFINE_MUTEX(read_access); + + /* lock for procfs write access */ +-static DEFINE_MUTEX(write_lock); ++static DEFINE_MUTEX(write_access); + + /* + * define DYNAMIC in this example for a dynamically allocated fifo. 
+@@ -109,12 +109,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&write_lock)) ++ if (mutex_lock_interruptible(&write_access)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + +- mutex_unlock(&write_lock); ++ mutex_unlock(&write_access); + if (ret) + return ret; + +@@ -127,12 +127,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&read_lock)) ++ if (mutex_lock_interruptible(&read_access)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + +- mutex_unlock(&read_lock); ++ mutex_unlock(&read_access); + if (ret) + return ret; + +diff --git a/samples/kfifo/record-example.c b/samples/kfifo/record-example.c +index f64f3d62d6c2..e4087b2d3fc4 100644 +--- a/samples/kfifo/record-example.c ++++ b/samples/kfifo/record-example.c +@@ -22,10 +22,10 @@ + #define PROC_FIFO "record-fifo" + + /* lock for procfs read access */ +-static DEFINE_MUTEX(read_lock); ++static DEFINE_MUTEX(read_access); + + /* lock for procfs write access */ +-static DEFINE_MUTEX(write_lock); ++static DEFINE_MUTEX(write_access); + + /* + * define DYNAMIC in this example for a dynamically allocated fifo. 
+@@ -123,12 +123,12 @@ static ssize_t fifo_write(struct file *file, const char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&write_lock)) ++ if (mutex_lock_interruptible(&write_access)) + return -ERESTARTSYS; + + ret = kfifo_from_user(&test, buf, count, &copied); + +- mutex_unlock(&write_lock); ++ mutex_unlock(&write_access); + if (ret) + return ret; + +@@ -141,12 +141,12 @@ static ssize_t fifo_read(struct file *file, char __user *buf, + int ret; + unsigned int copied; + +- if (mutex_lock_interruptible(&read_lock)) ++ if (mutex_lock_interruptible(&read_access)) + return -ERESTARTSYS; + + ret = kfifo_to_user(&test, buf, count, &copied); + +- mutex_unlock(&read_lock); ++ mutex_unlock(&read_access); + if (ret) + return ret; + +diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c +index 3f3f56f6be4d..5dbcdc5b22b5 100644 +--- a/security/smack/smack_lsm.c ++++ b/security/smack/smack_lsm.c +@@ -51,8 +51,10 @@ + #define SMK_RECEIVING 1 + #define SMK_SENDING 2 + ++#ifdef SMACK_IPV6_PORT_LABELING + static DEFINE_MUTEX(smack_ipv6_lock); + static LIST_HEAD(smk_ipv6_port_list); ++#endif + struct kmem_cache *smack_rule_cache; + int smack_enabled __initdata; + +@@ -2603,7 +2605,6 @@ static void smk_ipv6_port_label(struct socket *sock, struct sockaddr *address) + mutex_unlock(&smack_ipv6_lock); + return; + } +-#endif + + /** + * smk_ipv6_port_check - check Smack port access +@@ -2666,6 +2667,7 @@ static int smk_ipv6_port_check(struct sock *sk, struct sockaddr_in6 *address, + + return smk_ipv6_check(skp, object, address, act); + } ++#endif + + /** + * smack_inode_setsecurity - set smack xattrs +@@ -2852,8 +2854,9 @@ static int smack_socket_connect(struct socket *sock, struct sockaddr *sap, + rc = smk_ipv6_check(ssp->smk_out, rsp, sip, + SMK_CONNECTING); + } +- if (__is_defined(SMACK_IPV6_PORT_LABELING)) +- rc = smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); ++#ifdef SMACK_IPV6_PORT_LABELING ++ rc = 
smk_ipv6_port_check(sock->sk, sip, SMK_CONNECTING); ++#endif + + return rc; + } +diff --git a/sound/soc/mediatek/common/mtk-afe-fe-dai.c b/sound/soc/mediatek/common/mtk-afe-fe-dai.c +index e95c7c018e7d..4f2c2379531b 100644 +--- a/sound/soc/mediatek/common/mtk-afe-fe-dai.c ++++ b/sound/soc/mediatek/common/mtk-afe-fe-dai.c +@@ -288,7 +288,6 @@ const struct snd_soc_dai_ops mtk_afe_fe_ops = { + }; + EXPORT_SYMBOL_GPL(mtk_afe_fe_ops); + +-static DEFINE_MUTEX(irqs_lock); + int mtk_dynamic_irq_acquire(struct mtk_base_afe *afe) + { + int i; diff --git a/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb b/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb index cc65d7536..21efc5450 100644 --- a/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb +++ b/meta-digi-arm/recipes-kernel/linux/linux-dey_5.15.bb @@ -7,6 +7,24 @@ SRCBRANCH:stm32mpcommon = "v5.15.118/stm/master" SRCREV = "${AUTOREV}" SRCREV:stm32mpcommon = "${AUTOREV}" +STM_RT_PATCHES = " \ + file://patch-5.15.119-rt65.patch \ + file://0023-5.15-stm32mp-rt-49-r1-CLOCK.patch \ + file://0024-5.15-stm32mp-rt-49-r1-DMA.patch \ + file://0025-5.15-stm32mp-rt-49-r1-MFD.patch \ + file://0026-5.15-stm32mp-rt-49-r1-NET-TTY.patch \ + file://0027-5.15-stm32mp-rt-49-r1-DEVICETREE.patch \ + file://0028-5.15-stm32mp-rt-49-r1-CONFIG.patch \ +" + +SRC_URI:append:stm32mpcommon = " \ + ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${STM_RT_PATCHES}', '', d)} \ +" + +KERNEL_CONFIG_FRAGMENTS:append:stm32mpcommon = " ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${S}/arch/arm/configs/fragment-07-rt.config', '', d)}" +KERNEL_CONFIG_FRAGMENTS:append:stm32mpcommon = " ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${S}/arch/arm/configs/fragment-07-rt-sysvinit.config', '', d)}" +KERNEL_CONFIG_FRAGMENTS:append:ccmp13 = " ${@bb.utils.contains('DISTRO_FEATURES', 'rt', '${S}/arch/arm/configs/fragment-08-rt-mp13.config', '', d)}" + do_assemble_fitimage:append:ccmp1() { # # Step 9: Add public keys to the different U-Boot dtb files