meta-digi/meta-digi-arm/recipes-kernel/linux/linux-dey/ccimx93/0006-kernel-add-NXP-RT-supp...

From 117dccc8d01ab70decb5d6a529f4780aaba20e0b Mon Sep 17 00:00:00 2001
From: Mike Engel <Mike.Engel@digi.com>
Date: Sat, 24 Feb 2024 17:03:36 +0100
Subject: [PATCH 06/10] kernel: add NXP RT support

Signed-off-by: Mike Engel <Mike.Engel@digi.com>
---
 kernel/Kconfig.preempt      |   6 +
 kernel/bpf/syscall.c        |   4 +-
 kernel/dma/coherent.c       |  15 +-
 kernel/entry/common.c       |   4 +-
 kernel/hung_task.c          |  11 +-
 kernel/ksysfs.c             |  12 +
 kernel/panic.c              |  10 +-
 kernel/printk/internal.h    |   2 +
 kernel/printk/printk.c      | 890 ++++++++++++++++++++++++++++++++----
 kernel/printk/printk_safe.c |  32 ++
 kernel/rcu/rcutorture.c     |   6 +
 kernel/rcu/tree_stall.h     |   2 +
 kernel/reboot.c             |  16 +-
 kernel/sched/core.c         | 160 ++++++-
 kernel/sched/fair.c         |  16 +-
 kernel/sched/features.h     |   3 +
 kernel/sched/sched.h        |   9 +
 kernel/signal.c             |   8 +-
 kernel/softirq.c            |  82 +++-
 kernel/time/hrtimer.c       |   4 +-
 kernel/time/tick-sched.c    |   2 +-
 kernel/time/timer.c         |   2 +-
 kernel/trace/trace.c        |  50 +-
 kernel/trace/trace_events.c |   1 +
 kernel/trace/trace_output.c |  18 +-
 kernel/watchdog.c           |   4 +
 kernel/watchdog_hld.c       |   4 +
 27 files changed, 1234 insertions(+), 139 deletions(-)

diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..260c08efeb48 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,5 +1,11 @@
 # SPDX-License-Identifier: GPL-2.0-only

+config HAVE_PREEMPT_LAZY
+	bool
+
+config PREEMPT_LAZY
+	def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT
+
 config PREEMPT_NONE_BUILD
 	bool

diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0c8b7733573e..c0915e2424f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2115,11 +2115,11 @@ static void bpf_prog_get_stats(const struct bpf_prog *prog,

 		st = per_cpu_ptr(prog->stats, cpu);
 		do {
-			start = u64_stats_fetch_begin_irq(&st->syncp);
+			start = u64_stats_fetch_begin(&st->syncp);
 			tnsecs = u64_stats_read(&st->nsecs);
 			tcnt = u64_stats_read(&st->cnt);
 			tmisses = u64_stats_read(&st->misses);
-		} while (u64_stats_fetch_retry_irq(&st->syncp, start));
+		} while (u64_stats_fetch_retry(&st->syncp, start));
 		nsecs += tnsecs;
 		cnt += tcnt;
 		misses += tmisses;
diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c
index c21abc77c53e..f15ba6c6358e 100644
--- a/kernel/dma/coherent.c
+++ b/kernel/dma/coherent.c
@@ -36,7 +36,8 @@ static inline dma_addr_t dma_get_device_base(struct device *dev,
 }

 static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
-		dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset)
+		dma_addr_t device_addr, size_t size, bool use_dma_pfn_offset,
+		bool cacheable)
 {
 	struct dma_coherent_mem *dma_mem;
 	int pages = size >> PAGE_SHIFT;
@@ -45,7 +46,8 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
 	if (!size)
 		return ERR_PTR(-EINVAL);

-	mem_base = memremap(phys_addr, size, MEMREMAP_WC);
+	mem_base = memremap(phys_addr, size, cacheable ? MEMREMAP_WB :
+			    MEMREMAP_WC);
 	if (!mem_base)
 		return ERR_PTR(-EINVAL);

@@ -119,8 +121,10 @@ int dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
 {
 	struct dma_coherent_mem *mem;
 	int ret;
+	bool cacheable = dev_is_dma_coherent(dev);

-	mem = dma_init_coherent_memory(phys_addr, device_addr, size, false);
+	mem = dma_init_coherent_memory(phys_addr, device_addr, size, false,
+				       cacheable);
 	if (IS_ERR(mem))
 		return PTR_ERR(mem);

@@ -310,7 +314,7 @@ int dma_init_global_coherent(phys_addr_t phys_addr, size_t size)
 {
 	struct dma_coherent_mem *mem;

-	mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true);
+	mem = dma_init_coherent_memory(phys_addr, phys_addr, size, true, false);
 	if (IS_ERR(mem))
 		return PTR_ERR(mem);
 	dma_coherent_default_memory = mem;
@@ -335,9 +339,10 @@ static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev)
 {
 	if (!rmem->priv) {
 		struct dma_coherent_mem *mem;
+		bool cacheable = dev_is_dma_coherent(dev);

 		mem = dma_init_coherent_memory(rmem->base, rmem->base,
-					       rmem->size, true);
+					       rmem->size, true, cacheable);
 		if (IS_ERR(mem))
 			return PTR_ERR(mem);
 		rmem->priv = mem;
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index be61332c66b5..c6301e520d47 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -155,7 +155,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,

 		local_irq_enable_exit_to_user(ti_work);

-		if (ti_work & _TIF_NEED_RESCHED)
+		if (ti_work & _TIF_NEED_RESCHED_MASK)
 			schedule();

 		if (ti_work & _TIF_UPROBE)
@@ -386,7 +386,7 @@ void raw_irqentry_exit_cond_resched(void)
 		rcu_irq_exit_check_preempt();
 		if (IS_ENABLED(CONFIG_DEBUG_ENTRY))
 			WARN_ON_ONCE(!on_thread_stack());
-		if (need_resched())
+		if (should_resched(0))
 			preempt_schedule_irq();
 	}
 }
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index c71889f3f3fc..e2d2344cb9f4 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -127,6 +127,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 	 * complain:
 	 */
 	if (sysctl_hung_task_warnings) {
+		printk_prefer_direct_enter();
+
 		if (sysctl_hung_task_warnings > 0)
 			sysctl_hung_task_warnings--;
 		pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
@@ -142,6 +144,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)

 		if (sysctl_hung_task_all_cpu_backtrace)
 			hung_task_show_all_bt = true;
+
+		printk_prefer_direct_exit();
 	}

 	touch_nmi_watchdog();
@@ -212,12 +216,17 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	}
  unlock:
 	rcu_read_unlock();
-	if (hung_task_show_lock)
+	if (hung_task_show_lock) {
+		printk_prefer_direct_enter();
 		debug_show_all_locks();
+		printk_prefer_direct_exit();
+	}

 	if (hung_task_show_all_bt) {
 		hung_task_show_all_bt = false;
+		printk_prefer_direct_enter();
 		trigger_all_cpu_backtrace();
+		printk_prefer_direct_exit();
 	}

 	if (hung_task_call_panic)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65dba9076f31..ab18048e2186 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -142,6 +142,15 @@ KERNEL_ATTR_RO(vmcoreinfo);

 #endif /* CONFIG_CRASH_CORE */

+#if defined(CONFIG_PREEMPT_RT)
+static ssize_t realtime_show(struct kobject *kobj,
+			     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", 1);
+}
+KERNEL_ATTR_RO(realtime);
+#endif
+
 /* whether file capabilities are enabled */
 static ssize_t fscaps_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
@@ -232,6 +241,9 @@ static struct attribute * kernel_attrs[] = {
 #ifndef CONFIG_TINY_RCU
 	&rcu_expedited_attr.attr,
 	&rcu_normal_attr.attr,
+#endif
+#ifdef CONFIG_PREEMPT_RT
+	&realtime_attr.attr,
 #endif
 	NULL
 };
diff --git a/kernel/panic.c b/kernel/panic.c
index 63e94f3bd8dc..97cc495d95f8 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -322,7 +322,6 @@ void panic(const char *fmt, ...)
 		panic_smp_self_stop();

 	console_verbose();
-	bust_spinlocks(1);
 	va_start(args, fmt);
 	len = vscnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
@@ -339,6 +338,11 @@ void panic(const char *fmt, ...)
 		dump_stack();
 #endif

+	/* If atomic consoles are available, flush the kernel log. */
+	console_flush_on_panic(CONSOLE_ATOMIC_FLUSH_PENDING);
+
+	bust_spinlocks(1);
+
 	/*
 	 * If kgdb is enabled, give it a chance to run before we stop all
 	 * the other CPUs or else we won't be able to debug processes left
@@ -653,6 +657,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,
 {
 	disable_trace_on_warning();

+	printk_prefer_direct_enter();
+
 	if (file)
 		pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n",
 			raw_smp_processor_id(), current->pid, file, line,
@@ -681,6 +687,8 @@ void __warn(const char *file, int line, void *caller, unsigned taint,

 	/* Just a warning, don't kill lockdep. */
 	add_taint(taint, LOCKDEP_STILL_OK);
+
+	printk_prefer_direct_exit();
 }

 #ifndef __WARN_FLAGS
diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h
index d947ca6c84f9..e7d8578860ad 100644
--- a/kernel/printk/internal.h
+++ b/kernel/printk/internal.h
@@ -20,6 +20,8 @@ enum printk_info_flags {
 	LOG_CONT	= 8,	/* text is a fragment of a continuation line */
 };

+extern bool block_console_kthreads;
+
 __printf(4, 0)
 int vprintk_store(int facility, int level,
 		  const struct dev_printk_info *dev_info,
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index cc53fb77f77c..b245b2f60e6a 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -44,6 +44,7 @@
 #include <linux/irq_work.h>
 #include <linux/ctype.h>
 #include <linux/uio.h>
+#include <linux/clocksource.h>
 #include <linux/sched/clock.h>
 #include <linux/sched/debug.h>
 #include <linux/sched/task_stack.h>
@@ -220,6 +221,36 @@ int devkmsg_sysctl_set_loglvl(struct ctl_table *table, int write,
 }
 #endif /* CONFIG_PRINTK && CONFIG_SYSCTL */

+/*
+ * Used to synchronize printing kthreads against direct printing via
+ * console_trylock/console_unlock.
+ *
+ * Values:
+ * -1 = console kthreads atomically blocked (via global trylock)
+ *  0 = no kthread printing, console not locked (via trylock)
+ * >0 = kthread(s) actively printing
+ *
+ * Note: For synchronizing against direct printing via
+ *       console_lock/console_unlock, see the @lock variable in
+ *       struct console.
+ */
+static atomic_t console_kthreads_active = ATOMIC_INIT(0);
+
+#define console_kthreads_atomic_tryblock() \
+	(atomic_cmpxchg(&console_kthreads_active, 0, -1) == 0)
+#define console_kthreads_atomic_unblock() \
+	atomic_cmpxchg(&console_kthreads_active, -1, 0)
+#define console_kthreads_atomically_blocked() \
+	(atomic_read(&console_kthreads_active) == -1)
+
+#define console_kthread_printing_tryenter() \
+	atomic_inc_unless_negative(&console_kthreads_active)
+#define console_kthread_printing_exit() \
+	atomic_dec(&console_kthreads_active)
+
+/* Block console kthreads to avoid processing new messages. */
+bool block_console_kthreads;
+
 /*
  * Helper macros to handle lockdep when locking/unlocking console_sem. We use
  * macros instead of functions so that _RET_IP_ contains useful information.
@@ -268,14 +299,49 @@ static bool panic_in_progress(void)
 }

 /*
- * This is used for debugging the mess that is the VT code by
- * keeping track if we have the console semaphore held. It's
- * definitely not the perfect debug tool (we don't know if _WE_
- * hold it and are racing, but it helps tracking those weird code
- * paths in the console code where we end up in places I want
- * locked without the console semaphore held).
+ * Tracks whether kthread printers are all blocked. A value of true implies
+ * that the console is locked via console_lock() or the console is suspended.
+ * Writing to this variable requires holding @console_sem.
+ */
+static bool console_kthreads_blocked;
+
+/*
+ * Block all kthread printers from a schedulable context.
+ *
+ * Requires holding @console_sem.
+ */
+static void console_kthreads_block(void)
+{
+	struct console *con;
+
+	for_each_console(con) {
+		mutex_lock(&con->lock);
+		con->blocked = true;
+		mutex_unlock(&con->lock);
+	}
+
+	console_kthreads_blocked = true;
+}
+
+/*
+ * Unblock all kthread printers from a schedulable context.
+ *
+ * Requires holding @console_sem.
  */
-static int console_locked, console_suspended;
+static void console_kthreads_unblock(void)
+{
+	struct console *con;
+
+	for_each_console(con) {
+		mutex_lock(&con->lock);
+		con->blocked = false;
+		mutex_unlock(&con->lock);
+	}
+
+	console_kthreads_blocked = false;
+}
+
+static int console_suspended;

 /*
  *	Array of consoles built from command line options (console=)
@@ -358,7 +424,75 @@ static int console_msg_format = MSG_FORMAT_DEFAULT;
 /* syslog_lock protects syslog_* variables and write access to clear_seq. */
 static DEFINE_MUTEX(syslog_lock);

+/*
+ * A flag to signify if printk_activate_kthreads() has already started the
+ * kthread printers. If true, any later registered consoles must start their
+ * own kthread directly. The flag is write protected by the console_lock.
+ */
+static bool printk_kthreads_available;
+
 #ifdef CONFIG_PRINTK
+static atomic_t printk_prefer_direct = ATOMIC_INIT(0);
+
+/**
+ * printk_prefer_direct_enter - cause printk() calls to attempt direct
+ *                              printing to all enabled consoles
+ *
+ * Since it is not possible to call into the console printing code from any
+ * context, there is no guarantee that direct printing will occur.
+ *
+ * This globally effects all printk() callers.
+ *
+ * Context: Any context.
+ */
+void printk_prefer_direct_enter(void)
+{
+	atomic_inc(&printk_prefer_direct);
+}
+
+/**
+ * printk_prefer_direct_exit - restore printk() behavior
+ *
+ * Context: Any context.
+ */
+void printk_prefer_direct_exit(void)
+{
+	WARN_ON(atomic_dec_if_positive(&printk_prefer_direct) < 0);
+}
+
+/*
+ * Calling printk() always wakes kthread printers so that they can
+ * flush the new message to their respective consoles. Also, if direct
+ * printing is allowed, printk() tries to flush the messages directly.
+ *
+ * Direct printing is allowed in situations when the kthreads
+ * are not available or the system is in a problematic state.
+ *
+ * See the implementation about possible races.
+ */
+static inline bool allow_direct_printing(void)
+{
+	/*
+	 * Checking kthread availability is a possible race because the
+	 * kthread printers can become permanently disabled during runtime.
+	 * However, doing that requires holding the console_lock, so any
+	 * pending messages will be direct printed by console_unlock().
+	 */
+	if (!printk_kthreads_available)
+		return true;
+
+	/*
+	 * Prefer direct printing when the system is in a problematic state.
+	 * The context that sets this state will always see the updated value.
+	 * The other contexts do not care. Anyway, direct printing is just a
+	 * best effort. The direct output is only possible when console_lock
+	 * is not already taken and no kthread printers are actively printing.
+	 */
+	return (system_state > SYSTEM_RUNNING ||
+		oops_in_progress ||
+		atomic_read(&printk_prefer_direct));
+}
+
 DECLARE_WAIT_QUEUE_HEAD(log_wait);
 /* All 3 protected by @syslog_lock. */
 /* the next printk record to read by syslog(READ) or /proc/kmsg */
@@ -1847,6 +1981,7 @@ static int console_lock_spinning_disable_and_check(void)
 	return 1;
 }

+#if !IS_ENABLED(CONFIG_PREEMPT_RT)
 /**
  * console_trylock_spinning - try to get console_lock by busy waiting
  *
@@ -1920,6 +2055,7 @@ static int console_trylock_spinning(void)

 	return 1;
 }
+#endif /* CONFIG_PREEMPT_RT */

 /*
  * Call the specified console driver, asking it to write out the specified
@@ -1927,19 +2063,28 @@ static int console_trylock_spinning(void)
  * dropped, a dropped message will be written out first.
  */
 static void call_console_driver(struct console *con, const char *text, size_t len,
-				char *dropped_text)
+				char *dropped_text, bool atomic_printing)
 {
+	unsigned long dropped = 0;
 	size_t dropped_len;

-	if (con->dropped && dropped_text) {
+	if (dropped_text)
+		dropped = atomic_long_xchg_relaxed(&con->dropped, 0);
+
+	if (dropped) {
 		dropped_len = snprintf(dropped_text, DROPPED_TEXT_MAX,
 				       "** %lu printk messages dropped **\n",
-				       con->dropped);
-		con->dropped = 0;
-		con->write(con, dropped_text, dropped_len);
+				       dropped);
+		if (atomic_printing)
+			con->write_atomic(con, dropped_text, dropped_len);
+		else
+			con->write(con, dropped_text, dropped_len);
 	}

-	con->write(con, text, len);
+	if (atomic_printing)
+		con->write_atomic(con, text, len);
+	else
+		con->write(con, text, len);
 }

 /*
@@ -2249,10 +2394,22 @@ asmlinkage int vprintk_emit(int facility, int level,
 	printed_len = vprintk_store(facility, level, dev_info, fmt, args);

 	/* If called from the scheduler, we can not call up(). */
-	if (!in_sched) {
+	if (!in_sched && allow_direct_printing()) {
+#if IS_ENABLED(CONFIG_PREEMPT_RT)
+		/*
+		 * Use the non-spinning trylock since PREEMPT_RT does not
+		 * support console lock handovers.
+		 *
+		 * Direct printing will most likely involve taking spinlocks.
+		 * For PREEMPT_RT, this is only allowed if in a preemptible
+		 * context.
+		 */
+		if (preemptible() && console_trylock())
+			console_unlock();
+#else
 		/*
 		 * The caller may be holding system-critical or
-		 * timing-sensitive locks. Disable preemption during
+		 * timing-sensitive locks. Disable preemption during direct
 		 * printing of all remaining records to all consoles so that
 		 * this context can return as soon as possible. Hopefully
 		 * another printk() caller will take over the printing.
@@ -2267,6 +2424,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 		if (console_trylock_spinning())
 			console_unlock();
 		preempt_enable();
+#endif
 	}

 	if (in_sched)
@@ -2297,9 +2455,81 @@ asmlinkage __visible int _printk(const char *fmt, ...)
 }
 EXPORT_SYMBOL(_printk);

+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+static void __free_atomic_data(struct console_atomic_data *d)
+{
+	kfree(d->text);
+	kfree(d->ext_text);
+	kfree(d->dropped_text);
+}
+
+static void free_atomic_data(struct console_atomic_data *d)
+{
+	int count = 1;
+	int i;
+
+	if (!d)
+		return;
+
+#ifdef CONFIG_HAVE_NMI
+	count = 2;
+#endif
+
+	for (i = 0; i < count; i++)
+		__free_atomic_data(&d[i]);
+	kfree(d);
+}
+
+static int __alloc_atomic_data(struct console_atomic_data *d, short flags)
+{
+	d->text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+	if (!d->text)
+		return -1;
+
+	if (flags & CON_EXTENDED) {
+		d->ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+		if (!d->ext_text)
+			return -1;
+	} else {
+		d->dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
+		if (!d->dropped_text)
+			return -1;
+	}
+
+	return 0;
+}
+
+static struct console_atomic_data *alloc_atomic_data(short flags)
+{
+	struct console_atomic_data *d;
+	int count = 1;
+	int i;
+
+#ifdef CONFIG_HAVE_NMI
+	count = 2;
+#endif
+
+	d = kzalloc(sizeof(*d) * count, GFP_KERNEL);
+	if (!d)
+		goto err_out;
+
+	for (i = 0; i < count; i++) {
+		if (__alloc_atomic_data(&d[i], flags) != 0)
+			goto err_out;
+	}
+
+	return d;
+err_out:
+	free_atomic_data(d);
+	return NULL;
+}
+#endif /* CONFIG_HAVE_ATOMIC_CONSOLE */
+
 static bool pr_flush(int timeout_ms, bool reset_on_progress);
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress);

+static void printk_start_kthread(struct console *con);
+
 #else /* CONFIG_PRINTK */

 #define CONSOLE_LOG_MAX		0
@@ -2310,6 +2540,8 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 #define prb_first_valid_seq(rb)		0
 #define prb_next_seq(rb)		0

+#define free_atomic_data(d)
+
 static u64 syslog_seq;

 static size_t record_print_text(const struct printk_record *r,
@@ -2328,12 +2560,14 @@ static ssize_t msg_print_ext_body(char *buf, size_t size,
 static void console_lock_spinning_enable(void) { }
 static int console_lock_spinning_disable_and_check(void) { return 0; }
 static void call_console_driver(struct console *con, const char *text, size_t len,
-				char *dropped_text)
+				char *dropped_text, bool atomic_printing)
 {
 }
 static bool suppress_message_printing(int level) { return false; }
 static bool pr_flush(int timeout_ms, bool reset_on_progress) { return true; }
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress) { return true; }
+static void printk_start_kthread(struct console *con) { }
+static bool allow_direct_printing(void) { return true; }

 #endif /* CONFIG_PRINTK */

@@ -2552,6 +2786,14 @@ static int console_cpu_notify(unsigned int cpu)
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
+		else {
+			/*
+			 * If a new CPU comes online, the conditions for
+			 * printer_should_wake() may have changed for some
+			 * kthread printer with !CON_ANYTIME.
+			 */
+			wake_up_klogd();
+		}
 	}
 	return 0;
 }
@@ -2594,7 +2836,7 @@ void console_lock(void)
 	down_console_sem();
 	if (console_suspended)
 		return;
-	console_locked = 1;
+	console_kthreads_block();
 	console_may_schedule = 1;
 }
 EXPORT_SYMBOL(console_lock);
@@ -2618,30 +2860,36 @@ int console_trylock(void)
 		up_console_sem();
 		return 0;
 	}
-	console_locked = 1;
+	if (!console_kthreads_atomic_tryblock()) {
+		up_console_sem();
+		return 0;
+	}
 	console_may_schedule = 0;
 	return 1;
 }
 EXPORT_SYMBOL(console_trylock);

+/*
+ * This is used to help to make sure that certain paths within the VT code are
+ * running with the console lock held. It is definitely not the perfect debug
+ * tool (it is not known if the VT code is the task holding the console lock),
+ * but it helps tracking those weird code paths in the console code such as
+ * when the console is suspended: where the console is not locked but no
+ * console printing may occur.
+ *
+ * Note: This returns true when the console is suspended but is not locked.
+ *       This is intentional because the VT code must consider that situation
+ *       the same as if the console was locked.
+ */
 int is_console_locked(void)
 {
-	return console_locked;
+	return (console_kthreads_blocked || atomic_read(&console_kthreads_active));
 }
 EXPORT_SYMBOL(is_console_locked);

-/*
- * Check if the given console is currently capable and allowed to print
- * records.
- *
- * Requires the console_lock.
- */
-static inline bool console_is_usable(struct console *con)
+static inline bool __console_is_usable(short flags)
 {
-	if (!(con->flags & CON_ENABLED))
-		return false;
-
-	if (!con->write)
+	if (!(flags & CON_ENABLED))
 		return false;

 	/*
@@ -2650,18 +2898,116 @@ static inline bool console_is_usable(struct console *con)
 	 * cope (CON_ANYTIME) don't call them until this CPU is officially up.
 	 */
 	if (!cpu_online(raw_smp_processor_id()) &&
-	    !(con->flags & CON_ANYTIME))
+	    !(flags & CON_ANYTIME))
 		return false;

 	return true;
 }

+/*
+ * Check if the given console is currently capable and allowed to print
+ * records.
+ *
+ * Requires holding the console_lock.
+ */
+static inline bool console_is_usable(struct console *con, bool atomic_printing)
+{
+	if (atomic_printing) {
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+		if (!con->write_atomic)
+			return false;
+		if (!con->atomic_data)
+			return false;
+#else
+		return false;
+#endif
+	} else if (!con->write) {
+		return false;
+	}
+
+	return __console_is_usable(con->flags);
+}
+
 static void __console_unlock(void)
 {
-	console_locked = 0;
+	/*
+	 * Depending on whether console_lock() or console_trylock() was used,
+	 * appropriately allow the kthread printers to continue.
+	 */
+	if (console_kthreads_blocked)
+		console_kthreads_unblock();
+	else
+		console_kthreads_atomic_unblock();
+
+	/*
+	 * New records may have arrived while the console was locked.
+	 * Wake the kthread printers to print them.
+	 */
+	wake_up_klogd();
+
 	up_console_sem();
 }

+static u64 read_console_seq(struct console *con)
+{
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+	unsigned long flags;
+	u64 seq2;
+	u64 seq;
+
+	if (!con->atomic_data)
+		return con->seq;
+
+	printk_cpu_sync_get_irqsave(flags);
+
+	seq = con->seq;
+	seq2 = con->atomic_data[0].seq;
+	if (seq2 > seq)
+		seq = seq2;
+#ifdef CONFIG_HAVE_NMI
+	seq2 = con->atomic_data[1].seq;
+	if (seq2 > seq)
+		seq = seq2;
+#endif
+
+	printk_cpu_sync_put_irqrestore(flags);
+
+	return seq;
+#else /* CONFIG_HAVE_ATOMIC_CONSOLE */
+	return con->seq;
+#endif
+}
+
+static void write_console_seq(struct console *con, u64 val, bool atomic_printing)
+{
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+	unsigned long flags;
+	u64 *seq;
+
+	if (!con->atomic_data) {
+		con->seq = val;
+		return;
+	}
+
+	printk_cpu_sync_get_irqsave(flags);
+
+	if (atomic_printing) {
+		seq = &con->atomic_data[0].seq;
+#ifdef CONFIG_HAVE_NMI
+		if (in_nmi())
+			seq = &con->atomic_data[1].seq;
+#endif
+	} else {
+		seq = &con->seq;
+	}
+	*seq = val;
+
+	printk_cpu_sync_put_irqrestore(flags);
+#else /* CONFIG_HAVE_ATOMIC_CONSOLE */
+	con->seq = val;
+#endif
+}
+
 /*
  * Print one record for the given console. The record printed is whatever
  * record is the next available record for the given console.
@@ -2674,36 +3020,47 @@ static void __console_unlock(void)
  * If dropped messages should be printed, @dropped_text is a buffer of size
  * DROPPED_TEXT_MAX. Otherwise @dropped_text must be NULL.
  *
+ * @atomic_printing specifies if atomic printing should be used.
+ *
  * @handover will be set to true if a printk waiter has taken over the
  * console_lock, in which case the caller is no longer holding the
- * console_lock. Otherwise it is set to false.
+ * console_lock. Otherwise it is set to false. A NULL pointer may be provided
+ * to disable allowing the console_lock to be taken over by a printk waiter.
  *
  * Returns false if the given console has no next record to print, otherwise
  * true.
  *
- * Requires the console_lock.
+ * Requires the console_lock if @handover is non-NULL.
+ * Requires con->lock otherwise.
  */
-static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
-				     char *dropped_text, bool *handover)
+static bool __console_emit_next_record(struct console *con, char *text, char *ext_text,
+				       char *dropped_text, bool atomic_printing,
+				       bool *handover)
 {
-	static int panic_console_dropped;
+	static atomic_t panic_console_dropped = ATOMIC_INIT(0);
 	struct printk_info info;
 	struct printk_record r;
 	unsigned long flags;
 	char *write_text;
 	size_t len;
+	u64 seq;

 	prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX);

-	*handover = false;
+	if (handover)
+		*handover = false;

-	if (!prb_read_valid(prb, con->seq, &r))
+	seq = read_console_seq(con);
+
+	if (!prb_read_valid(prb, seq, &r))
 		return false;

-	if (con->seq != r.info->seq) {
-		con->dropped += r.info->seq - con->seq;
-		con->seq = r.info->seq;
-		if (panic_in_progress() && panic_console_dropped++ > 10) {
+	if (seq != r.info->seq) {
+		atomic_long_add((unsigned long)(r.info->seq - seq), &con->dropped);
+		write_console_seq(con, r.info->seq, atomic_printing);
+		seq = r.info->seq;
+		if (panic_in_progress() &&
+		    atomic_fetch_inc_relaxed(&panic_console_dropped) > 10) {
 			suppress_panic_printk = 1;
 			pr_warn_once("Too many dropped messages. Suppress messages on non-panic CPUs to prevent livelock.\n");
 		}
@@ -2711,7 +3068,7 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_

 	/* Skip record that has level above the console loglevel. */
 	if (suppress_message_printing(r.info->level)) {
-		con->seq++;
+		write_console_seq(con, seq + 1, atomic_printing);
 		goto skip;
 	}

@@ -2725,31 +3082,65 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
 		len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time);
 	}

-	/*
-	 * While actively printing out messages, if another printk()
-	 * were to occur on another CPU, it may wait for this one to
-	 * finish. This task can not be preempted if there is a
-	 * waiter waiting to take over.
-	 *
-	 * Interrupts are disabled because the hand over to a waiter
-	 * must not be interrupted until the hand over is completed
-	 * (@console_waiter is cleared).
-	 */
-	printk_safe_enter_irqsave(flags);
-	console_lock_spinning_enable();
+	if (handover) {
+		/*
+		 * While actively printing out messages, if another printk()
+		 * were to occur on another CPU, it may wait for this one to
+		 * finish. This task can not be preempted if there is a
+		 * waiter waiting to take over.
+		 *
+		 * Interrupts are disabled because the hand over to a waiter
+		 * must not be interrupted until the hand over is completed
+		 * (@console_waiter is cleared).
+		 */
+		printk_safe_enter_irqsave(flags);
+		console_lock_spinning_enable();

-	stop_critical_timings();	/* don't trace print latency */
-	call_console_driver(con, write_text, len, dropped_text);
-	start_critical_timings();
+		/* don't trace irqsoff print latency */
+		stop_critical_timings();
+	}

-	con->seq++;
+	call_console_driver(con, write_text, len, dropped_text, atomic_printing);

-	*handover = console_lock_spinning_disable_and_check();
-	printk_safe_exit_irqrestore(flags);
+	write_console_seq(con, seq + 1, atomic_printing);
+
+	if (handover) {
+		start_critical_timings();
+		*handover = console_lock_spinning_disable_and_check();
+		printk_safe_exit_irqrestore(flags);
+	}
 skip:
 	return true;
 }

+/*
+ * Print a record for a given console, but allow another printk() caller to
+ * take over the console_lock and continue printing.
+ *
+ * Requires the console_lock, but depending on @handover after the call, the
+ * caller may no longer have the console_lock.
+ *
+ * See __console_emit_next_record() for argument and return details.
+ */
+static bool console_emit_next_record_transferable(struct console *con, char *text, char *ext_text,
+						  char *dropped_text, bool *handover)
+{
+	/*
+	 * Handovers are only supported if threaded printers are atomically
+	 * blocked. The context taking over the console_lock may be atomic.
+	 *
+	 * PREEMPT_RT also does not support handovers because the spinning
+	 * waiter can cause large latencies.
+	 */
+	if (!console_kthreads_atomically_blocked() ||
+	    IS_ENABLED(CONFIG_PREEMPT_RT)) {
+		*handover = false;
+		handover = NULL;
+	}
+
+	return __console_emit_next_record(con, text, ext_text, dropped_text, false, handover);
+}
+
 /*
  * Print out all remaining records to all consoles.
  *
@@ -2768,8 +3159,8 @@ static bool console_emit_next_record(struct console *con, char *text, char *ext_
  * were flushed to all usable consoles. A returned false informs the caller
  * that everything was not flushed (either there were no usable consoles or
  * another context has taken over printing or it is a panic situation and this
- * is not the panic CPU). Regardless the reason, the caller should assume it
- * is not useful to immediately try again.
+ * is not the panic CPU or direct printing is not preferred). Regardless the
+ * reason, the caller should assume it is not useful to immediately try again.
  *
  * Requires the console_lock.
  */
@@ -2786,24 +3177,26 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 	*handover = false;

 	do {
+		/* Let the kthread printers do the work if they can. */
+		if (!allow_direct_printing())
+			return false;
+
 		any_progress = false;

 		for_each_console(con) {
 			bool progress;

-			if (!console_is_usable(con))
+			if (!console_is_usable(con, false))
 				continue;
 			any_usable = true;

 			if (con->flags & CON_EXTENDED) {
 				/* Extended consoles do not print "dropped messages". */
-				progress = console_emit_next_record(con, &text[0],
-								    &ext_text[0], NULL,
-								    handover);
+				progress = console_emit_next_record_transferable(con, &text[0],
+								&ext_text[0], NULL, handover);
 			} else {
-				progress = console_emit_next_record(con, &text[0],
-								    NULL, &dropped_text[0],
-								    handover);
+				progress = console_emit_next_record_transferable(con, &text[0],
+								NULL, &dropped_text[0], handover);
 			}
 			if (*handover)
 				return false;
@@ -2828,6 +3221,68 @@ static bool console_flush_all(bool do_cond_resched, u64 *next_seq, bool *handove
 	return any_usable;
 }

+#if defined(CONFIG_HAVE_ATOMIC_CONSOLE) && defined(CONFIG_PRINTK)
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+				     char *dropped_text, bool atomic_printing);
+
+static void atomic_console_flush_all(void)
+{
+	unsigned long flags;
+	struct console *con;
+	bool any_progress;
+	int index = 0;
+
+	if (console_suspended)
+		return;
+
+#ifdef CONFIG_HAVE_NMI
+	if (in_nmi())
+		index = 1;
+#endif
+
+	printk_cpu_sync_get_irqsave(flags);
+
+	do {
+		any_progress = false;
+
+		for_each_console(con) {
+			bool progress;
+
+			if (!console_is_usable(con, true))
+				continue;
+
+			if (con->flags & CON_EXTENDED) {
+				/* Extended consoles do not print "dropped messages". */
+				progress = console_emit_next_record(con,
+							&con->atomic_data->text[index],
+							&con->atomic_data->ext_text[index],
+							NULL,
+							true);
+			} else {
+				progress = console_emit_next_record(con,
+							&con->atomic_data->text[index],
+							NULL,
+							&con->atomic_data->dropped_text[index],
+							true);
+			}
+
+			if (!progress)
+				continue;
+			any_progress = true;
+
+			touch_softlockup_watchdog_sync();
+			clocksource_touch_watchdog();
+			rcu_cpu_stall_reset();
+			touch_nmi_watchdog();
+		}
+	} while (any_progress);
+
+	printk_cpu_sync_put_irqrestore(flags);
+}
+#else /* CONFIG_HAVE_ATOMIC_CONSOLE && CONFIG_PRINTK */
+#define atomic_console_flush_all()
+#endif
+
 /**
  * console_unlock - unlock the console system
  *
@@ -2918,10 +3373,13 @@ void console_unblank(void)
 	if (oops_in_progress) {
 		if (down_trylock_console_sem() != 0)
 			return;
+		if (!console_kthreads_atomic_tryblock()) {
+			up_console_sem();
+			return;
+		}
 	} else
 		console_lock();

-	console_locked = 1;
 	console_may_schedule = 0;
 	for_each_console(c)
 		if ((c->flags & CON_ENABLED) && c->unblank)
@@ -2940,6 +3398,11 @@ void console_unblank(void)
  */
 void console_flush_on_panic(enum con_flush_mode mode)
 {
+	if (mode == CONSOLE_ATOMIC_FLUSH_PENDING) {
+		atomic_console_flush_all();
+		return;
+	}
+
 	/*
 	 * If someone else is holding the console lock, trylock will fail
 	 * and may_schedule may be set.  Ignore and proceed to unlock so
@@ -2956,7 +3419,7 @@ void console_flush_on_panic(enum con_flush_mode mode)

 		seq = prb_first_valid_seq(prb);
 		for_each_console(c)
-			c->seq = seq;
+			write_console_seq(c, seq, false);
 	}
 	console_unlock();
 }
@@ -3196,16 +3659,27 @@ void register_console(struct console *newcon)
 		console_drivers->next = newcon;
 	}

-	newcon->dropped = 0;
+	atomic_long_set(&newcon->dropped, 0);
+	newcon->thread = NULL;
+	newcon->blocked = true;
+	mutex_init(&newcon->lock);
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+	newcon->atomic_data = NULL;
+#endif
+
 	if (newcon->flags & CON_PRINTBUFFER) {
 		/* Get a consistent copy of @syslog_seq. */
 		mutex_lock(&syslog_lock);
-		newcon->seq = syslog_seq;
+		write_console_seq(newcon, syslog_seq, false);
 		mutex_unlock(&syslog_lock);
 	} else {
 		/* Begin with next message. */
-		newcon->seq = prb_next_seq(prb);
+		write_console_seq(newcon, prb_next_seq(prb), false);
 	}
+
+	if (printk_kthreads_available)
+		printk_start_kthread(newcon);
+
 	console_unlock();
 	console_sysfs_notify();

@@ -3229,6 +3703,7 @@ EXPORT_SYMBOL(register_console);

 int unregister_console(struct console *console)
 {
+	struct task_struct *thd;
 	struct console *con;
 	int res;

@@ -3266,9 +3741,26 @@ int unregister_console(struct console *console)
 		console_drivers->flags |= CON_CONSDEV;

 	console->flags &= ~CON_ENABLED;
+
+	/*
+	 * console->thread can only be cleared under the console lock. But
+	 * stopping the thread must be done without the console lock. The
+	 * task that clears @thread is the task that stops the kthread.
+	 */
+	thd = console->thread;
+	console->thread = NULL;
+
 	console_unlock();
+
+	if (thd)
+		kthread_stop(thd);
+
 	console_sysfs_notify();

+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+	free_atomic_data(console->atomic_data);
+#endif
+
 	if (console->exit)
 		res = console->exit(console);

@@ -3362,6 +3854,20 @@ static int __init printk_late_init(void)
 }
 late_initcall(printk_late_init);

+static int __init printk_activate_kthreads(void)
+{
+	struct console *con;
+
+	console_lock();
+	printk_kthreads_available = true;
+	for_each_console(con)
+		printk_start_kthread(con);
+	console_unlock();
+
+	return 0;
+}
+early_initcall(printk_activate_kthreads);
+
 #if defined CONFIG_PRINTK
 /* If @con is specified, only wait for that console. Otherwise wait for all. */
 static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progress)
@@ -3385,7 +3891,7 @@ static bool __pr_flush(struct console *con, int timeout_ms, bool reset_on_progre
 		for_each_console(c) {
 			if (con && con != c)
 				continue;
-			if (!console_is_usable(c))
+			if (!console_is_usable(c, false))
 				continue;
 			printk_seq = c->seq;
 			if (printk_seq < seq)
@@ -3444,11 +3950,214 @@ static bool pr_flush(int timeout_ms, bool reset_on_progress)
 	return __pr_flush(NULL, timeout_ms, reset_on_progress);
 }

+static void __printk_fallback_preferred_direct(void)
+{
+	printk_prefer_direct_enter();
+	pr_err("falling back to preferred direct printing\n");
+	printk_kthreads_available = false;
+}
+
+/*
+ * Enter preferred direct printing, but never exit. Mark console threads as
+ * unavailable. The system is then forever in preferred direct printing and
+ * any printing threads will exit.
+ *
+ * Must *not* be called under console_lock. Use
+ * __printk_fallback_preferred_direct() if already holding console_lock.
+ */
+static void printk_fallback_preferred_direct(void)
+{
+	console_lock();
+	__printk_fallback_preferred_direct();
+	console_unlock();
+}
+
+/*
+ * Print a record for a given console, not allowing another printk() caller
+ * to take over. This is appropriate for contexts that do not have the
+ * console_lock.
+ *
+ * See __console_emit_next_record() for argument and return details.
+ */
+static bool console_emit_next_record(struct console *con, char *text, char *ext_text,
+				     char *dropped_text, bool atomic_printing)
+{
+	return __console_emit_next_record(con, text, ext_text, dropped_text,
+					  atomic_printing, NULL);
+}
+
+static bool printer_should_wake(struct console *con, u64 seq)
+{
+	short flags;
+
+	if (kthread_should_stop() || !printk_kthreads_available)
+		return true;
+
+	if (con->blocked ||
+	    console_kthreads_atomically_blocked() ||
+	    block_console_kthreads ||
+	    system_state > SYSTEM_RUNNING ||
+	    oops_in_progress) {
+		return false;
+	}
+
+	/*
+	 * This is an unsafe read from con->flags, but a false positive is
+	 * not a problem. Worst case it would allow the printer to wake up
+	 * although it is disabled. But the printer will notice that when
+	 * attempting to print and instead go back to sleep.
+	 */
+	flags = data_race(READ_ONCE(con->flags));
+
+	if (!__console_is_usable(flags))
+		return false;
+
+	return prb_read_valid(prb, seq, NULL);
+}
+
+static int printk_kthread_func(void *data)
+{
+	struct console *con = data;
+	char *dropped_text = NULL;
+	char *ext_text = NULL;
+	u64 seq = 0;
+	char *text;
+	int error;
+
+#ifdef CONFIG_HAVE_ATOMIC_CONSOLE
+	if (con->write_atomic)
+		con->atomic_data = alloc_atomic_data(con->flags);
+#endif
+
+	text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL);
+	if (!text) {
+		con_printk(KERN_ERR, con, "failed to allocate text buffer\n");
+		printk_fallback_preferred_direct();
+		goto out;
+	}
+
+	if (con->flags & CON_EXTENDED) {
+		ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL);
+		if (!ext_text) {
+			con_printk(KERN_ERR, con, "failed to allocate ext_text buffer\n");
+			printk_fallback_preferred_direct();
+			goto out;
+		}
+	} else {
+		dropped_text = kmalloc(DROPPED_TEXT_MAX, GFP_KERNEL);
+		if (!dropped_text) {
+			con_printk(KERN_ERR, con, "failed to allocate dropped_text buffer\n");
+			printk_fallback_preferred_direct();
+			goto out;
+		}
+	}
+
+	con_printk(KERN_INFO, con, "printing thread started\n");
+	for (;;) {
+		/*
+		 * Guarantee this task is visible on the waitqueue before
+		 * checking the wake condition.
+		 *
+		 * The full memory barrier within set_current_state() of
+		 * prepare_to_wait_event() pairs with the full memory barrier
+		 * within wq_has_sleeper().
+		 *
+		 * This pairs with __wake_up_klogd:A.
+		 */
+		error = wait_event_interruptible(log_wait,
+						 printer_should_wake(con, seq)); /* LMM(printk_kthread_func:A) */
+
+		if (kthread_should_stop() || !printk_kthreads_available)
+			break;
+
+		if (error)
+			continue;
+
+		error = mutex_lock_interruptible(&con->lock);
+		if (error)
+			continue;
+
+		if (con->blocked ||
+		    !console_kthread_printing_tryenter()) {
+			/* Another context has locked the console_lock. */
+			mutex_unlock(&con->lock);
+			continue;
+		}
+
+		/*
+		 * Although this context has not locked the console_lock, it
+		 * is known that the console_lock is not locked and it is not
+		 * possible for any other context to lock the console_lock.
+		 * Therefore it is safe to read con->flags.
+		 */
+
+		if (!__console_is_usable(con->flags)) {
+			console_kthread_printing_exit();
+			mutex_unlock(&con->lock);
+			continue;
+		}
+
+		/*
+		 * Even though the printk kthread is always preemptible, it is
+		 * still not allowed to call cond_resched() from within
+		 * console drivers. The task may become non-preemptible in the
+		 * console driver call chain. For example, vt_console_print()
+		 * takes a spinlock and then can call into fbcon_redraw(),
+		 * which can conditionally invoke cond_resched().
+		 */
+		console_may_schedule = 0;
+		console_emit_next_record(con, text, ext_text, dropped_text, false);
+
+		seq = con->seq;
+
+		console_kthread_printing_exit();
+
+		mutex_unlock(&con->lock);
+	}
+
+	con_printk(KERN_INFO, con, "printing thread stopped\n");
+out:
+	kfree(dropped_text);
+	kfree(ext_text);
+	kfree(text);
+
+	console_lock();
+	/*
+	 * If this kthread is being stopped by another task, con->thread will
+	 * already be NULL. That is fine. The important thing is that it is
+	 * NULL after the kthread exits.
+	 */
+	con->thread = NULL;
+	console_unlock();
+
+	return 0;
+}
+
+/* Must be called under console_lock. */
+static void printk_start_kthread(struct console *con)
+{
+	/*
+	 * Do not start a kthread if there is no write() callback. The
+	 * kthreads assume the write() callback exists.
+	 */
+	if (!con->write)
+		return;
+
+	con->thread = kthread_run(printk_kthread_func, con,
+				  "pr/%s%d", con->name, con->index);
+	if (IS_ERR(con->thread)) {
+		con->thread = NULL;
+		con_printk(KERN_ERR, con, "unable to start printing thread\n");
+		__printk_fallback_preferred_direct();
+		return;
+	}
+}
+
 /*
  * Delayed printk version, for scheduler-internal messages:
  */
-#define PRINTK_PENDING_WAKEUP	0x01
-#define PRINTK_PENDING_OUTPUT	0x02
+#define PRINTK_PENDING_WAKEUP		0x01
+#define PRINTK_PENDING_DIRECT_OUTPUT	0x02

 static DEFINE_PER_CPU(int, printk_pending);

@@ -3456,10 +4165,14 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 {
 	int pending = this_cpu_xchg(printk_pending, 0);

-	if (pending & PRINTK_PENDING_OUTPUT) {
+	if (pending & PRINTK_PENDING_DIRECT_OUTPUT) {
+		printk_prefer_direct_enter();
+
 		/* If trylock fails, someone else is doing the printing */
 		if (console_trylock())
 			console_unlock();
+
+		printk_prefer_direct_exit();
 	}

 	if (pending & PRINTK_PENDING_WAKEUP)
@@ -3484,10 +4197,11 @@ static void __wake_up_klogd(int val)
 	 * prepare_to_wait_event(), which is called after ___wait_event() adds
 	 * the waiter but before it has checked the wait condition.
 	 *
-	 * This pairs with devkmsg_read:A and syslog_print:A.
+	 * This pairs with devkmsg_read:A, syslog_print:A, and
+	 * printk_kthread_func:A.
 	 */
 	if (wq_has_sleeper(&log_wait) || /* LMM(__wake_up_klogd:A) */
-	    (val & PRINTK_PENDING_OUTPUT)) {
+	    (val & PRINTK_PENDING_DIRECT_OUTPUT)) {
 		this_cpu_or(printk_pending, val);
 		irq_work_queue(this_cpu_ptr(&wake_up_klogd_work));
 	}
@@ -3527,7 +4241,17 @@ void defer_console_output(void)
 	 * New messages may have been added directly to the ringbuffer
 	 * using vprintk_store(), so wake any waiters as well.
 	 */
-	__wake_up_klogd(PRINTK_PENDING_WAKEUP | PRINTK_PENDING_OUTPUT);
+	int val = PRINTK_PENDING_WAKEUP;
+
+	/*
+	 * Make sure that some context will print the messages when direct
+	 * printing is allowed. This happens in situations when the kthreads
+	 * may not be as reliable or perhaps unusable.
+	 */
+	if (allow_direct_printing())
+		val |= PRINTK_PENDING_DIRECT_OUTPUT;
+
+	__wake_up_klogd(val);
 }

 void printk_trigger_flush(void)
diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c
index 6d10927a07d8..8e8fd2fb0a5b 100644
--- a/kernel/printk/printk_safe.c
+++ b/kernel/printk/printk_safe.c
@@ -8,7 +8,9 @@
 #include <linux/smp.h>
 #include <linux/cpumask.h>
 #include <linux/printk.h>
+#include <linux/console.h>
 #include <linux/kprobes.h>
+#include <linux/delay.h>

 #include "internal.h"

@@ -45,3 +47,33 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 	return vprintk_default(fmt, args);
 }
 EXPORT_SYMBOL(vprintk);
+
+/**
+ * try_block_console_kthreads() - Try to block console kthreads and
+ *	make the global console_lock() avaialble
+ *
+ * @timeout_ms:        The maximum time (in ms) to wait.
+ *
+ * Prevent console kthreads from starting processing new messages. Wait
+ * until the global console_lock() become available.
+ *
+ * Context: Can be called in any context.
+ */
+void try_block_console_kthreads(int timeout_ms)
+{
+	block_console_kthreads = true;
+
+	/* Do not wait when the console lock could not be safely taken. */
+	if (this_cpu_read(printk_context) || in_nmi())
+		return;
+
+	while (timeout_ms > 0) {
+		if (console_trylock()) {
+			console_unlock();
+			return;
+		}
+
+		udelay(1000);
+		timeout_ms -= 1;
+	}
+}
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 503c2aa845a4..dcd8c0e44c00 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2363,6 +2363,12 @@ static int rcutorture_booster_init(unsigned int cpu)
 		WARN_ON_ONCE(!t);
 		sp.sched_priority = 2;
 		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#ifdef CONFIG_PREEMPT_RT
+		t = per_cpu(timersd, cpu);
+		WARN_ON_ONCE(!t);
+		sp.sched_priority = 2;
+		sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+#endif
 	}

 	/* Don't allow time recalculation while creating a new task. */
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5653560573e2..dcbbcf93d608 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -642,6 +642,7 @@ static void print_cpu_stall(unsigned long gps)
 	 * See Documentation/RCU/stallwarn.rst for info on how to debug
 	 * RCU CPU stall warnings.
 	 */
+	printk_prefer_direct_enter();
 	trace_rcu_stall_warning(rcu_state.name, TPS("SelfDetected"));
 	pr_err("INFO: %s self-detected stall on CPU\n", rcu_state.name);
 	raw_spin_lock_irqsave_rcu_node(rdp->mynode, flags);
@@ -676,6 +677,7 @@ static void print_cpu_stall(unsigned long gps)
 	 */
 	set_tsk_need_resched(current);
 	set_preempt_need_resched();
+	printk_prefer_direct_exit();
 }

 static void check_cpu_stall(struct rcu_data *rdp)
diff --git a/kernel/reboot.c b/kernel/reboot.c
index 3bba88c7ffc6..57cedc330660 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -82,6 +82,7 @@ void kernel_restart_prepare(char *cmd)
 {
 	blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
 	system_state = SYSTEM_RESTART;
+	try_block_console_kthreads(10000);
 	usermodehelper_disable();
 	device_shutdown();
 }
@@ -282,6 +283,7 @@ static void kernel_shutdown_prepare(enum system_states state)
 	blocking_notifier_call_chain(&reboot_notifier_list,
 		(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
 	system_state = state;
+	try_block_console_kthreads(10000);
 	usermodehelper_disable();
 	device_shutdown();
 }
@@ -836,9 +838,11 @@ static int __orderly_reboot(void)
 	ret = run_cmd(reboot_cmd);

 	if (ret) {
+		printk_prefer_direct_enter();
 		pr_warn("Failed to start orderly reboot: forcing the issue\n");
 		emergency_sync();
 		kernel_restart(NULL);
+		printk_prefer_direct_exit();
 	}

 	return ret;
@@ -851,6 +855,7 @@ static int __orderly_poweroff(bool force)
 	ret = run_cmd(poweroff_cmd);

 	if (ret && force) {
+		printk_prefer_direct_enter();
 		pr_warn("Failed to start orderly shutdown: forcing the issue\n");

 		/*
@@ -860,6 +865,7 @@ static int __orderly_poweroff(bool force)
 		 */
 		emergency_sync();
 		kernel_power_off();
+		printk_prefer_direct_exit();
 	}

 	return ret;
@@ -917,6 +923,8 @@ EXPORT_SYMBOL_GPL(orderly_reboot);
  */
 static void hw_failure_emergency_poweroff_func(struct work_struct *work)
 {
+	printk_prefer_direct_enter();
+
 	/*
 	 * We have reached here after the emergency shutdown waiting period has
 	 * expired. This means orderly_poweroff has not been able to shut off
@@ -933,6 +941,8 @@ static void hw_failure_emergency_poweroff_func(struct work_struct *work)
 	 */
 	pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n");
 	emergency_restart();
+
+	printk_prefer_direct_exit();
 }

 static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work,
@@ -971,11 +981,13 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 {
 	static atomic_t allow_proceed = ATOMIC_INIT(1);

+	printk_prefer_direct_enter();
+
 	pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason);

 	/* Shutdown should be initiated only once. */
 	if (!atomic_dec_and_test(&allow_proceed))
-		return;
+		goto out;

 	/*
 	 * Queue a backup emergency shutdown in the event of
@@ -983,6 +995,8 @@ void hw_protection_shutdown(const char *reason, int ms_until_forced)
 	 */
 	hw_failure_emergency_poweroff(ms_until_forced);
 	orderly_poweroff(true);
+out:
+	printk_prefer_direct_exit();
 }
 EXPORT_SYMBOL_GPL(hw_protection_shutdown);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0f6a92737c91..f26292bed420 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1040,6 +1040,46 @@ void resched_curr(struct rq *rq)
 		trace_sched_wake_idle_without_ipi(cpu);
 }

+#ifdef CONFIG_PREEMPT_LAZY
+
+static int tsk_is_polling(struct task_struct *p)
+{
+#ifdef TIF_POLLING_NRFLAG
+	return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
+#else
+	return 0;
+#endif
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+	struct task_struct *curr = rq->curr;
+	int cpu;
+
+	if (!sched_feat(PREEMPT_LAZY)) {
+		resched_curr(rq);
+		return;
+	}
+
+	if (test_tsk_need_resched(curr))
+		return;
+
+	if (test_tsk_need_resched_lazy(curr))
+		return;
+
+	set_tsk_need_resched_lazy(curr);
+
+	cpu = cpu_of(rq);
+	if (cpu == smp_processor_id())
+		return;
+
+	/* NEED_RESCHED_LAZY must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(curr))
+		smp_send_reschedule(cpu);
+}
+#endif
+
 void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -2224,6 +2264,7 @@ void migrate_disable(void)
 	preempt_disable();
 	this_rq()->nr_pinned++;
 	p->migration_disabled = 1;
+	preempt_lazy_disable();
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_disable);
@@ -2255,6 +2296,7 @@ void migrate_enable(void)
 	barrier();
 	p->migration_disabled = 0;
 	this_rq()->nr_pinned--;
+	preempt_lazy_enable();
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(migrate_enable);
@@ -3277,6 +3319,76 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
 }
 #endif /* CONFIG_NUMA_BALANCING */

+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * Consider:
+ *
+ *  set_special_state(X);
+ *
+ *  do_things()
+ *    // Somewhere in there is an rtlock that can be contended:
+ *    current_save_and_set_rtlock_wait_state();
+ *    [...]
+ *    schedule_rtlock(); (A)
+ *    [...]
+ *    current_restore_rtlock_saved_state();
+ *
+ *  schedule(); (B)
+ *
+ * If p->saved_state is anything else than TASK_RUNNING, then p blocked on an
+ * rtlock (A) *before* voluntarily calling into schedule() (B) after setting its
+ * state to X. For things like ptrace (X=TASK_TRACED), the task could have more
+ * work to do upon acquiring the lock in do_things() before whoever called
+ * wait_task_inactive() should return. IOW, we have to wait for:
+ *
+ *   p.saved_state = TASK_RUNNING
+ *   p.__state     = X
+ *
+ * which implies the task isn't blocked on an RT lock and got to schedule() (B).
+ *
+ * Also see comments in ttwu_state_match().
+ */
+
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+	unsigned long flags;
+	bool mismatch;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	if (READ_ONCE(p->__state) & match_state)
+		mismatch = false;
+	else if (READ_ONCE(p->saved_state) & match_state)
+		mismatch = false;
+	else
+		mismatch = true;
+
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+	return mismatch;
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+					bool *wait)
+{
+	if (READ_ONCE(p->__state) & match_state)
+		return true;
+	if (READ_ONCE(p->saved_state) & match_state) {
+		*wait = true;
+		return true;
+	}
+	return false;
+}
+#else
+static __always_inline bool state_mismatch(struct task_struct *p, unsigned int match_state)
+{
+	return !(READ_ONCE(p->__state) & match_state);
+}
+static __always_inline bool state_match(struct task_struct *p, unsigned int match_state,
+					bool *wait)
+{
+	return (READ_ONCE(p->__state) & match_state);
+}
+#endif
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -3295,7 +3407,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p,
  */
 unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
 {
-	int running, queued;
+	bool running, wait;
 	struct rq_flags rf;
 	unsigned long ncsw;
 	struct rq *rq;
@@ -3321,7 +3433,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		 * is actually now running somewhere else!
 		 */
 		while (task_on_cpu(rq, p)) {
-			if (!(READ_ONCE(p->__state) & match_state))
+			if (state_mismatch(p, match_state))
 				return 0;
 			cpu_relax();
 		}
@@ -3334,9 +3446,10 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		rq = task_rq_lock(p, &rf);
 		trace_sched_wait_task(p);
 		running = task_on_cpu(rq, p);
-		queued = task_on_rq_queued(p);
+		wait = task_on_rq_queued(p);
 		ncsw = 0;
-		if (READ_ONCE(p->__state) & match_state)
+
+		if (state_match(p, match_state, &wait))
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
 		task_rq_unlock(rq, p, &rf);

@@ -3366,7 +3479,7 @@ unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
-		if (unlikely(queued)) {
+		if (unlikely(wait)) {
 			ktime_t to = NSEC_PER_SEC / HZ;

 			set_current_state(TASK_UNINTERRUPTIBLE);
@@ -4647,6 +4760,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->on_cpu = 0;
 #endif
 	init_task_preempt_count(p);
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(p)->preempt_lazy_count = 0;
+#endif
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
 	RB_CLEAR_NODE(&p->pushable_dl_tasks);
@@ -6517,6 +6633,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)

 	next = pick_next_task(rq, prev, &rf);
 	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched_lazy(prev);
 	clear_preempt_need_resched();
 #ifdef CONFIG_SCHED_DEBUG
 	rq->last_seen_need_resched_ns = 0;
@@ -6731,6 +6848,30 @@ static void __sched notrace preempt_schedule_common(void)
 	} while (need_resched());
 }

+#ifdef CONFIG_PREEMPT_LAZY
+/*
+ * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is
+ * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as
+ * preempt_lazy_count counter >0.
+ */
+static __always_inline int preemptible_lazy(void)
+{
+	if (test_thread_flag(TIF_NEED_RESCHED))
+		return 1;
+	if (current_thread_info()->preempt_lazy_count)
+		return 0;
+	return 1;
+}
+
+#else
+
+static inline int preemptible_lazy(void)
+{
+	return 1;
+}
+
+#endif
+
 #ifdef CONFIG_PREEMPTION
 /*
  * This is the entry point to schedule() from in-kernel preemption
@@ -6744,6 +6885,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
 	 */
 	if (likely(!preemptible()))
 		return;
+	if (!preemptible_lazy())
+		return;
 	preempt_schedule_common();
 }
 NOKPROBE_SYMBOL(preempt_schedule);
@@ -6791,6 +6934,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
 	if (likely(!preemptible()))
 		return;

+	if (!preemptible_lazy())
+		return;
+
 	do {
 		/*
 		 * Because the function tracer can trace preempt_count_sub()
@@ -9056,7 +9202,9 @@ void __init init_idle(struct task_struct *idle, int cpu)

 	/* Set the preempt count _outside_ the spinlocks! */
 	init_idle_preempt_count(idle, cpu);
-
+#ifdef CONFIG_HAVE_PREEMPT_LAZY
+	task_thread_info(idle)->preempt_lazy_count = 0;
+#endif
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 612873ec2197..9493473e8e16 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4930,7 +4930,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	ideal_runtime = sched_slice(cfs_rq, curr);
 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
 	if (delta_exec > ideal_runtime) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		/*
 		 * The current task ran long enough, ensure it doesn't get
 		 * re-elected due to buddy favours.
@@ -4954,7 +4954,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		return;

 	if (delta > ideal_runtime)
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 }

 static void
@@ -5100,7 +5100,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 * validating it and just reschedule.
 	 */
 	if (queued) {
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 		return;
 	}
 	/*
@@ -5249,7 +5249,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	 * hierarchy can be throttled
 	 */
 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
-		resched_curr(rq_of(cfs_rq));
+		resched_curr_lazy(rq_of(cfs_rq));
 }

 static __always_inline
@@ -6000,7 +6000,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)

 		if (delta < 0) {
 			if (task_current(rq, p))
-				resched_curr(rq);
+				resched_curr_lazy(rq);
 			return;
 		}
 		hrtick_start(rq, delta);
@@ -7729,7 +7729,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	return;

 preempt:
-	resched_curr(rq);
+	resched_curr_lazy(rq);
 	/*
 	 * Only set the backward buddy when the current task is still
 	 * on the rq. This can happen when a wakeup gets interleaved
@@ -11892,7 +11892,7 @@ static void task_fork_fair(struct task_struct *p)
 		 * 'current' within the tree based on its new key value.
 		 */
 		swap(curr->vruntime, se->vruntime);
-		resched_curr(rq);
+		resched_curr_lazy(rq);
 	}

 	se->vruntime -= cfs_rq->min_vruntime;
@@ -11919,7 +11919,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
 	 */
 	if (task_current(rq, p)) {
 		if (p->prio > oldprio)
-			resched_curr(rq);
+			resched_curr_lazy(rq);
 	} else
 		check_preempt_curr(rq, p, 0);
 }
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index ee7f23c76bd3..e13090e33f3c 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -48,6 +48,9 @@ SCHED_FEAT(NONTASK_CAPACITY, true)

 #ifdef CONFIG_PREEMPT_RT
 SCHED_FEAT(TTWU_QUEUE, false)
+# ifdef CONFIG_PREEMPT_LAZY
+SCHED_FEAT(PREEMPT_LAZY, true)
+# endif
 #else

 /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b62d53d7c264..f2577f511a41 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2350,6 +2350,15 @@ extern void reweight_task(struct task_struct *p, int prio);
 extern void resched_curr(struct rq *rq);
 extern void resched_cpu(int cpu);

+#ifdef CONFIG_PREEMPT_LAZY
+extern void resched_curr_lazy(struct rq *rq);
+#else
+static inline void resched_curr_lazy(struct rq *rq)
+{
+	resched_curr(rq);
+}
+#endif
+
 extern struct rt_bandwidth def_rt_bandwidth;
 extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
 extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
diff --git a/kernel/signal.c b/kernel/signal.c
index 5d45f5da2b36..58e919c7c936 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2302,13 +2302,13 @@ static int ptrace_stop(int exit_code, int why, unsigned long message,
 	/*
 	 * Don't want to allow preemption here, because
 	 * sys_ptrace() needs this task to be inactive.
-	 *
-	 * XXX: implement read_unlock_no_resched().
 	 */
-	preempt_disable();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_disable();
 	read_unlock(&tasklist_lock);
 	cgroup_enter_frozen();
-	preempt_enable_no_resched();
+	if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+		preempt_enable_no_resched();
 	schedule();
 	cgroup_leave_frozen(true);

diff --git a/kernel/softirq.c b/kernel/softirq.c
index c8a6913c067d..82f3e68fbe22 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -637,6 +637,24 @@ static inline void tick_irq_exit(void)
 #endif
 }

+#ifdef CONFIG_PREEMPT_RT
+DEFINE_PER_CPU(struct task_struct *, timersd);
+DEFINE_PER_CPU(unsigned long, pending_timer_softirq);
+
+static void wake_timersd(void)
+{
+        struct task_struct *tsk = __this_cpu_read(timersd);
+
+        if (tsk)
+                wake_up_process(tsk);
+}
+
+#else
+
+static inline void wake_timersd(void) { }
+
+#endif
+
 static inline void __irq_exit_rcu(void)
 {
 #ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -649,6 +667,10 @@ static inline void __irq_exit_rcu(void)
 	if (!in_interrupt() && local_softirq_pending())
 		invoke_softirq();

+	if (IS_ENABLED(CONFIG_PREEMPT_RT) && local_pending_timers() &&
+	    !(in_nmi() | in_hardirq()))
+		wake_timersd();
+
 	tick_irq_exit();
 }

@@ -976,12 +998,70 @@ static struct smp_hotplug_thread softirq_threads = {
 	.thread_comm		= "ksoftirqd/%u",
 };

+#ifdef CONFIG_PREEMPT_RT
+static void timersd_setup(unsigned int cpu)
+{
+        sched_set_fifo_low(current);
+}
+
+static int timersd_should_run(unsigned int cpu)
+{
+        return local_pending_timers();
+}
+
+static void run_timersd(unsigned int cpu)
+{
+	unsigned int timer_si;
+
+	ksoftirqd_run_begin();
+
+	timer_si = local_pending_timers();
+	__this_cpu_write(pending_timer_softirq, 0);
+	or_softirq_pending(timer_si);
+
+	__do_softirq();
+
+	ksoftirqd_run_end();
+}
+
+static void raise_ktimers_thread(unsigned int nr)
+{
+	trace_softirq_raise(nr);
+	__this_cpu_or(pending_timer_softirq, 1 << nr);
+}
+
+void raise_hrtimer_softirq(void)
+{
+	raise_ktimers_thread(HRTIMER_SOFTIRQ);
+}
+
+void raise_timer_softirq(void)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	raise_ktimers_thread(TIMER_SOFTIRQ);
+	wake_timersd();
+	local_irq_restore(flags);
+}
+
+static struct smp_hotplug_thread timer_threads = {
+        .store                  = &timersd,
+        .setup                  = timersd_setup,
+        .thread_should_run      = timersd_should_run,
+        .thread_fn              = run_timersd,
+        .thread_comm            = "ktimers/%u",
+};
+#endif
+
 static __init int spawn_ksoftirqd(void)
 {
 	cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
 				  takeover_tasklets);
 	BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
+#ifdef CONFIG_PREEMPT_RT
+	BUG_ON(smpboot_register_percpu_thread(&timer_threads));
+#endif
 	return 0;
 }
 early_initcall(spawn_ksoftirqd);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e4f0e3b0c4f4..9e2cfba065de 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1805,7 +1805,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_hrtimer_softirq();
 	}

 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
@@ -1918,7 +1918,7 @@ void hrtimer_run_queues(void)
 	if (!ktime_before(now, cpu_base->softirq_expires_next)) {
 		cpu_base->softirq_expires_next = KTIME_MAX;
 		cpu_base->softirq_activated = 1;
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+		raise_hrtimer_softirq();
 	}

 	__hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 798e1841d286..b52e1861b913 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -800,7 +800,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)

 static inline bool local_timer_softirq_pending(void)
 {
-	return local_softirq_pending() & BIT(TIMER_SOFTIRQ);
+	return local_pending_timers() & BIT(TIMER_SOFTIRQ);
 }

 static ktime_t tick_nohz_next_event(struct tick_sched *ts, int cpu)
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 717fcb9fb14a..e6219da89933 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1822,7 +1822,7 @@ static void run_local_timers(void)
 		if (time_before(jiffies, base->next_expiry))
 			return;
 	}
-	raise_softirq(TIMER_SOFTIRQ);
+	raise_timer_softirq();
 }

 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9db92a6e1463..0efcfac975b6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2658,11 +2658,19 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status)
 	if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
 		trace_flags |= TRACE_FLAG_BH_OFF;

-	if (tif_need_resched())
+	if (tif_need_resched_now())
 		trace_flags |= TRACE_FLAG_NEED_RESCHED;
+#ifdef CONFIG_PREEMPT_LAZY
+	/* Run out of bits. Share the LAZY and PREEMPT_RESCHED */
+	if (need_resched_lazy())
+		trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY;
+#else
 	if (test_preempt_need_resched())
 		trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
-	return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+#endif
+
+	return (trace_flags << 24) | (min_t(unsigned int, pc & 0xff, 0xf)) |
+		(preempt_lazy_count() & 0xff) << 16 |
 		(min_t(unsigned int, migration_disable_value(), 0xf)) << 4;
 }

@@ -4255,15 +4263,17 @@ unsigned long trace_total_entries(struct trace_array *tr)

 static void print_lat_help_header(struct seq_file *m)
 {
-	seq_puts(m, "#                    _------=> CPU#            \n"
-		    "#                   / _-----=> irqs-off/BH-disabled\n"
-		    "#                  | / _----=> need-resched    \n"
-		    "#                  || / _---=> hardirq/softirq \n"
-		    "#                  ||| / _--=> preempt-depth   \n"
-		    "#                  |||| / _-=> migrate-disable \n"
-		    "#                  ||||| /     delay           \n"
-		    "#  cmd     pid     |||||| time  |   caller     \n"
-		    "#     \\   /        ||||||  \\    |    /       \n");
+	seq_puts(m, "#                    _--------=> CPU#            \n"
+		    "#                   / _-------=> irqs-off/BH-disabled\n"
+		    "#                  | / _------=> need-resched    \n"
+		    "#                  || / _-----=> need-resched-lazy\n"
+		    "#                  ||| / _----=> hardirq/softirq \n"
+		    "#                  |||| / _---=> preempt-depth   \n"
+		    "#                  ||||| / _--=> preempt-lazy-depth\n"
+		    "#                  |||||| / _-=> migrate-disable \n"
+		    "#                  ||||||| /     delay           \n"
+		    "#  cmd     pid     |||||||| time  |   caller     \n"
+		    "#     \\   /        ||||||||  \\    |    /       \n");
 }

 static void print_event_info(struct array_buffer *buf, struct seq_file *m)
@@ -4297,14 +4307,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file

 	print_event_info(buf, m);

-	seq_printf(m, "#                            %.*s  _-----=> irqs-off/BH-disabled\n", prec, space);
-	seq_printf(m, "#                            %.*s / _----=> need-resched\n", prec, space);
-	seq_printf(m, "#                            %.*s| / _---=> hardirq/softirq\n", prec, space);
-	seq_printf(m, "#                            %.*s|| / _--=> preempt-depth\n", prec, space);
-	seq_printf(m, "#                            %.*s||| / _-=> migrate-disable\n", prec, space);
-	seq_printf(m, "#                            %.*s|||| /     delay\n", prec, space);
-	seq_printf(m, "#           TASK-PID  %.*s CPU#  |||||  TIMESTAMP  FUNCTION\n", prec, "     TGID   ");
-	seq_printf(m, "#              | |    %.*s   |   |||||     |         |\n", prec, "       |    ");
+	seq_printf(m, "#                            %.*s  _-------=> irqs-off/BH-disabled\n", prec, space);
+	seq_printf(m, "#                            %.*s / _------=> need-resched\n", prec, space);
+	seq_printf(m, "#                            %.*s| / _-----=> need-resched-lazy\n", prec, space);
+	seq_printf(m, "#                            %.*s|| / _----=> hardirq/softirq\n", prec, space);
+	seq_printf(m, "#                            %.*s||| / _---=> preempt-depth\n", prec, space);
+	seq_printf(m, "#                            %.*s|||| / _--=> preempt-lazy-depth\n", prec, space);
+	seq_printf(m, "#                            %.*s||||| / _-=> migrate-disable\n", prec, space);
+	seq_printf(m, "#                            %.*s|||||| /     delay\n", prec, space);
+	seq_printf(m, "#           TASK-PID  %.*s CPU#  |||||||  TIMESTAMP  FUNCTION\n", prec, "     TGID   ");
+	seq_printf(m, "#              | |    %.*s   |   |||||||      |         |\n", prec, "       |    ");
 }

 void
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9da418442a06..1d8c513d1121 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -208,6 +208,7 @@ static int trace_define_common_fields(void)
 	/* Holds both preempt_count and migrate_disable */
 	__common_field(unsigned char, preempt_count);
 	__common_field(int, pid);
+	__common_field(unsigned char, preempt_lazy_count);

 	return ret;
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5cd4fb656306..3c227e2843ae 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -442,6 +442,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
 	char hardsoft_irq;
 	char need_resched;
+	char need_resched_lazy;
 	char irqs_off;
 	int hardirq;
 	int softirq;
@@ -462,20 +463,27 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)

 	switch (entry->flags & (TRACE_FLAG_NEED_RESCHED |
 				TRACE_FLAG_PREEMPT_RESCHED)) {
+#ifndef CONFIG_PREEMPT_LAZY
 	case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED:
 		need_resched = 'N';
 		break;
+#endif
 	case TRACE_FLAG_NEED_RESCHED:
 		need_resched = 'n';
 		break;
+#ifndef CONFIG_PREEMPT_LAZY
 	case TRACE_FLAG_PREEMPT_RESCHED:
 		need_resched = 'p';
 		break;
+#endif
 	default:
 		need_resched = '.';
 		break;
 	}

+	need_resched_lazy =
+		(entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.';
+
 	hardsoft_irq =
 		(nmi && hardirq)     ? 'Z' :
 		nmi                  ? 'z' :
@@ -484,14 +492,20 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 		softirq              ? 's' :
 		                       '.' ;

-	trace_seq_printf(s, "%c%c%c",
-			 irqs_off, need_resched, hardsoft_irq);
+	trace_seq_printf(s, "%c%c%c%c",
+			 irqs_off, need_resched, need_resched_lazy,
+			 hardsoft_irq);

 	if (entry->preempt_count & 0xf)
 		trace_seq_printf(s, "%x", entry->preempt_count & 0xf);
 	else
 		trace_seq_putc(s, '.');

+	if (entry->preempt_lazy_count)
+		trace_seq_printf(s, "%x", entry->preempt_lazy_count);
+	else
+		trace_seq_putc(s, '.');
+
 	if (entry->preempt_count & 0xf0)
 		trace_seq_printf(s, "%x", entry->preempt_count >> 4);
 	else
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 8e61f21e7e33..41596c415111 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -424,6 +424,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		/* Start period for the next softlockup warning. */
 		update_report_ts();

+		printk_prefer_direct_enter();
+
 		pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
 			smp_processor_id(), duration,
 			current->comm, task_pid_nr(current));
@@ -442,6 +444,8 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 		add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
 		if (softlockup_panic)
 			panic("softlockup: hung tasks");
+
+		printk_prefer_direct_exit();
 	}

 	return HRTIMER_RESTART;
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index 1e8a49dc956e..7c977b945c92 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -135,6 +135,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		if (__this_cpu_read(hard_watchdog_warn) == true)
 			return;

+		printk_prefer_direct_enter();
+
 		pr_emerg("Watchdog detected hard LOCKUP on cpu %d\n",
 			 this_cpu);
 		print_modules();
@@ -155,6 +157,8 @@ static void watchdog_overflow_callback(struct perf_event *event,
 		if (hardlockup_panic)
 			nmi_panic(regs, "Hard LOCKUP");

+		printk_prefer_direct_exit();
+
 		__this_cpu_write(hard_watchdog_warn, true);
 		return;
 	}
--
2.34.1