sched, x86: Provide a per-cpu preempt_count implementation
author Peter Zijlstra <peterz@infradead.org>
Wed, 14 Aug 2013 12:51:00 +0000 (14:51 +0200)
committer Ingo Molnar <mingo@kernel.org>
Wed, 25 Sep 2013 12:07:57 +0000 (14:07 +0200)
Convert x86 to use a per-cpu preemption count. The reason for doing so
is that accessing per-cpu variables is a lot cheaper than accessing
thread_info variables.
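
As a rough sketch (hypothetical, simplified -- not part of the patch): the
per-cpu count can be read with a single segment-prefixed instruction, and
preempt_disable()/preempt_enable() can poke it directly with a single
incl/decl %gs:__preempt_count, whereas the old thread_info based form first
has to compute the thread_info address (from the stack pointer on 32-bit, or
the per-cpu kernel stack base on 64-bit) before it can touch the field:

    /* Sketch only, contrasting the two accessors; details simplified. */
    static __always_inline int percpu_preempt_count(void)
    {
            return __this_cpu_read_4(__preempt_count); /* movl %gs:__preempt_count, %eax */
    }

    static __always_inline int old_ti_preempt_count(void)
    {
            /* address arithmetic to find thread_info, then a dependent load */
            return current_thread_info()->preempt_count;
    }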

We still need to save/restore the actual preemption count due to
PREEMPT_ACTIVE, so we place the per-cpu __preempt_count variable in the
same cache-line as the other hot __switch_to() variables such as
current_task.

NOTE: this save/restore is required even for !PREEMPT kernels, as
cond_resched() also relies on the PREEMPT_ACTIVE bit in preempt_count
to ignore task_struct::state.
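
For context, a loose paraphrase (simplified, not taken from this patch) of
how cond_resched() uses PREEMPT_ACTIVE in this era's kernel/sched/core.c:

    /* paraphrased sketch, not the exact implementation */
    static void __cond_resched(void)
    {
            __preempt_count_add(PREEMPT_ACTIVE);
            __schedule();
            __preempt_count_sub(PREEMPT_ACTIVE);
    }

__schedule() only dequeues a task whose ->state is non-running when
PREEMPT_ACTIVE is clear, so the cond_resched() caller stays runnable no
matter what task_struct::state says -- and its count, with PREEMPT_ACTIVE
set, has to survive the context switch, hence the save/restore above.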

Also rename thread_info::preempt_count to ensure nobody is
'accidentally' still poking at it.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/n/tip-gzn5rfsf8trgjoqx8hyayy3q@git.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/include/asm/Kbuild
arch/x86/include/asm/preempt.h [new file with mode: 0644]
arch/x86/include/asm/thread_info.h
arch/x86/kernel/asm-offsets.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/entry_32.S
arch/x86/kernel/entry_64.S
arch/x86/kernel/irq_32.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c

index eca20286a91c4c7204bcadb335d0cdb42ada0657..7f669853317a3e940647319c21705ca951fb18ac 100644
@@ -5,4 +5,3 @@ genhdr-y += unistd_64.h
 genhdr-y += unistd_x32.h
 
 generic-y += clkdev.h
-generic-y += preempt.h
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
new file mode 100644
index 0000000..1309942
--- /dev/null
@@ -0,0 +1,98 @@
+#ifndef __ASM_PREEMPT_H
+#define __ASM_PREEMPT_H
+
+#include <asm/rmwcc.h>
+#include <asm/percpu.h>
+#include <linux/thread_info.h>
+
+DECLARE_PER_CPU(int, __preempt_count);
+
+/*
+ * We mask the PREEMPT_NEED_RESCHED bit so as not to confuse all current users
+ * that think a non-zero value indicates we cannot preempt.
+ */
+static __always_inline int preempt_count(void)
+{
+       return __this_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
+}
+
+static __always_inline void preempt_count_set(int pc)
+{
+       __this_cpu_write_4(__preempt_count, pc);
+}
+
+/*
+ * must be macros to avoid header recursion hell
+ */
+#define task_preempt_count(p) \
+       (task_thread_info(p)->saved_preempt_count & ~PREEMPT_NEED_RESCHED)
+
+#define init_task_preempt_count(p) do { \
+       task_thread_info(p)->saved_preempt_count = PREEMPT_DISABLED; \
+} while (0)
+
+#define init_idle_preempt_count(p, cpu) do { \
+       task_thread_info(p)->saved_preempt_count = PREEMPT_ENABLED; \
+       per_cpu(__preempt_count, (cpu)) = PREEMPT_ENABLED; \
+} while (0)
+
+/*
+ * We fold the NEED_RESCHED bit into the preempt count such that
+ * preempt_enable() can decrement and test for needing to reschedule with a
+ * single instruction.
+ *
+ * We invert the actual bit, so that when the decrement hits 0 we know we both
+ * need to resched (the bit is cleared) and can resched (no preempt count).
+ */
+
+static __always_inline void set_preempt_need_resched(void)
+{
+       __this_cpu_and_4(__preempt_count, ~PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline void clear_preempt_need_resched(void)
+{
+       __this_cpu_or_4(__preempt_count, PREEMPT_NEED_RESCHED);
+}
+
+static __always_inline bool test_preempt_need_resched(void)
+{
+       return !(__this_cpu_read_4(__preempt_count) & PREEMPT_NEED_RESCHED);
+}
+
+/*
+ * The various preempt_count add/sub methods
+ */
+
+static __always_inline void __preempt_count_add(int val)
+{
+       __this_cpu_add_4(__preempt_count, val);
+}
+
+static __always_inline void __preempt_count_sub(int val)
+{
+       __this_cpu_add_4(__preempt_count, -val);
+}
+
+static __always_inline bool __preempt_count_dec_and_test(void)
+{
+       GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e");
+}
+
+/*
+ * Returns true when we need to resched -- even if we can not.
+ */
+static __always_inline bool need_resched(void)
+{
+       return unlikely(test_preempt_need_resched());
+}
+
+/*
+ * Returns true when we need to resched and can (barring IRQ state).
+ */
+static __always_inline bool should_resched(void)
+{
+       return unlikely(!__this_cpu_read_4(__preempt_count));
+}
+
+#endif /* __ASM_PREEMPT_H */
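
To make the fold comment above concrete, here is a small stand-alone
user-space demo (illustration only, not kernel code; it assumes
PREEMPT_NEED_RESCHED is the inverted 0x80000000 bit as in this series)
showing that the decrement only reaches zero when no preempt-disable
levels remain and a reschedule is pending:

    /* fold-demo.c -- illustration only */
    #include <stdio.h>
    #include <stdbool.h>

    #define PREEMPT_NEED_RESCHED 0x80000000u  /* inverted: bit set == no resched needed */

    static unsigned int count = PREEMPT_NEED_RESCHED;  /* preempt enabled, nothing pending */

    static void preempt_disable_demo(void)  { count += 1; }
    static void set_need_resched_demo(void) { count &= ~PREEMPT_NEED_RESCHED; }

    /* models the single decl + test-for-zero that GEN_UNARY_RMWcc() emits */
    static bool preempt_enable_dec_and_test(void) { return --count == 0; }

    int main(void)
    {
            preempt_disable_demo();             /* count = PREEMPT_NEED_RESCHED | 1 */
            set_need_resched_demo();            /* count = 1: resched wanted, not yet allowed */
            if (preempt_enable_dec_and_test())  /* hits 0: count gone AND resched pending */
                    printf("would call preempt_schedule() here\n");
            return 0;
    }
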
index 27811190cbd70e5787263e6f064069776064fe3f..c46a46be1ec699c0a92180c4f584da150fd7f5eb 100644
@@ -28,8 +28,7 @@ struct thread_info {
        __u32                   flags;          /* low level flags */
        __u32                   status;         /* thread synchronous flags */
        __u32                   cpu;            /* current CPU */
-       int                     preempt_count;  /* 0 => preemptable,
-                                                  <0 => BUG */
+       int                     saved_preempt_count;
        mm_segment_t            addr_limit;
        struct restart_block    restart_block;
        void __user             *sysenter_return;
@@ -49,7 +48,7 @@ struct thread_info {
        .exec_domain    = &default_exec_domain, \
        .flags          = 0,                    \
        .cpu            = 0,                    \
-       .preempt_count  = INIT_PREEMPT_COUNT,   \
+       .saved_preempt_count = INIT_PREEMPT_COUNT,      \
        .addr_limit     = KERNEL_DS,            \
        .restart_block = {                      \
                .fn = do_no_restart_syscall,    \
index 28610822fb3cc4be10a04c098ee91b6a8b8ab78f..9f6b9341950f7895b247b1c320cc0f9cd75cda9d 100644
@@ -32,7 +32,6 @@ void common(void) {
        OFFSET(TI_flags, thread_info, flags);
        OFFSET(TI_status, thread_info, status);
        OFFSET(TI_addr_limit, thread_info, addr_limit);
-       OFFSET(TI_preempt_count, thread_info, preempt_count);
 
        BLANK();
        OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
index 2793d1f095a2d865c23f14449944d322377f3dba..5223fe6dec7bbfe055989935b00ca258a1ba1f81 100644
@@ -1095,6 +1095,9 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) =
 
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
+
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 /*
@@ -1169,6 +1172,8 @@ void debug_stack_reset(void)
 
 DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task;
 EXPORT_PER_CPU_SYMBOL(current_task);
+DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
+EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
index f0dcb0ceb6a2eda24d298b8a05650374e753bcf6..fd1bc1b15e6d8ebfc0f9e98a025e2ed65601c7fa 100644
@@ -362,12 +362,9 @@ END(ret_from_exception)
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
        DISABLE_INTERRUPTS(CLBR_ANY)
-       cmpl $0,TI_preempt_count(%ebp)  # non-zero preempt_count ?
-       jnz restore_all
 need_resched:
-       movl TI_flags(%ebp), %ecx       # need_resched set ?
-       testb $_TIF_NEED_RESCHED, %cl
-       jz restore_all
+       cmpl $0,PER_CPU_VAR(__preempt_count)
+       jnz restore_all
        testl $X86_EFLAGS_IF,PT_EFLAGS(%esp)    # interrupts off (exception path) ?
        jz restore_all
        call preempt_schedule_irq
index 1b69951a81e2b61bff752621872d5750736f96d4..6a43e7d29fe77298652352d9e39c416706ed0e22 100644
@@ -1118,10 +1118,8 @@ retint_signal:
        /* Returning to kernel space. Check if we need preemption */
        /* rcx:  threadinfo. interrupts off. */
 ENTRY(retint_kernel)
-       cmpl $0,TI_preempt_count(%rcx)
+       cmpl $0,PER_CPU_VAR(__preempt_count)
        jnz  retint_restore_args
-       bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
-       jnc  retint_restore_args
        bt   $9,EFLAGS-ARGOFFSET(%rsp)  /* interrupts off? */
        jnc  retint_restore_args
        call preempt_schedule_irq
index 4186755f1d7cf350f82540fdb76f731f58b455c3..3fe066359ac08df2693928f66323ca3db3c2e925 100644
@@ -100,9 +100,6 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
        irqctx->tinfo.task = curctx->tinfo.task;
        irqctx->tinfo.previous_esp = current_stack_pointer;
 
-       /* Copy the preempt_count so that the [soft]irq checks work. */
-       irqctx->tinfo.preempt_count = curctx->tinfo.preempt_count;
-
        if (unlikely(overflow))
                call_on_stack(print_stack_overflow, isp);
 
@@ -131,7 +128,6 @@ void irq_ctx_init(int cpu)
                                               THREAD_SIZE_ORDER));
        memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
        irqctx->tinfo.cpu               = cpu;
-       irqctx->tinfo.preempt_count     = HARDIRQ_OFFSET;
        irqctx->tinfo.addr_limit        = MAKE_MM_SEG(0);
 
        per_cpu(hardirq_ctx, cpu) = irqctx;
index 884f98f69354c18393755a23bc89edd74f1051aa..c2ec1aa6d45467bddd5f4c09189e4160b4fb2476 100644
@@ -291,6 +291,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        if (get_kernel_rpl() && unlikely(prev->iopl != next->iopl))
                set_iopl_mask(next->iopl);
 
+       /*
+        * If it were not for PREEMPT_ACTIVE we could guarantee that the
+        * preempt_count of all tasks was equal here and this would not be
+        * needed.
+        */
+       task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+       this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
        /*
         * Now maybe handle debug registers and/or IO bitmaps
         */
index bb1dc51bab05649c4eb2e48ddbb23840c4a598a8..45ab4d6fc8a7af0409c56b0f8cd1170124b57858 100644
@@ -363,6 +363,14 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
        this_cpu_write(old_rsp, next->usersp);
        this_cpu_write(current_task, next_p);
 
+       /*
+        * If it were not for PREEMPT_ACTIVE we could guarantee that the
+        * preempt_count of all tasks was equal here and this would not be
+        * needed.
+        */
+       task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
+       this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+
        this_cpu_write(kernel_stack,
                  (unsigned long)task_stack_page(next_p) +
                  THREAD_SIZE - KERNEL_STACK_OFFSET);