/*
 * Read-Copy Update mechanism for mutual exclusion (tree-based version)
 * Internal non-public definitions that provide either classic
 * or preemptible semantics.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, you can access it online at
 * http://www.gnu.org/licenses/gpl-2.0.html.
 *
 * Copyright Red Hat, 2009
 * Copyright IBM Corporation, 2009
 *
 * Author: Ingo Molnar <mingo@elte.hu>
 *         Paul E. McKenney <paulmck@linux.vnet.ibm.com>
 */

#include <linux/delay.h>
#include <linux/gfp.h>
#include <linux/oom.h>
#include <linux/smpboot.h>
#include "../time/tick-internal.h"

#ifdef CONFIG_RCU_BOOST

#include "../locking/rtmutex_common.h"

/*
 * Control variables for per-CPU and per-rcu_node kthreads.  These
 * handle all flavors of RCU.
 */
static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
DEFINE_PER_CPU(char, rcu_cpu_has_work);

#endif /* #ifdef CONFIG_RCU_BOOST */

#ifdef CONFIG_RCU_NOCB_CPU
static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
static bool have_rcu_nocb_mask;     /* Was rcu_nocb_mask allocated? */
static bool __read_mostly rcu_nocb_poll;    /* Offload kthreads are to poll. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */

/*
 * Check the RCU kernel configuration parameters and print informative
 * messages about anything out of the ordinary.  If you like #ifdef, you
 * will love this function.
 */
static void __init rcu_bootup_announce_oddness(void)
{
#ifdef CONFIG_RCU_TRACE
        pr_info("\tRCU debugfs-based tracing is enabled.\n");
#endif
#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
        pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
               CONFIG_RCU_FANOUT);
#endif
#ifdef CONFIG_RCU_FANOUT_EXACT
        pr_info("\tHierarchical RCU autobalancing is disabled.\n");
#endif
#ifdef CONFIG_RCU_FAST_NO_HZ
        pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
#endif
#ifdef CONFIG_PROVE_RCU
        pr_info("\tRCU lockdep checking is enabled.\n");
#endif
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
        pr_info("\tRCU torture testing starts during boot.\n");
#endif
#if defined(CONFIG_RCU_CPU_STALL_INFO)
        pr_info("\tAdditional per-CPU info printed with stalls.\n");
#endif
#if NUM_RCU_LVL_4 != 0
        pr_info("\tFour-level hierarchy is enabled.\n");
#endif
        if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
                pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
        if (nr_cpu_ids != NR_CPUS)
                pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
#ifdef CONFIG_RCU_BOOST
        pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
#endif
}

#ifdef CONFIG_PREEMPT_RCU

RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu);
static struct rcu_state *rcu_state_p = &rcu_preempt_state;

static int rcu_preempted_readers_exp(struct rcu_node *rnp);
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
                               bool wake);

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
        pr_info("Preemptible hierarchical RCU implementation.\n");
        rcu_bootup_announce_oddness();
}

/*
 * Record a preemptible-RCU quiescent state for the specified CPU.  Note
 * that this just means that the task currently running on the CPU is
 * in a quiescent state.  There might be any number of tasks blocked
 * while in an RCU read-side critical section.
 *
 * As with the other rcu_*_qs() functions, callers to this function
 * must disable preemption.
 */
static void rcu_preempt_qs(void)
{
        if (!__this_cpu_read(rcu_preempt_data.passed_quiesce)) {
                trace_rcu_grace_period(TPS("rcu_preempt"),
                                       __this_cpu_read(rcu_preempt_data.gpnum),
                                       TPS("cpuqs"));
                __this_cpu_write(rcu_preempt_data.passed_quiesce, 1);
                barrier(); /* Coordinate with rcu_preempt_check_callbacks(). */
                current->rcu_read_unlock_special.b.need_qs = false;
        }
}

/*
 * We have entered the scheduler, and the current task might soon be
 * context-switched away from.  If this task is in an RCU read-side
 * critical section, we will no longer be able to rely on the CPU to
 * record that fact, so we enqueue the task on the blkd_tasks list.
 * The task will dequeue itself when it exits the outermost enclosing
 * RCU read-side critical section.  Therefore, the current grace period
 * cannot be permitted to complete until the blkd_tasks list entries
 * predating the current grace period drain, in other words, until
 * rnp->gp_tasks becomes NULL.
 *
 * Caller must disable preemption.
 */
static void rcu_preempt_note_context_switch(void)
{
        struct task_struct *t = current;
        unsigned long flags;
        struct rcu_data *rdp;
        struct rcu_node *rnp;

        if (t->rcu_read_lock_nesting > 0 &&
            !t->rcu_read_unlock_special.b.blocked) {

                /* Possibly blocking in an RCU read-side critical section. */
                rdp = this_cpu_ptr(rcu_preempt_state.rda);
                rnp = rdp->mynode;
                raw_spin_lock_irqsave(&rnp->lock, flags);
                smp_mb__after_unlock_lock();
                t->rcu_read_unlock_special.b.blocked = true;
                t->rcu_blocked_node = rnp;

                /*
                 * If this CPU has already checked in, then this task
                 * will hold up the next grace period rather than the
                 * current grace period.  Queue the task accordingly.
                 * If the task is queued for the current grace period
                 * (i.e., this CPU has not yet passed through a quiescent
                 * state for the current grace period), then as long
                 * as that task remains queued, the current grace period
                 * cannot end.  Note that there is some uncertainty as
                 * to exactly when the current grace period started.
                 * We take a conservative approach, which can result
                 * in unnecessarily waiting on tasks that started very
                 * slightly after the current grace period began.  C'est
                 * la vie!!!
                 *
                 * But first, note that the current CPU must still be
                 * on line!
                 */
                WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
                WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
                if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
                        list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
                        rnp->gp_tasks = &t->rcu_node_entry;
#ifdef CONFIG_RCU_BOOST
                        if (rnp->boost_tasks != NULL)
                                rnp->boost_tasks = rnp->gp_tasks;
#endif /* #ifdef CONFIG_RCU_BOOST */
                } else {
                        list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
                        if (rnp->qsmask & rdp->grpmask)
                                rnp->gp_tasks = &t->rcu_node_entry;
                }
                trace_rcu_preempt_task(rdp->rsp->name,
                                       t->pid,
                                       (rnp->qsmask & rdp->grpmask)
                                       ? rnp->gpnum
                                       : rnp->gpnum + 1);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        } else if (t->rcu_read_lock_nesting < 0 &&
                   t->rcu_read_unlock_special.s) {

                /*
                 * Complete exit from RCU read-side critical section on
                 * behalf of preempted instance of __rcu_read_unlock().
                 */
                rcu_read_unlock_special(t);
        }

        /*
         * Either we were not in an RCU read-side critical section to
         * begin with, or we have now recorded that critical section
         * globally.  Either way, we can now note a quiescent state
         * for this CPU.  Again, if we were in an RCU read-side critical
         * section, and if that critical section was blocking the current
         * grace period, then the fact that the task has been enqueued
         * means that we continue to block the current grace period.
         */
        rcu_preempt_qs();
}
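
/*
 * Illustrative sketch (editorial addition, not from the original source):
 * with the queueing rules above, a leaf rcu_node's ->blkd_tasks list
 * segregates tasks blocking the current grace period from those blocking
 * only the next one, for example:
 *
 *	blkd_tasks -> [T3] -> [T2] -> [T1] -> [T0]
 *	                              ^gp_tasks
 *
 * Here T1 and T0 block the current grace period (->gp_tasks points at the
 * first of them), while T3 and T2 were queued after this CPU checked in
 * and therefore hold up only the next grace period.
 */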

/*
 * Check for preempted RCU readers blocking the current grace period
 * for the specified rcu_node structure.  If the caller needs a reliable
 * answer, it must hold the rcu_node's ->lock.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
        return rnp->gp_tasks != NULL;
}

/*
 * Record a quiescent state for all tasks that were previously queued
 * on the specified rcu_node structure and that were blocking the current
 * RCU grace period.  The caller must hold the specified rnp->lock with
 * irqs disabled, and this lock is released upon return, but irqs remain
 * disabled.
 */
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
        __releases(rnp->lock)
{
        unsigned long mask;
        struct rcu_node *rnp_p;

        if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;  /* Still need more quiescent states! */
        }

        rnp_p = rnp->parent;
        if (rnp_p == NULL) {
                /*
                 * Either there is only one rcu_node in the tree,
                 * or tasks were kicked up to root rcu_node due to
                 * CPUs going offline.
                 */
                rcu_report_qs_rsp(&rcu_preempt_state, flags);
                return;
        }

        /* Report up the rest of the hierarchy. */
        mask = rnp->grpmask;
        raw_spin_unlock(&rnp->lock);    /* irqs remain disabled. */
        raw_spin_lock(&rnp_p->lock);    /* irqs already disabled. */
        smp_mb__after_unlock_lock();
        rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
}

/*
 * Advance a ->blkd_tasks-list pointer to the next entry, returning
 * NULL instead if we are at the end of the list.
 */
static struct list_head *rcu_next_node_entry(struct task_struct *t,
                                             struct rcu_node *rnp)
{
        struct list_head *np;

        np = t->rcu_node_entry.next;
        if (np == &rnp->blkd_tasks)
                np = NULL;
        return np;
}

/*
 * Return true if the specified rcu_node structure has tasks that were
 * preempted within an RCU read-side critical section.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
        return !list_empty(&rnp->blkd_tasks);
}

/*
 * Handle special cases during rcu_read_unlock(), such as needing to
 * notify RCU core processing or the task having blocked during the RCU
 * read-side critical section.
 */
void rcu_read_unlock_special(struct task_struct *t)
{
        bool empty;
        bool empty_exp;
        bool empty_norm;
        bool empty_exp_now;
        unsigned long flags;
        struct list_head *np;
#ifdef CONFIG_RCU_BOOST
        bool drop_boost_mutex = false;
#endif /* #ifdef CONFIG_RCU_BOOST */
        struct rcu_node *rnp;
        union rcu_special special;

        /* NMI handlers cannot block and cannot safely manipulate state. */
        if (in_nmi())
                return;

        local_irq_save(flags);

        /*
         * If RCU core is waiting for this CPU to exit critical section,
         * let it know that we have done so.  Because irqs are disabled,
         * t->rcu_read_unlock_special cannot change.
         */
        special = t->rcu_read_unlock_special;
        if (special.b.need_qs) {
                rcu_preempt_qs();
                if (!t->rcu_read_unlock_special.s) {
                        local_irq_restore(flags);
                        return;
                }
        }

        /* Hardware IRQ handlers cannot block, complain if they get here. */
        if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
                local_irq_restore(flags);
                return;
        }

        /* Clean up if blocked during RCU read-side critical section. */
        if (special.b.blocked) {
                t->rcu_read_unlock_special.b.blocked = false;

                /*
                 * Remove this task from the list it blocked on.  The
                 * task can migrate while we acquire the lock, but at
                 * most one time.  So at most two passes through loop.
                 */
                for (;;) {
                        rnp = t->rcu_blocked_node;
                        raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
                        smp_mb__after_unlock_lock();
                        if (rnp == t->rcu_blocked_node)
                                break;
                        raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
                }
                empty = !rcu_preempt_has_tasks(rnp);
                empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
                empty_exp = !rcu_preempted_readers_exp(rnp);
                smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
                np = rcu_next_node_entry(t, rnp);
                list_del_init(&t->rcu_node_entry);
                t->rcu_blocked_node = NULL;
                trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
                                                rnp->gpnum, t->pid);
                if (&t->rcu_node_entry == rnp->gp_tasks)
                        rnp->gp_tasks = np;
                if (&t->rcu_node_entry == rnp->exp_tasks)
                        rnp->exp_tasks = np;
#ifdef CONFIG_RCU_BOOST
                if (&t->rcu_node_entry == rnp->boost_tasks)
                        rnp->boost_tasks = np;
                /* Snapshot ->boost_mtx ownership with rcu_node lock held. */
                drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
#endif /* #ifdef CONFIG_RCU_BOOST */

                /*
                 * If this was the last task on the list, go see if we
                 * need to propagate ->qsmaskinit bit clearing up the
                 * rcu_node tree.
                 */
                if (!empty && !rcu_preempt_has_tasks(rnp))
                        rcu_cleanup_dead_rnp(rnp);

                /*
                 * If this was the last task on the current list, and if
                 * we aren't waiting on any CPUs, report the quiescent state.
                 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
                 * so we must take a snapshot of the expedited state.
                 */
                empty_exp_now = !rcu_preempted_readers_exp(rnp);
                if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
                        trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
                                                         rnp->gpnum,
                                                         0, rnp->qsmask,
                                                         rnp->level,
                                                         rnp->grplo,
                                                         rnp->grphi,
                                                         !!rnp->gp_tasks);
                        rcu_report_unblock_qs_rnp(rnp, flags);
                } else {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                }

#ifdef CONFIG_RCU_BOOST
                /* Unboost if we were boosted. */
                if (drop_boost_mutex)
                        rt_mutex_unlock(&rnp->boost_mtx);
#endif /* #ifdef CONFIG_RCU_BOOST */

                /*
                 * If this was the last task on the expedited lists,
                 * then we need to report up the rcu_node hierarchy.
                 */
                if (!empty_exp && empty_exp_now)
                        rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
        } else {
                local_irq_restore(flags);
        }
}

/*
 * Dump detailed information for all tasks blocking the current RCU
 * grace period on the specified rcu_node structure.
 */
static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
{
        unsigned long flags;
        struct task_struct *t;

        raw_spin_lock_irqsave(&rnp->lock, flags);
        if (!rcu_preempt_blocked_readers_cgp(rnp)) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
        t = list_entry(rnp->gp_tasks,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
                sched_show_task(t);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

/*
 * Dump detailed information for all tasks blocking the current RCU
 * grace period.
 */
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
        struct rcu_node *rnp = rcu_get_root(rsp);

        rcu_print_detail_task_stall_rnp(rnp);
        rcu_for_each_leaf_node(rsp, rnp)
                rcu_print_detail_task_stall_rnp(rnp);
}

#ifdef CONFIG_RCU_CPU_STALL_INFO

static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
        pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
               rnp->level, rnp->grplo, rnp->grphi);
}

static void rcu_print_task_stall_end(void)
{
        pr_cont("\n");
}

#else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */

static void rcu_print_task_stall_begin(struct rcu_node *rnp)
{
}

static void rcu_print_task_stall_end(void)
{
}

#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */

/*
 * Scan the current list of tasks blocked within RCU read-side critical
 * sections, printing out the tid of each.
 */
static int rcu_print_task_stall(struct rcu_node *rnp)
{
        struct task_struct *t;
        int ndetected = 0;

        if (!rcu_preempt_blocked_readers_cgp(rnp))
                return 0;
        rcu_print_task_stall_begin(rnp);
        t = list_entry(rnp->gp_tasks,
                       struct task_struct, rcu_node_entry);
        list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
                pr_cont(" P%d", t->pid);
                ndetected++;
        }
        rcu_print_task_stall_end();
        return ndetected;
}

/*
 * Check that the list of blocked tasks for the newly completed grace
 * period is in fact empty.  It is a serious bug to complete a grace
 * period that still has RCU readers blocked!  This function must be
 * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
 * must be held by the caller.
 *
 * Also, if there are blocked tasks on the list, they automatically
 * block the newly created grace period, so set up ->gp_tasks accordingly.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
        WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
        if (rcu_preempt_has_tasks(rnp))
                rnp->gp_tasks = rnp->blkd_tasks.next;
        WARN_ON_ONCE(rnp->qsmask);
}

#ifdef CONFIG_HOTPLUG_CPU

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Check for a quiescent state from the current CPU.  When a task blocks,
 * the task is recorded in the corresponding CPU's rcu_node structure,
 * which is checked elsewhere.
 *
 * Caller must disable hard irqs.
 */
static void rcu_preempt_check_callbacks(void)
{
        struct task_struct *t = current;

        if (t->rcu_read_lock_nesting == 0) {
                rcu_preempt_qs();
                return;
        }
        if (t->rcu_read_lock_nesting > 0 &&
            __this_cpu_read(rcu_preempt_data.qs_pending) &&
            !__this_cpu_read(rcu_preempt_data.passed_quiesce))
                t->rcu_read_unlock_special.b.need_qs = true;
}

#ifdef CONFIG_RCU_BOOST

static void rcu_preempt_do_callbacks(void)
{
        rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data));
}

#endif /* #ifdef CONFIG_RCU_BOOST */

/*
 * Queue a preemptible-RCU callback for invocation after a grace period.
 */
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
        __call_rcu(head, func, &rcu_preempt_state, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
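
/*
 * Illustrative usage sketch (editorial addition; "struct foo" and
 * foo_reclaim() are hypothetical, not part of this file).  A typical
 * caller embeds an rcu_head in its structure and frees it from the
 * callback once a grace period has elapsed:
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	static void foo_reclaim(struct rcu_head *rcu)
 *	{
 *		kfree(container_of(rcu, struct foo, rcu));
 *	}
 *
 *	After unlinking fp from all RCU-protected structures:
 *
 *	call_rcu(&fp->rcu, foo_reclaim);
 */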

/**
 * synchronize_rcu - wait until a grace period has elapsed.
 *
 * Control will return to the caller some time after a full grace
 * period has elapsed, in other words after all currently executing RCU
 * read-side critical sections have completed.  Note, however, that
 * upon return from synchronize_rcu(), the caller might well be executing
 * concurrently with new RCU read-side critical sections that began while
 * synchronize_rcu() was waiting.  RCU read-side critical sections are
 * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested.
 *
 * See the description of synchronize_sched() for more detailed information
 * on memory ordering guarantees.
 */
void synchronize_rcu(void)
{
        rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) &&
                           !lock_is_held(&rcu_lock_map) &&
                           !lock_is_held(&rcu_sched_lock_map),
                           "Illegal synchronize_rcu() in RCU read-side critical section");
        if (!rcu_scheduler_active)
                return;
        if (rcu_expedited)
                synchronize_rcu_expedited();
        else
                wait_rcu_gp(call_rcu);
}
EXPORT_SYMBOL_GPL(synchronize_rcu);
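
/*
 * Illustrative usage sketch (editorial addition; gp, gp_lock, new, and
 * old are hypothetical).  The classic remove-wait-reclaim updater
 * pattern built on synchronize_rcu():
 *
 *	old = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
 *	rcu_assign_pointer(gp, new);
 *	synchronize_rcu();	* Wait out pre-existing readers of old. *
 *	kfree(old);		* Safe: no reader can still see old. *
 */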

static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq);
static unsigned long sync_rcu_preempt_exp_count;
static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);

/*
 * Return non-zero if there are any tasks in RCU read-side critical
 * sections blocking the current preemptible-RCU expedited grace period.
 * If there is no preemptible-RCU expedited grace period currently in
 * progress, returns zero unconditionally.
 */
static int rcu_preempted_readers_exp(struct rcu_node *rnp)
{
        return rnp->exp_tasks != NULL;
}

/*
 * Return non-zero if there is no RCU expedited grace period in progress
 * for the specified rcu_node structure, in other words, if all CPUs and
 * tasks covered by the specified rcu_node structure have done their bit
 * for the current expedited grace period.  Works only for preemptible
 * RCU -- other RCU implementations use other means.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex.
 */
static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
{
        return !rcu_preempted_readers_exp(rnp) &&
               ACCESS_ONCE(rnp->expmask) == 0;
}

/*
 * Report the exit from RCU read-side critical section for the last task
 * that queued itself during or before the current expedited preemptible-RCU
 * grace period.  This event is reported either to the rcu_node structure on
 * which the task was queued or to one of that rcu_node structure's ancestors,
 * recursively up the tree.  (Calm down, calm down, we do the recursion
 * iteratively!)
 *
 * Most callers will set the "wake" flag, but the task initiating the
 * expedited grace period need not wake itself.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex.
 */
static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
                               bool wake)
{
        unsigned long flags;
        unsigned long mask;

        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
        for (;;) {
                if (!sync_rcu_preempt_exp_done(rnp)) {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                        break;
                }
                if (rnp->parent == NULL) {
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);
                        if (wake) {
                                smp_mb(); /* EGP done before wake_up(). */
                                wake_up(&sync_rcu_preempt_exp_wq);
                        }
                        break;
                }
                mask = rnp->grpmask;
                raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
                rnp = rnp->parent;
                raw_spin_lock(&rnp->lock); /* irqs already disabled */
                smp_mb__after_unlock_lock();
                rnp->expmask &= ~mask;
        }
}

/*
 * Snapshot the tasks blocking the newly started preemptible-RCU expedited
 * grace period for the specified rcu_node structure.  If there are no such
 * tasks, report it up the rcu_node hierarchy.
 *
 * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
 * CPU hotplug operations.
 */
static void
sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
{
        unsigned long flags;
        int must_wait = 0;

        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
        if (!rcu_preempt_has_tasks(rnp)) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        } else {
                rnp->exp_tasks = rnp->blkd_tasks.next;
                rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
                must_wait = 1;
        }
        if (!must_wait)
                rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
}

/**
 * synchronize_rcu_expedited - Brute-force RCU grace period
 *
 * Wait for an RCU-preempt grace period, but expedite it.  The basic
 * idea is to invoke synchronize_sched_expedited() to push all the tasks to
 * the ->blkd_tasks lists and wait for these lists to drain.  This consumes
 * significant time on all CPUs and is unfriendly to real-time workloads,
 * so is thus not recommended for any sort of common-case code.
 * In fact, if you are using synchronize_rcu_expedited() in a loop,
 * please restructure your code to batch your updates, and then use a
 * single synchronize_rcu() instead.
 */
void synchronize_rcu_expedited(void)
{
        unsigned long flags;
        struct rcu_node *rnp;
        struct rcu_state *rsp = &rcu_preempt_state;
        unsigned long snap;
        int trycount = 0;

        smp_mb(); /* Caller's modifications seen first by other CPUs. */
        snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1;
        smp_mb(); /* Above access cannot bleed into critical section. */

        /*
         * Block CPU-hotplug operations.  This means that any CPU-hotplug
         * operation that finds an rcu_node structure with tasks in the
         * process of being boosted will know that all tasks blocking
         * this expedited grace period will already be in the process of
         * being boosted.  This simplifies the process of moving tasks
         * from leaf to root rcu_node structures.
         */
        if (!try_get_online_cpus()) {
                /* CPU-hotplug operation in flight, fall back to normal GP. */
                wait_rcu_gp(call_rcu);
                return;
        }

        /*
         * Acquire lock, falling back to synchronize_rcu() if too many
         * lock-acquisition failures.  Of course, if someone does the
         * expedited grace period for us, just leave.
         */
        while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) {
                if (ULONG_CMP_LT(snap,
                    ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
                        put_online_cpus();
                        goto mb_ret; /* Others did our work for us. */
                }
                if (trycount++ < 10) {
                        udelay(trycount * num_online_cpus());
                } else {
                        put_online_cpus();
                        wait_rcu_gp(call_rcu);
                        return;
                }
        }
        if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) {
                put_online_cpus();
                goto unlock_mb_ret; /* Others did our work for us. */
        }

        /* Force all RCU readers onto ->blkd_tasks lists. */
        synchronize_sched_expedited();

        /* Initialize ->expmask for all non-leaf rcu_node structures. */
        rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
                raw_spin_lock_irqsave(&rnp->lock, flags);
                smp_mb__after_unlock_lock();
                rnp->expmask = rnp->qsmaskinit;
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }

        /* Snapshot current state of ->blkd_tasks lists. */
        rcu_for_each_leaf_node(rsp, rnp)
                sync_rcu_preempt_exp_init(rsp, rnp);
        if (NUM_RCU_NODES > 1)
                sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));

        put_online_cpus();

        /* Wait for snapshotted ->blkd_tasks lists to drain. */
        rnp = rcu_get_root(rsp);
        wait_event(sync_rcu_preempt_exp_wq,
                   sync_rcu_preempt_exp_done(rnp));

        /* Clean up and exit. */
        smp_mb(); /* ensure expedited GP seen before counter increment. */
        ACCESS_ONCE(sync_rcu_preempt_exp_count) =
                                        sync_rcu_preempt_exp_count + 1;
unlock_mb_ret:
        mutex_unlock(&sync_rcu_preempt_exp_mutex);
mb_ret:
        smp_mb(); /* ensure subsequent action seen after grace period. */
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
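
/*
 * Worked example of the counter logic above (editorial addition).  If
 * sync_rcu_preempt_exp_count is 5 on entry, snap becomes 6.  The first
 * post-entry increment (to 6) might mark completion of an expedited
 * grace period that was already in progress at entry, which would not
 * cover this caller's pre-entry updates.  Only when the counter exceeds
 * snap (reaches 7) has a full expedited grace period both started and
 * ended after our entry, so the "others did our work" shortcuts fire
 * only then.
 */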

/**
 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
 *
 * Note that this primitive does not necessarily wait for an RCU grace period
 * to complete.  For example, if there are no RCU callbacks queued anywhere
 * in the system, then rcu_barrier() is within its rights to return
 * immediately, without waiting for anything, much less an RCU grace period.
 */
void rcu_barrier(void)
{
        _rcu_barrier(&rcu_preempt_state);
}
EXPORT_SYMBOL_GPL(rcu_barrier);

/*
 * Initialize preemptible RCU's state structures.
 */
static void __init __rcu_init_preempt(void)
{
        rcu_init_one(&rcu_preempt_state, &rcu_preempt_data);
}

/*
 * Check for a task exiting while in a preemptible-RCU read-side
 * critical section, clean up if so.  No need to issue warnings,
 * as debug_check_no_locks_held() already does this if lockdep
 * is enabled.
 */
void exit_rcu(void)
{
        struct task_struct *t = current;

        if (likely(list_empty(&current->rcu_node_entry)))
                return;
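        /*
         * Editorial note: the lines below fake a just-preempted outermost
         * read-side critical section (nesting count of 1 with ->blocked
         * set) so that __rcu_read_unlock() runs its full exit path and
         * dequeues the exiting task from its rcu_node's ->blkd_tasks list.
         */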
        t->rcu_read_lock_nesting = 1;
        barrier();
        t->rcu_read_unlock_special.b.blocked = true;
        __rcu_read_unlock();
}

#else /* #ifdef CONFIG_PREEMPT_RCU */

static struct rcu_state *rcu_state_p = &rcu_sched_state;

/*
 * Tell them what RCU they are running.
 */
static void __init rcu_bootup_announce(void)
{
        pr_info("Hierarchical RCU implementation.\n");
        rcu_bootup_announce_oddness();
}

/*
 * Because preemptible RCU does not exist, we never have to check for
 * CPUs being in quiescent states.
 */
static void rcu_preempt_note_context_switch(void)
{
}

/*
 * Because preemptible RCU does not exist, there are never any preempted
 * RCU readers.
 */
static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
        return 0;
}

#ifdef CONFIG_HOTPLUG_CPU

/*
 * Because there is no preemptible RCU, there can be no readers blocked.
 */
static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
{
        return false;
}

#endif /* #ifdef CONFIG_HOTPLUG_CPU */

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
{
}

/*
 * Because preemptible RCU does not exist, we never have to check for
 * tasks blocked within RCU read-side critical sections.
 */
static int rcu_print_task_stall(struct rcu_node *rnp)
{
        return 0;
}

/*
 * Because there is no preemptible RCU, there can be no readers blocked,
 * so there is no need to check for blocked tasks.  So check only for
 * bogus qsmask values.
 */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
        WARN_ON_ONCE(rnp->qsmask);
}

/*
 * Because preemptible RCU does not exist, it never has any callbacks
 * to check.
 */
static void rcu_preempt_check_callbacks(void)
{
}

/*
 * Wait for an rcu-preempt grace period, but make it happen quickly.
 * But because preemptible RCU does not exist, map to rcu-sched.
 */
void synchronize_rcu_expedited(void)
{
        synchronize_sched_expedited();
}
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);

/*
 * Because preemptible RCU does not exist, rcu_barrier() is just
 * another name for rcu_barrier_sched().
 */
void rcu_barrier(void)
{
        rcu_barrier_sched();
}
EXPORT_SYMBOL_GPL(rcu_barrier);

/*
 * Because preemptible RCU does not exist, it need not be initialized.
 */
static void __init __rcu_init_preempt(void)
{
}

/*
 * Because preemptible RCU does not exist, tasks cannot possibly exit
 * while in preemptible RCU read-side critical sections.
 */
void exit_rcu(void)
{
}

#endif /* #else #ifdef CONFIG_PREEMPT_RCU */

#ifdef CONFIG_RCU_BOOST

#include "../locking/rtmutex_common.h"

#ifdef CONFIG_RCU_TRACE

static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
        if (!rcu_preempt_has_tasks(rnp))
                rnp->n_balk_blkd_tasks++;
        else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
                rnp->n_balk_exp_gp_tasks++;
        else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
                rnp->n_balk_boost_tasks++;
        else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
                rnp->n_balk_notblocked++;
        else if (rnp->gp_tasks != NULL &&
                 ULONG_CMP_LT(jiffies, rnp->boost_time))
                rnp->n_balk_notyet++;
        else
                rnp->n_balk_nos++;
}

#else /* #ifdef CONFIG_RCU_TRACE */

static void rcu_initiate_boost_trace(struct rcu_node *rnp)
{
}

#endif /* #else #ifdef CONFIG_RCU_TRACE */

static void rcu_wake_cond(struct task_struct *t, int status)
{
        /*
         * If the thread is yielding, only wake it when this
         * is invoked from idle.
         */
        if (status != RCU_KTHREAD_YIELDING || is_idle_task(current))
                wake_up_process(t);
}

/*
 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
 * or ->boost_tasks, advancing the pointer to the next task in the
 * ->blkd_tasks list.
 *
 * Note that irqs must be enabled: boosting the task can block.
 * Returns 1 if there are more tasks needing to be boosted.
 */
static int rcu_boost(struct rcu_node *rnp)
{
        unsigned long flags;
        struct task_struct *t;
        struct list_head *tb;

        if (ACCESS_ONCE(rnp->exp_tasks) == NULL &&
            ACCESS_ONCE(rnp->boost_tasks) == NULL)
                return 0;  /* Nothing left to boost. */

        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();

        /*
         * Recheck under the lock: all tasks in need of boosting
         * might exit their RCU read-side critical sections on their own.
         */
        if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return 0;
        }

        /*
         * Preferentially boost tasks blocking expedited grace periods.
         * This cannot starve the normal grace periods because a second
         * expedited grace period must boost all blocked tasks, including
         * those blocking the pre-existing normal grace period.
         */
        if (rnp->exp_tasks != NULL) {
                tb = rnp->exp_tasks;
                rnp->n_exp_boosts++;
        } else {
                tb = rnp->boost_tasks;
                rnp->n_normal_boosts++;
        }
        rnp->n_tasks_boosted++;

        /*
         * We boost task t by manufacturing an rt_mutex that appears to
         * be held by task t.  We leave a pointer to that rt_mutex where
         * task t can find it, and task t will release the mutex when it
         * exits its outermost RCU read-side critical section.  Then
         * simply acquiring this artificial rt_mutex will boost task
         * t's priority.  (Thanks to tglx for suggesting this approach!)
         *
         * Note that task t must acquire rnp->lock to remove itself from
         * the ->blkd_tasks list, which it will do from exit() if from
         * nowhere else.  We therefore are guaranteed that task t will
         * stay around at least until we drop rnp->lock.  Note that
         * rnp->lock also resolves races between our priority boosting
         * and task t's exiting its outermost RCU read-side critical
         * section.
         */
        t = container_of(tb, struct task_struct, rcu_node_entry);
        rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        /* Lock only for side effect: boosts task t's priority. */
        rt_mutex_lock(&rnp->boost_mtx);
        rt_mutex_unlock(&rnp->boost_mtx);  /* Then keep lockdep happy. */

        return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
               ACCESS_ONCE(rnp->boost_tasks) != NULL;
}

/*
 * Priority-boosting kthread.  One per leaf rcu_node and one for the
 * root rcu_node.
 */
static int rcu_boost_kthread(void *arg)
{
        struct rcu_node *rnp = (struct rcu_node *)arg;
        int spincnt = 0;
        int more2boost;

        trace_rcu_utilization(TPS("Start boost kthread@init"));
        for (;;) {
                rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
                trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
                rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
                trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
                rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
                more2boost = rcu_boost(rnp);
                if (more2boost)
                        spincnt++;
                else
                        spincnt = 0;
                if (spincnt > 10) {
                        rnp->boost_kthread_status = RCU_KTHREAD_YIELDING;
                        trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
                        schedule_timeout_interruptible(2);
                        trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
                        spincnt = 0;
                }
        }
        /* NOTREACHED */
        trace_rcu_utilization(TPS("End boost kthread@notreached"));
        return 0;
}

/*
 * Check to see if it is time to start boosting RCU readers that are
 * blocking the current grace period, and, if so, tell the per-rcu_node
 * kthread to start boosting them.  If there is an expedited grace
 * period in progress, it is always time to boost.
 *
 * The caller must hold rnp->lock, which this function releases.
 * The ->boost_kthread_task is immortal, so we don't need to worry
 * about it going away.
 */
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        __releases(rnp->lock)
{
        struct task_struct *t;

        if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
                rnp->n_balk_exp_gp_tasks++;
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;
        }
        if (rnp->exp_tasks != NULL ||
            (rnp->gp_tasks != NULL &&
             rnp->boost_tasks == NULL &&
             rnp->qsmask == 0 &&
             ULONG_CMP_GE(jiffies, rnp->boost_time))) {
                if (rnp->exp_tasks == NULL)
                        rnp->boost_tasks = rnp->gp_tasks;
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                t = rnp->boost_kthread_task;
                if (t)
                        rcu_wake_cond(t, rnp->boost_kthread_status);
        } else {
                rcu_initiate_boost_trace(rnp);
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
        }
}

/*
 * Wake up the per-CPU kthread to invoke RCU callbacks.
 */
static void invoke_rcu_callbacks_kthread(void)
{
        unsigned long flags;

        local_irq_save(flags);
        __this_cpu_write(rcu_cpu_has_work, 1);
        if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
            current != __this_cpu_read(rcu_cpu_kthread_task)) {
                rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task),
                              __this_cpu_read(rcu_cpu_kthread_status));
        }
        local_irq_restore(flags);
}

/*
 * Is the current CPU running the RCU-callbacks kthread?
 * Caller must have preemption disabled.
 */
static bool rcu_is_callbacks_kthread(void)
{
        return __this_cpu_read(rcu_cpu_kthread_task) == current;
}

#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
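
/*
 * Worked example (editorial addition): CONFIG_RCU_BOOST_DELAY is in
 * milliseconds, so with CONFIG_RCU_BOOST_DELAY=500 and HZ=250 this is
 * DIV_ROUND_UP(500 * 250, 1000) = 125 jiffies, i.e., 500 ms.  The
 * round-up keeps the delay from truncating to zero jiffies for small
 * delay values at low HZ.
 */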

/*
 * Do priority-boost accounting for the start of a new grace period.
 */
static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
        rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
}

/*
 * Create an RCU-boost kthread for the specified node if one does not
 * already exist.  We only create this kthread for preemptible RCU.
 * Returns zero if all is well, a negated errno otherwise.
 */
static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
                                                 struct rcu_node *rnp)
{
        int rnp_index = rnp - &rsp->node[0];
        unsigned long flags;
        struct sched_param sp;
        struct task_struct *t;

        if (&rcu_preempt_state != rsp)
                return 0;

        if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
                return 0;

        rsp->boost = 1;
        if (rnp->boost_kthread_task != NULL)
                return 0;
        t = kthread_create(rcu_boost_kthread, (void *)rnp,
                           "rcub/%d", rnp_index);
        if (IS_ERR(t))
                return PTR_ERR(t);
        raw_spin_lock_irqsave(&rnp->lock, flags);
        smp_mb__after_unlock_lock();
        rnp->boost_kthread_task = t;
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
        sp.sched_priority = kthread_prio;
        sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
        wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
        return 0;
}

static void rcu_kthread_do_work(void)
{
        rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data));
        rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data));
        rcu_preempt_do_callbacks();
}

static void rcu_cpu_kthread_setup(unsigned int cpu)
{
        struct sched_param sp;

        sp.sched_priority = kthread_prio;
        sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
}

static void rcu_cpu_kthread_park(unsigned int cpu)
{
        per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
}

static int rcu_cpu_kthread_should_run(unsigned int cpu)
{
        return __this_cpu_read(rcu_cpu_has_work);
}

/*
 * Per-CPU kernel thread that invokes RCU callbacks.  This replaces the
 * RCU softirq used in flavors and configurations of RCU that do not
 * support RCU priority boosting.
 */
static void rcu_cpu_kthread(unsigned int cpu)
{
        unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status);
        char work, *workp = this_cpu_ptr(&rcu_cpu_has_work);
        int spincnt;

        for (spincnt = 0; spincnt < 10; spincnt++) {
                trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait"));
                local_bh_disable();
                *statusp = RCU_KTHREAD_RUNNING;
                this_cpu_inc(rcu_cpu_kthread_loops);
                local_irq_disable();
                work = *workp;
                *workp = 0;
                local_irq_enable();
                if (work)
                        rcu_kthread_do_work();
                local_bh_enable();
                if (*workp == 0) {
                        trace_rcu_utilization(TPS("End CPU kthread@rcu_wait"));
                        *statusp = RCU_KTHREAD_WAITING;
                        return;
                }
        }
        *statusp = RCU_KTHREAD_YIELDING;
        trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield"));
        schedule_timeout_interruptible(2);
        trace_rcu_utilization(TPS("End CPU kthread@rcu_yield"));
        *statusp = RCU_KTHREAD_WAITING;
}

/*
 * Set the per-rcu_node kthread's affinity to cover all CPUs that are
 * served by the rcu_node in question.  The CPU hotplug lock is still
 * held, so the value of rnp->qsmaskinit will be stable.
 *
 * We don't include outgoingcpu in the affinity set; use -1 if there is
 * no outgoing CPU.  If there are no CPUs left in the affinity set,
 * this function allows the kthread to execute on any CPU.
 */
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
        struct task_struct *t = rnp->boost_kthread_task;
        unsigned long mask = rnp->qsmaskinit;
        cpumask_var_t cm;
        int cpu;

        if (!t)
                return;
        if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
                return;
        for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
                if ((mask & 0x1) && cpu != outgoingcpu)
                        cpumask_set_cpu(cpu, cm);
        if (cpumask_weight(cm) == 0)
                cpumask_setall(cm);
        set_cpus_allowed_ptr(t, cm);
        free_cpumask_var(cm);
}
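
/*
 * Worked example (editorial addition): bit i of ->qsmaskinit corresponds
 * to CPU rnp->grplo + i.  For a leaf node with grplo=0, grphi=15, and
 * qsmaskinit=0x00ff, the loop above sets CPUs 0-7 in the mask; if CPU 3
 * is the outgoing CPU, the boost kthread ends up affined to CPUs 0-2
 * and 4-7.
 */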

static struct smp_hotplug_thread rcu_cpu_thread_spec = {
        .store                  = &rcu_cpu_kthread_task,
        .thread_should_run      = rcu_cpu_kthread_should_run,
        .thread_fn              = rcu_cpu_kthread,
        .thread_comm            = "rcuc/%u",
        .setup                  = rcu_cpu_kthread_setup,
        .park                   = rcu_cpu_kthread_park,
};
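
/*
 * Editorial note: registering this descriptor (see
 * rcu_spawn_boost_kthreads() below) creates one "rcuc/%u" kthread per
 * CPU.  The smpboot core calls ->setup when each thread first runs,
 * calls ->thread_fn whenever ->thread_should_run returns true, and
 * parks the thread (invoking ->park) when its CPU goes offline.
 */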

/*
 * Spawn boost kthreads -- called as soon as the scheduler is running.
 */
static void __init rcu_spawn_boost_kthreads(void)
{
        struct rcu_node *rnp;
        int cpu;

        for_each_possible_cpu(cpu)
                per_cpu(rcu_cpu_has_work, cpu) = 0;
        BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec));
        rcu_for_each_leaf_node(rcu_state_p, rnp)
                (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}

static void rcu_prepare_kthreads(int cpu)
{
        struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
        struct rcu_node *rnp = rdp->mynode;

        /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
        if (rcu_scheduler_fully_active)
                (void)rcu_spawn_one_boost_kthread(rcu_state_p, rnp);
}

#else /* #ifdef CONFIG_RCU_BOOST */

static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
        __releases(rnp->lock)
{
        raw_spin_unlock_irqrestore(&rnp->lock, flags);
}

static void invoke_rcu_callbacks_kthread(void)
{
        WARN_ON_ONCE(1);
}

static bool rcu_is_callbacks_kthread(void)
{
        return false;
}

static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
{
}

static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
{
}

static void __init rcu_spawn_boost_kthreads(void)
{
}

static void rcu_prepare_kthreads(int cpu)
{
}

#endif /* #else #ifdef CONFIG_RCU_BOOST */

#if !defined(CONFIG_RCU_FAST_NO_HZ)

/*
 * Check to see if any future RCU-related work will need to be done
 * by the current CPU, even if none need be done immediately, returning
 * 1 if so.  This function is part of the RCU implementation; it is -not-
 * an exported member of the RCU API.
 *
 * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs
 * any flavor of RCU.
 */
#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(unsigned long *delta_jiffies)
{
        *delta_jiffies = ULONG_MAX;
        return rcu_cpu_has_callbacks(NULL);
}
#endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */

/*
 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
 * after it.
 */
static void rcu_cleanup_after_idle(void)
{
}

/*
 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n,
 * is nothing.
 */
static void rcu_prepare_for_idle(void)
{
}

/*
 * Don't bother keeping a running count of the number of RCU callbacks
 * posted because CONFIG_RCU_FAST_NO_HZ=n.
 */
static void rcu_idle_count_callbacks_posted(void)
{
}

#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */

/*
 * This code is invoked when a CPU goes idle, at which point we want
 * to have the CPU do everything required for RCU so that it can enter
 * the energy-efficient dyntick-idle mode.  This is handled by a
 * state machine implemented by rcu_prepare_for_idle() below.
 *
 * The following two preprocessor symbols control this state machine:
 *
 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
 *      to sleep in dyntick-idle mode with RCU callbacks pending.  This
 *      is sized to be roughly one RCU grace period.  Those energy-efficiency
 *      benchmarkers who might otherwise be tempted to set this to a large
 *      number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
 *      system.  And if you are -that- concerned about energy efficiency,
 *      just power the system down and be done with it!
 * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is
 *      permitted to sleep in dyntick-idle mode with only lazy RCU
 *      callbacks pending.  Setting this too high can OOM your system.
 *
 * The values below work well in practice.  If future workloads require
 * adjustment, they can be converted into kernel config parameters, though
 * making the state machine smarter might be a better option.
 */
#define RCU_IDLE_GP_DELAY 4             /* Roughly one grace period. */
#define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */

static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY;
module_param(rcu_idle_gp_delay, int, 0644);
static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
module_param(rcu_idle_lazy_gp_delay, int, 0644);
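
/*
 * Editorial note: both delays are expressed in jiffies
 * (RCU_IDLE_LAZY_GP_DELAY is 6*HZ, roughly six seconds regardless of
 * HZ), and the module_param() declarations above make them adjustable
 * at boot time and, given mode 0644, writable at runtime through the
 * module-parameters sysfs interface.
 */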

extern int tick_nohz_active;

/*
 * Try to advance callbacks for all flavors of RCU on the current CPU, but
 * only if it has been a while since the last time we did so.  Afterwards,
 * if there are any callbacks ready for immediate invocation, return true.
 */
static bool __maybe_unused rcu_try_advance_all_cbs(void)
{
        bool cbs_ready = false;
        struct rcu_data *rdp;
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
        struct rcu_node *rnp;
        struct rcu_state *rsp;

        /* Exit early if we advanced recently. */
        if (jiffies == rdtp->last_advance_all)
                return false;
        rdtp->last_advance_all = jiffies;

        for_each_rcu_flavor(rsp) {
                rdp = this_cpu_ptr(rsp->rda);
                rnp = rdp->mynode;

                /*
                 * Don't bother checking unless a grace period has
                 * completed since we last checked and there are
                 * callbacks not yet ready to invoke.
                 */
                if ((rdp->completed != rnp->completed ||
                     unlikely(ACCESS_ONCE(rdp->gpwrap))) &&
                    rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL])
                        note_gp_changes(rsp, rdp);

                if (cpu_has_callbacks_ready_to_invoke(rdp))
                        cbs_ready = true;
        }
        return cbs_ready;
}

/*
 * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready
 * to invoke.  If the CPU has callbacks, try to advance them.  Tell the
 * caller to set the timeout based on whether or not there are non-lazy
 * callbacks.
 *
 * The caller must have disabled interrupts.
 */
#ifndef CONFIG_RCU_NOCB_CPU_ALL
int rcu_needs_cpu(unsigned long *dj)
{
        struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);

        /* Snapshot to detect later posting of non-lazy callback. */
        rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;

        /* If no callbacks, RCU doesn't need the CPU. */
        if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
                *dj = ULONG_MAX;
                return 0;
        }

        /* Attempt to advance callbacks. */
        if (rcu_try_advance_all_cbs()) {
                /* Some ready to invoke, so initiate later invocation. */
                invoke_rcu_core();
                return 1;
        }
        rdtp->last_accelerate = jiffies;

        /* Request timer delay depending on laziness, and round. */
        if (!rdtp->all_lazy) {
                *dj = round_up(rcu_idle_gp_delay + jiffies,
                               rcu_idle_gp_delay) - jiffies;
        } else {
                *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
        }
        return 0;
}
1517 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
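
/*
 * Worked example of the rounding above (hypothetical values): with
 * rcu_idle_gp_delay = 4 and jiffies = 1003, round_up(4 + 1003, 4) =
 * 1008, so *dj = 5 jiffies.  Aligning every CPU's wakeup to the same
 * four-jiffy boundary lets idle CPUs wake in batches instead of each
 * waking at a slightly different time.
 */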
1518
1519 /*
1520  * Prepare a CPU for idle from an RCU perspective.  The first major task
1521  * is to sense whether nohz mode has been enabled or disabled via sysfs.
1522  * The second major task is to check to see if a non-lazy callback has
1523  * arrived at a CPU that previously had only lazy callbacks.  The third
1524  * major task is to accelerate (that is, assign grace-period numbers to)
1525  * any recently arrived callbacks.
1526  *
1527  * The caller must have disabled interrupts.
1528  */
1529 static void rcu_prepare_for_idle(void)
1530 {
1531 #ifndef CONFIG_RCU_NOCB_CPU_ALL
1532         bool needwake;
1533         struct rcu_data *rdp;
1534         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
1535         struct rcu_node *rnp;
1536         struct rcu_state *rsp;
1537         int tne;
1538
1539         /* Handle nohz enablement switches conservatively. */
1540         tne = ACCESS_ONCE(tick_nohz_active);
1541         if (tne != rdtp->tick_nohz_enabled_snap) {
1542                 if (rcu_cpu_has_callbacks(NULL))
1543                         invoke_rcu_core(); /* force nohz to see update. */
1544                 rdtp->tick_nohz_enabled_snap = tne;
1545                 return;
1546         }
1547         if (!tne)
1548                 return;
1549
1550         /* If this is a no-CBs CPU, no callbacks, just return. */
1551         if (rcu_is_nocb_cpu(smp_processor_id()))
1552                 return;
1553
1554         /*
1555          * If a non-lazy callback arrived at a CPU having only lazy
1556          * callbacks, invoke RCU core for the side-effect of recalculating
1557          * idle duration on re-entry to idle.
1558          */
1559         if (rdtp->all_lazy &&
1560             rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) {
1561                 rdtp->all_lazy = false;
1562                 rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
1563                 invoke_rcu_core();
1564                 return;
1565         }
1566
1567         /*
1568          * If we have not yet accelerated this jiffy, accelerate all
1569          * callbacks on this CPU.
1570          */
1571         if (rdtp->last_accelerate == jiffies)
1572                 return;
1573         rdtp->last_accelerate = jiffies;
1574         for_each_rcu_flavor(rsp) {
1575                 rdp = this_cpu_ptr(rsp->rda);
1576                 if (!*rdp->nxttail[RCU_DONE_TAIL])
1577                         continue;
1578                 rnp = rdp->mynode;
1579                 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
1580                 smp_mb__after_unlock_lock();
1581                 needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
1582                 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1583                 if (needwake)
1584                         rcu_gp_kthread_wake(rsp);
1585         }
1586 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1587 }
1588
1589 /*
1590  * Clean up for exit from idle.  Attempt to advance callbacks based on
1591  * any grace periods that elapsed while the CPU was idle, and if any
1592  * callbacks are now ready to invoke, initiate invocation.
1593  */
1594 static void rcu_cleanup_after_idle(void)
1595 {
1596 #ifndef CONFIG_RCU_NOCB_CPU_ALL
1597         if (rcu_is_nocb_cpu(smp_processor_id()))
1598                 return;
1599         if (rcu_try_advance_all_cbs())
1600                 invoke_rcu_core();
1601 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1602 }
1603
1604 /*
1605  * Keep a running count of the number of non-lazy callbacks posted
1606  * on this CPU.  This running counter (which is never decremented) allows
1607  * rcu_prepare_for_idle() to detect when something out of the idle loop
1608  * posts a callback, even if an equal number of callbacks are invoked.
1609  * Of course, callbacks should only be posted from within a trace event
1610  * designed to be called from idle or from within RCU_NONIDLE().
1611  */
1612 static void rcu_idle_count_callbacks_posted(void)
1613 {
1614         __this_cpu_add(rcu_dynticks.nonlazy_posted, 1);
1615 }
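
/*
 * Usage example (illustrative): rcu_needs_cpu() snapshots
 * ->nonlazy_posted into ->nonlazy_posted_snap before the CPU enters
 * idle.  If rcu_prepare_for_idle() later sees the two values differ
 * while ->all_lazy is set, a non-lazy callback must have arrived in
 * the meantime, so the idle-duration decision is recomputed via
 * invoke_rcu_core().
 */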
1616
1617 /*
1618  * Data for flushing lazy RCU callbacks at OOM time.
1619  */
1620 static atomic_t oom_callback_count;
1621 static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq);
1622
1623 /*
1624  * RCU OOM callback -- decrement the outstanding count and deliver the
1625  * wake-up if we are the last one.
1626  */
1627 static void rcu_oom_callback(struct rcu_head *rhp)
1628 {
1629         if (atomic_dec_and_test(&oom_callback_count))
1630                 wake_up(&oom_callback_wq);
1631 }
1632
1633 /*
1634  * Post an rcu_oom_notify callback on the current CPU if it has at
1635  * least one lazy callback.  This will unnecessarily post callbacks
1636  * to CPUs that already have a non-lazy callback at the end of their
1637  * callback list, but this is an infrequent operation, so accept some
1638  * extra overhead to keep things simple.
1639  */
1640 static void rcu_oom_notify_cpu(void *unused)
1641 {
1642         struct rcu_state *rsp;
1643         struct rcu_data *rdp;
1644
1645         for_each_rcu_flavor(rsp) {
1646                 rdp = raw_cpu_ptr(rsp->rda);
1647                 if (rdp->qlen_lazy != 0) {
1648                         atomic_inc(&oom_callback_count);
1649                         rsp->call(&rdp->oom_head, rcu_oom_callback);
1650                 }
1651         }
1652 }
1653
1654 /*
1655  * If low on memory, ensure that each CPU has a non-lazy callback.
1656  * This will wake up CPUs that have only lazy callbacks, in turn
1657  * ensuring that they free up the corresponding memory in a timely manner.
1658  * Because an uncertain amount of memory will be freed in some uncertain
1659  * timeframe, we do not claim to have freed anything.
1660  */
1661 static int rcu_oom_notify(struct notifier_block *self,
1662                           unsigned long notused, void *nfreed)
1663 {
1664         int cpu;
1665
1666         /* Wait for callbacks from earlier instance to complete. */
1667         wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0);
1668         smp_mb(); /* Ensure callback reuse happens after callback invocation. */
1669
1670         /*
1671          * Prevent premature wakeup: ensure that all increments happen
1672          * before there is a chance of the counter reaching zero.
1673          */
1674         atomic_set(&oom_callback_count, 1);
1675
1676         get_online_cpus();
1677         for_each_online_cpu(cpu) {
1678                 smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1);
1679                 cond_resched_rcu_qs();
1680         }
1681         put_online_cpus();
1682
1683         /* Unconditionally decrement: no need to wake ourselves up. */
1684         atomic_dec(&oom_callback_count);
1685
1686         return NOTIFY_OK;
1687 }
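
/*
 * Counting protocol by example (hypothetical two-CPU case): the
 * notifier pre-biases oom_callback_count to 1, each of the two CPUs
 * with lazy callbacks posts rcu_oom_callback() (count = 3), and the
 * notifier then drops its bias (count = 2).  As the two callbacks are
 * invoked, the count falls to 0, waking oom_callback_wq so that a
 * later rcu_oom_notify() invocation may safely reuse the rcu_head
 * structures.
 */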
1688
1689 static struct notifier_block rcu_oom_nb = {
1690         .notifier_call = rcu_oom_notify
1691 };
1692
1693 static int __init rcu_register_oom_notifier(void)
1694 {
1695         register_oom_notifier(&rcu_oom_nb);
1696         return 0;
1697 }
1698 early_initcall(rcu_register_oom_notifier);
1699
1700 #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1701
1702 #ifdef CONFIG_RCU_CPU_STALL_INFO
1703
1704 #ifdef CONFIG_RCU_FAST_NO_HZ
1705
1706 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1707 {
1708         struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu);
1709         unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap;
1710
1711         sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c",
1712                 rdtp->last_accelerate & 0xffff, jiffies & 0xffff,
1713                 ulong2long(nlpd),
1714                 rdtp->all_lazy ? 'L' : '.',
1715                 rdtp->tick_nohz_enabled_snap ? '.' : 'D');
1716 }
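
/*
 * Example of the resulting diagnostic string (hypothetical values):
 *
 *      "last_accelerate: 3e8c/3f01, nonlazy_posted: 25, .."
 *
 * where an 'L' in place of the first '.' would indicate an all-lazy
 * callback list, and a 'D' in place of the second would indicate that
 * the snapshot of tick_nohz_active showed nohz disabled.
 */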
1717
1718 #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */
1719
1720 static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
1721 {
1722         *cp = '\0';
1723 }
1724
1725 #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */
1726
1727 /* Initiate the stall-info list. */
1728 static void print_cpu_stall_info_begin(void)
1729 {
1730         pr_cont("\n");
1731 }
1732
1733 /*
1734  * Print out diagnostic information for the specified stalled CPU.
1735  *
1736  * If the specified CPU is aware of the current RCU grace period
1737  * (flavor specified by rsp), then print the number of scheduling
1738  * clock interrupts the CPU has taken during the time that it has
1739  * been aware.  Otherwise, print the number of RCU grace periods
1740  * that this CPU is ignorant of, for example, "1" if the CPU was
1741  * aware of the previous grace period.
1742  *
1743  * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info.
1744  */
1745 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1746 {
1747         char fast_no_hz[72];
1748         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1749         struct rcu_dynticks *rdtp = rdp->dynticks;
1750         char *ticks_title;
1751         unsigned long ticks_value;
1752
1753         if (rsp->gpnum == rdp->gpnum) {
1754                 ticks_title = "ticks this GP";
1755                 ticks_value = rdp->ticks_this_gp;
1756         } else {
1757                 ticks_title = "GPs behind";
1758                 ticks_value = rsp->gpnum - rdp->gpnum;
1759         }
1760         print_cpu_stall_fast_no_hz(fast_no_hz, cpu);
1761         pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u fqs=%ld %s\n",
1762                cpu, ticks_value, ticks_title,
1763                atomic_read(&rdtp->dynticks) & 0xfff,
1764                rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting,
1765                rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu),
1766                ACCESS_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart,
1767                fast_no_hz);
1768 }
1769
1770 /* Terminate the stall-info list. */
1771 static void print_cpu_stall_info_end(void)
1772 {
1773         pr_err("\t");
1774 }
1775
1776 /* Zero ->ticks_this_gp for all flavors of RCU. */
1777 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1778 {
1779         rdp->ticks_this_gp = 0;
1780         rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id());
1781 }
1782
1783 /* Increment ->ticks_this_gp for all flavors of RCU. */
1784 static void increment_cpu_stall_ticks(void)
1785 {
1786         struct rcu_state *rsp;
1787
1788         for_each_rcu_flavor(rsp)
1789                 raw_cpu_inc(rsp->rda->ticks_this_gp);
1790 }
1791
1792 #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */
1793
1794 static void print_cpu_stall_info_begin(void)
1795 {
1796         pr_cont(" {");
1797 }
1798
1799 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu)
1800 {
1801         pr_cont(" %d", cpu);
1802 }
1803
1804 static void print_cpu_stall_info_end(void)
1805 {
1806         pr_cont("} ");
1807 }
1808
1809 static void zero_cpu_stall_ticks(struct rcu_data *rdp)
1810 {
1811 }
1812
1813 static void increment_cpu_stall_ticks(void)
1814 {
1815 }
1816
1817 #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */
1818
1819 #ifdef CONFIG_RCU_NOCB_CPU
1820
1821 /*
1822  * Offload callback processing from the boot-time-specified set of CPUs
1823  * specified by rcu_nocb_mask.  For each CPU in the set, there is a
1824  * kthread created that pulls the callbacks from the corresponding CPU,
1825  * waits for a grace period to elapse, and invokes the callbacks.
1826  * The no-CBs CPUs do a wake_up() on their kthread when they insert
1827  * a callback into any empty list, unless the rcu_nocb_poll boot parameter
1828  * has been specified, in which case each kthread actively polls its
1829  * CPU.  (Which isn't so great for energy efficiency, but which does
1830  * reduce RCU's overhead on that CPU.)
1831  *
1832  * This is intended to be used in conjunction with Frederic Weisbecker's
1833  * adaptive-idle work, which would seriously reduce OS jitter on CPUs
1834  * running CPU-bound user-mode computations.
1835  *
1836  * Offloading of callback processing could also in theory be used as
1837  * an energy-efficiency measure because CPUs with no RCU callbacks
1838  * queued are more aggressive about entering dyntick-idle mode.
1839  */
1840
1841
1842 /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */
1843 static int __init rcu_nocb_setup(char *str)
1844 {
1845         alloc_bootmem_cpumask_var(&rcu_nocb_mask);
1846         have_rcu_nocb_mask = true;
1847         cpulist_parse(str, rcu_nocb_mask);
1848         return 1;
1849 }
1850 __setup("rcu_nocbs=", rcu_nocb_setup);
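
/*
 * Example boot-time usage (illustrative): "rcu_nocbs=1-3,7" offloads
 * callback processing from CPUs 1, 2, 3, and 7.  The argument is
 * parsed by cpulist_parse(), so the usual cpu-list range and comma
 * syntax applies.
 */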
1851
1852 static int __init parse_rcu_nocb_poll(char *arg)
1853 {
1854         rcu_nocb_poll = 1;
1855         return 0;
1856 }
1857 early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
1858
1859 /*
1860  * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
1861  * grace period.
1862  */
1863 static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
1864 {
1865         wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
1866 }
1867
1868 /*
1869  * Set the root rcu_node structure's ->need_future_gp field
1870  * based on the sum of those of all rcu_node structures.  This does
1871  * double-count the root rcu_node structure's requests, but this
1872  * is necessary to handle the possibility of a rcu_nocb_kthread()
1873  * having awakened during the time that the rcu_node structures
1874  * were being updated for the end of the previous grace period.
1875  */
1876 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
1877 {
1878         rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
1879 }
1880
1881 static void rcu_init_one_nocb(struct rcu_node *rnp)
1882 {
1883         init_waitqueue_head(&rnp->nocb_gp_wq[0]);
1884         init_waitqueue_head(&rnp->nocb_gp_wq[1]);
1885 }
1886
1887 #ifndef CONFIG_RCU_NOCB_CPU_ALL
1888 /* Is the specified CPU a no-CBs CPU? */
1889 bool rcu_is_nocb_cpu(int cpu)
1890 {
1891         if (have_rcu_nocb_mask)
1892                 return cpumask_test_cpu(cpu, rcu_nocb_mask);
1893         return false;
1894 }
1895 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
1896
1897 /*
1898  * Kick the leader kthread for this NOCB group.
1899  */
1900 static void wake_nocb_leader(struct rcu_data *rdp, bool force)
1901 {
1902         struct rcu_data *rdp_leader = rdp->nocb_leader;
1903
1904         if (!ACCESS_ONCE(rdp_leader->nocb_kthread))
1905                 return;
1906         if (ACCESS_ONCE(rdp_leader->nocb_leader_sleep) || force) {
1907                 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
1908                 ACCESS_ONCE(rdp_leader->nocb_leader_sleep) = false;
1909                 wake_up(&rdp_leader->nocb_wq);
1910         }
1911 }
1912
1913 /*
1914  * Does the specified CPU need an RCU callback for the specified flavor
1915  * of rcu_barrier()?
1916  */
1917 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
1918 {
1919         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
1920         unsigned long ret;
1921 #ifdef CONFIG_PROVE_RCU
1922         struct rcu_head *rhp;
1923 #endif /* #ifdef CONFIG_PROVE_RCU */
1924
1925         /*
1926          * Check count of all no-CBs callbacks awaiting invocation.
1927          * There needs to be a barrier before this function is called,
1928          * but associated with a prior determination that no more
1929          * callbacks would be posted.  In the worst case, the first
1930  * barrier in _rcu_barrier() suffices (but the caller cannot
1931  * necessarily rely on this, and it is no substitute for the caller
1932  * getting the concurrency design right!).  There must also be
1933  * a barrier between the following load and the posting of a callback
1934          * (if a callback is in fact needed).  This is associated with an
1935          * atomic_inc() in the caller.
1936          */
1937         ret = atomic_long_read(&rdp->nocb_q_count);
1938
1939 #ifdef CONFIG_PROVE_RCU
1940         rhp = ACCESS_ONCE(rdp->nocb_head);
1941         if (!rhp)
1942                 rhp = ACCESS_ONCE(rdp->nocb_gp_head);
1943         if (!rhp)
1944                 rhp = ACCESS_ONCE(rdp->nocb_follower_head);
1945
1946         /* Having no rcuo kthread but CBs after scheduler starts is bad! */
1947         if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
1948                 /* RCU callback enqueued before CPU first came online??? */
1949                 pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
1950                        cpu, rhp->func);
1951                 WARN_ON_ONCE(1);
1952         }
1953 #endif /* #ifdef CONFIG_PROVE_RCU */
1954
1955         return !!ret;
1956 }
1957
1958 /*
1959  * Enqueue the specified string of rcu_head structures onto the specified
1960  * CPU's no-CBs lists.  The CPU is specified by rdp, the head of the
1961  * string by rhp, and the tail of the string by rhtp.  The non-lazy/lazy
1962  * counts are supplied by rhcount and rhcount_lazy.
1963  *
1964  * If warranted, also wake up the kthread servicing this CPU's queues.
1965  */
1966 static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1967                                     struct rcu_head *rhp,
1968                                     struct rcu_head **rhtp,
1969                                     int rhcount, int rhcount_lazy,
1970                                     unsigned long flags)
1971 {
1972         int len;
1973         struct rcu_head **old_rhpp;
1974         struct task_struct *t;
1975
1976         /* Enqueue the callback on the nocb list and update counts. */
1977         atomic_long_add(rhcount, &rdp->nocb_q_count);
1978         /* rcu_barrier() relies on ->nocb_q_count add before xchg. */
1979         old_rhpp = xchg(&rdp->nocb_tail, rhtp);
1980         ACCESS_ONCE(*old_rhpp) = rhp;
1981         atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy);
1982         smp_mb__after_atomic(); /* Store *old_rhpp before _wake test. */
1983
1984         /* If we are not being polled and there is a kthread, awaken it ... */
1985         t = ACCESS_ONCE(rdp->nocb_kthread);
1986         if (rcu_nocb_poll || !t) {
1987                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1988                                     TPS("WakeNotPoll"));
1989                 return;
1990         }
1991         len = atomic_long_read(&rdp->nocb_q_count);
1992         if (old_rhpp == &rdp->nocb_head) {
1993                 if (!irqs_disabled_flags(flags)) {
1994                         /* ... if queue was empty ... */
1995                         wake_nocb_leader(rdp, false);
1996                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1997                                             TPS("WakeEmpty"));
1998                 } else {
1999                         rdp->nocb_defer_wakeup = RCU_NOGP_WAKE;
2000                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2001                                             TPS("WakeEmptyIsDeferred"));
2002                 }
2003                 rdp->qlen_last_fqs_check = 0;
2004         } else if (len > rdp->qlen_last_fqs_check + qhimark) {
2005                 /* ... or if many callbacks queued. */
2006                 if (!irqs_disabled_flags(flags)) {
2007                         wake_nocb_leader(rdp, true);
2008                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2009                                             TPS("WakeOvf"));
2010                 } else {
2011                         rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE;
2012                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2013                                             TPS("WakeOvfIsDeferred"));
2014                 }
2015                 rdp->qlen_last_fqs_check = LONG_MAX / 2;
2016         } else {
2017                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot"));
2018         }
2019         return;
2020 }
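
/*
 * Wakeup policy in brief (restating the cases above): enqueuing onto
 * an empty list wakes the leader immediately, or defers the wakeup if
 * interrupts are disabled; exceeding qlen_last_fqs_check + qhimark
 * forces a wakeup (again possibly deferred); otherwise the enqueue
 * merely traces "WakeNot" and relies on a wakeup already in flight.
 */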
2021
2022 /*
2023  * This is a helper for __call_rcu(), which invokes this when the normal
2024  * callback queue is inoperable.  If this is not a no-CBs CPU, this
2025  * function returns failure back to __call_rcu(), which can complain
2026  * appropriately.
2027  *
2028  * Otherwise, this function queues the callback where the corresponding
2029  * "rcuo" kthread can find it.
2030  */
2031 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2032                             bool lazy, unsigned long flags)
2033 {
2034
2035         if (!rcu_is_nocb_cpu(rdp->cpu))
2036                 return false;
2037         __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy, flags);
2038         if (__is_kfree_rcu_offset((unsigned long)rhp->func))
2039                 trace_rcu_kfree_callback(rdp->rsp->name, rhp,
2040                                          (unsigned long)rhp->func,
2041                                          -atomic_long_read(&rdp->nocb_q_count_lazy),
2042                                          -atomic_long_read(&rdp->nocb_q_count));
2043         else
2044                 trace_rcu_callback(rdp->rsp->name, rhp,
2045                                    -atomic_long_read(&rdp->nocb_q_count_lazy),
2046                                    -atomic_long_read(&rdp->nocb_q_count));
2047
2048         /*
2049          * If called from an extended quiescent state with interrupts
2050          * disabled, invoke the RCU core in order to allow the idle-entry
2051          * deferred-wakeup check to function.
2052          */
2053         if (irqs_disabled_flags(flags) &&
2054             !rcu_is_watching() &&
2055             cpu_online(smp_processor_id()))
2056                 invoke_rcu_core();
2057
2058         return true;
2059 }
2060
2061 /*
2062  * Adopt orphaned callbacks on a no-CBs CPU, or return false if this is
2063  * not a no-CBs CPU.
2064  */
2065 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2066                                                      struct rcu_data *rdp,
2067                                                      unsigned long flags)
2068 {
2069         long ql = rsp->qlen;
2070         long qll = rsp->qlen_lazy;
2071
2072         /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
2073         if (!rcu_is_nocb_cpu(smp_processor_id()))
2074                 return false;
2075         rsp->qlen = 0;
2076         rsp->qlen_lazy = 0;
2077
2078         /* First, enqueue the donelist, if any.  This preserves CB ordering. */
2079         if (rsp->orphan_donelist != NULL) {
2080                 __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist,
2081                                         rsp->orphan_donetail, ql, qll, flags);
2082                 ql = qll = 0;
2083                 rsp->orphan_donelist = NULL;
2084                 rsp->orphan_donetail = &rsp->orphan_donelist;
2085         }
2086         if (rsp->orphan_nxtlist != NULL) {
2087                 __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist,
2088                                         rsp->orphan_nxttail, ql, qll, flags);
2089                 ql = qll = 0;
2090                 rsp->orphan_nxtlist = NULL;
2091                 rsp->orphan_nxttail = &rsp->orphan_nxtlist;
2092         }
2093         return true;
2094 }
2095
2096 /*
2097  * If necessary, kick off a new grace period, and either way wait
2098  * for a subsequent grace period to complete.
2099  */
2100 static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2101 {
2102         unsigned long c;
2103         bool d;
2104         unsigned long flags;
2105         bool needwake;
2106         struct rcu_node *rnp = rdp->mynode;
2107
2108         raw_spin_lock_irqsave(&rnp->lock, flags);
2109         smp_mb__after_unlock_lock();
2110         needwake = rcu_start_future_gp(rnp, rdp, &c);
2111         raw_spin_unlock_irqrestore(&rnp->lock, flags);
2112         if (needwake)
2113                 rcu_gp_kthread_wake(rdp->rsp);
2114
2115         /*
2116          * Wait for the grace period.  Do so interruptibly to avoid messing
2117          * up the load average.
2118          */
2119         trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
2120         for (;;) {
2121                 wait_event_interruptible(
2122                         rnp->nocb_gp_wq[c & 0x1],
2123                         (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c)));
2124                 if (likely(d))
2125                         break;
2126                 WARN_ON(signal_pending(current));
2127                 trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait"));
2128         }
2129         trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait"));
2130         smp_mb(); /* Ensure that CB invocation happens after GP end. */
2131 }
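
/*
 * Indexing note (illustrative): rnp->nocb_gp_wq[] is indexed by the
 * bottom bit of the grace-period number, so waiters on grace period c
 * sleep on nocb_gp_wq[c & 0x1] while waiters on c + 1 use the other
 * element, and rcu_nocb_gp_cleanup() wakes only the element whose
 * grace period just completed.
 */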
2132
2133 /*
2134  * Leaders come here to wait for additional callbacks to show up.
2135  * This function does not return until callbacks appear.
2136  */
2137 static void nocb_leader_wait(struct rcu_data *my_rdp)
2138 {
2139         bool firsttime = true;
2140         bool gotcbs;
2141         struct rcu_data *rdp;
2142         struct rcu_head **tail;
2143
2144 wait_again:
2145
2146         /* Wait for callbacks to appear. */
2147         if (!rcu_nocb_poll) {
2148                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
2149                 wait_event_interruptible(my_rdp->nocb_wq,
2150                                 !ACCESS_ONCE(my_rdp->nocb_leader_sleep));
2151                 /* Memory barrier handled by smp_mb() calls below and repoll. */
2152         } else if (firsttime) {
2153                 firsttime = false; /* Don't drown trace log with "Poll"! */
2154                 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll");
2155         }
2156
2157         /*
2158          * Each pass through the following loop checks a follower for CBs.
2159          * We are our own first follower.  Any CBs found are moved to
2160          * nocb_gp_head, where they await a grace period.
2161          */
2162         gotcbs = false;
2163         for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2164                 rdp->nocb_gp_head = ACCESS_ONCE(rdp->nocb_head);
2165                 if (!rdp->nocb_gp_head)
2166                         continue;  /* No CBs here, try next follower. */
2167
2168                 /* Move callbacks to wait-for-GP list, which is empty. */
2169                 ACCESS_ONCE(rdp->nocb_head) = NULL;
2170                 rdp->nocb_gp_tail = xchg(&rdp->nocb_tail, &rdp->nocb_head);
2171                 gotcbs = true;
2172         }
2173
2174         /*
2175          * If there were no callbacks, sleep a bit, rescan after a
2176          * memory barrier, and go retry.
2177          */
2178         if (unlikely(!gotcbs)) {
2179                 if (!rcu_nocb_poll)
2180                         trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2181                                             "WokeEmpty");
2182                 WARN_ON(signal_pending(current));
2183                 schedule_timeout_interruptible(1);
2184
2185                 /* Rescan in case we were a victim of memory ordering. */
2186                 my_rdp->nocb_leader_sleep = true;
2187                 smp_mb();  /* Ensure _sleep true before scan. */
2188                 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower)
2189                         if (ACCESS_ONCE(rdp->nocb_head)) {
2190                                 /* Found CB, so short-circuit next wait. */
2191                                 my_rdp->nocb_leader_sleep = false;
2192                                 break;
2193                         }
2194                 goto wait_again;
2195         }
2196
2197         /* Wait for one grace period. */
2198         rcu_nocb_wait_gp(my_rdp);
2199
2200         /*
2201          * We left ->nocb_leader_sleep unset to reduce cache thrashing.
2202          * We set it now, but recheck for new callbacks while
2203          * traversing our follower list.
2204          */
2205         my_rdp->nocb_leader_sleep = true;
2206         smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
2207
2208         /* Each pass through the following loop wakes a follower, if needed. */
2209         for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2210                 if (ACCESS_ONCE(rdp->nocb_head))
2211                         my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
2212                 if (!rdp->nocb_gp_head)
2213                         continue; /* No CBs, so no need to wake follower. */
2214
2215                 /* Append callbacks to follower's "done" list. */
2216                 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail);
2217                 *tail = rdp->nocb_gp_head;
2218                 smp_mb__after_atomic(); /* Store *tail before wakeup. */
2219                 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2220                         /*
2221                          * List was empty, wake up the follower.
2222                          * Memory barrier supplied by the
2223                          * smp_mb__after_atomic() above.
2223                          */
2224                         wake_up(&rdp->nocb_wq);
2225                 }
2226         }
2227
2228         /* If we (the leader) don't have CBs, go wait some more. */
2229         if (!my_rdp->nocb_follower_head)
2230                 goto wait_again;
2231 }
2232
2233 /*
2234  * Followers come here to wait for additional callbacks to show up.
2235  * This function does not return until callbacks appear.
2236  */
2237 static void nocb_follower_wait(struct rcu_data *rdp)
2238 {
2239         bool firsttime = true;
2240
2241         for (;;) {
2242                 if (!rcu_nocb_poll) {
2243                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2244                                             "FollowerSleep");
2245                         wait_event_interruptible(rdp->nocb_wq,
2246                                                  ACCESS_ONCE(rdp->nocb_follower_head));
2247                 } else if (firsttime) {
2248                         /* Don't drown trace log with "Poll"! */
2249                         firsttime = false;
2250                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
2251                 }
2252                 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2253                         /* ^^^ Ensure CB invocation follows _head test. */
2254                         return;
2255                 }
2256                 if (!rcu_nocb_poll)
2257                         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2258                                             "WokeEmpty");
2259                 WARN_ON(signal_pending(current));
2260                 schedule_timeout_interruptible(1);
2261         }
2262 }
2263
2264 /*
2265  * Per-rcu_data kthread, but only for no-CBs CPUs.  Each kthread invokes
2266  * callbacks queued by the corresponding no-CBs CPU; however, there is
2267  * an optional leader-follower relationship so that the grace-period
2268  * kthreads don't have to do quite so many wakeups.
2269  */
2270 static int rcu_nocb_kthread(void *arg)
2271 {
2272         int c, cl;
2273         struct rcu_head *list;
2274         struct rcu_head *next;
2275         struct rcu_head **tail;
2276         struct rcu_data *rdp = arg;
2277
2278         /* Each pass through this loop invokes one batch of callbacks */
2279         for (;;) {
2280                 /* Wait for callbacks. */
2281                 if (rdp->nocb_leader == rdp)
2282                         nocb_leader_wait(rdp);
2283                 else
2284                         nocb_follower_wait(rdp);
2285
2286                 /* Pull the ready-to-invoke callbacks onto local list. */
2287                 list = ACCESS_ONCE(rdp->nocb_follower_head);
2288                 BUG_ON(!list);
2289                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty");
2290                 ACCESS_ONCE(rdp->nocb_follower_head) = NULL;
2291                 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2292
2293                 /* Each pass through the following loop invokes a callback. */
2294                 trace_rcu_batch_start(rdp->rsp->name,
2295                                       atomic_long_read(&rdp->nocb_q_count_lazy),
2296                                       atomic_long_read(&rdp->nocb_q_count), -1);
2297                 c = cl = 0;
2298                 while (list) {
2299                         next = list->next;
2300                         /* Wait for enqueuing to complete, if needed. */
2301                         while (next == NULL && &list->next != tail) {
2302                                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2303                                                     TPS("WaitQueue"));
2304                                 schedule_timeout_interruptible(1);
2305                                 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2306                                                     TPS("WokeQueue"));
2307                                 next = list->next;
2308                         }
2309                         debug_rcu_head_unqueue(list);
2310                         local_bh_disable();
2311                         if (__rcu_reclaim(rdp->rsp->name, list))
2312                                 cl++;
2313                         c++;
2314                         local_bh_enable();
2315                         list = next;
2316                 }
2317                 trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1);
2318                 smp_mb__before_atomic();  /* _add after CB invocation. */
2319                 atomic_long_add(-c, &rdp->nocb_q_count);
2320                 atomic_long_add(-cl, &rdp->nocb_q_count_lazy);
2321                 rdp->n_nocbs_invoked += c;
2322         }
2323         return 0;
2324 }
2325
2326 /* Is a deferred wakeup of rcu_nocb_kthread() required? */
2327 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2328 {
2329         return ACCESS_ONCE(rdp->nocb_defer_wakeup);
2330 }
2331
2332 /* Do a deferred wakeup of rcu_nocb_kthread(). */
2333 static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2334 {
2335         int ndw;
2336
2337         if (!rcu_nocb_need_deferred_wakeup(rdp))
2338                 return;
2339         ndw = ACCESS_ONCE(rdp->nocb_defer_wakeup);
2340         ACCESS_ONCE(rdp->nocb_defer_wakeup) = RCU_NOGP_WAKE_NOT;
2341         wake_nocb_leader(rdp, ndw == RCU_NOGP_WAKE_FORCE);
2342         trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2343 }
2344
2345 void __init rcu_init_nohz(void)
2346 {
2347         int cpu;
2348         bool need_rcu_nocb_mask = true;
2349         struct rcu_state *rsp;
2350
2351 #ifdef CONFIG_RCU_NOCB_CPU_NONE
2352         need_rcu_nocb_mask = false;
2353 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_NONE */
2354
2355 #if defined(CONFIG_NO_HZ_FULL)
2356         if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
2357                 need_rcu_nocb_mask = true;
2358 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
2359
2360         if (!have_rcu_nocb_mask && need_rcu_nocb_mask) {
2361                 if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
2362                         pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
2363                         return;
2364                 }
2365                 have_rcu_nocb_mask = true;
2366         }
2367         if (!have_rcu_nocb_mask)
2368                 return;
2369
2370 #ifdef CONFIG_RCU_NOCB_CPU_ZERO
2371         pr_info("\tOffload RCU callbacks from CPU 0\n");
2372         cpumask_set_cpu(0, rcu_nocb_mask);
2373 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */
2374 #ifdef CONFIG_RCU_NOCB_CPU_ALL
2375         pr_info("\tOffload RCU callbacks from all CPUs\n");
2376         cpumask_copy(rcu_nocb_mask, cpu_possible_mask);
2377 #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */
2378 #if defined(CONFIG_NO_HZ_FULL)
2379         if (tick_nohz_full_running)
2380                 cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
2381 #endif /* #if defined(CONFIG_NO_HZ_FULL) */
2382
2383         if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
2384                 pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n");
2385                 cpumask_and(rcu_nocb_mask, cpu_possible_mask,
2386                             rcu_nocb_mask);
2387         }
2388         pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
2389                 cpumask_pr_args(rcu_nocb_mask));
2390         if (rcu_nocb_poll)
2391                 pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
2392
2393         for_each_rcu_flavor(rsp) {
2394                 for_each_cpu(cpu, rcu_nocb_mask) {
2395                         struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
2396
2397                         /*
2398                          * If there are early callbacks, they will need
2399                          * to be moved to the nocb lists.
2400                          */
2401                         WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
2402                                      &rdp->nxtlist &&
2403                                      rdp->nxttail[RCU_NEXT_TAIL] != NULL);
2404                         init_nocb_callback_list(rdp);
2405                 }
2406                 rcu_organize_nocb_kthreads(rsp);
2407         }
2408 }
2409
2410 /* Initialize per-rcu_data variables for no-CBs CPUs. */
2411 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2412 {
2413         rdp->nocb_tail = &rdp->nocb_head;
2414         init_waitqueue_head(&rdp->nocb_wq);
2415         rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2416 }
2417
2418 /*
2419  * If the specified CPU is a no-CBs CPU that does not already have its
2420  * rcuo kthread for the specified RCU flavor, spawn it.  If the CPUs are
2421  * brought online out of order, this can require re-organizing the
2422  * leader-follower relationships.
2423  */
2424 static void rcu_spawn_one_nocb_kthread(struct rcu_state *rsp, int cpu)
2425 {
2426         struct rcu_data *rdp;
2427         struct rcu_data *rdp_last;
2428         struct rcu_data *rdp_old_leader;
2429         struct rcu_data *rdp_spawn = per_cpu_ptr(rsp->rda, cpu);
2430         struct task_struct *t;
2431
2432         /*
2433          * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
2434          * then nothing to do.
2435          */
2436         if (!rcu_is_nocb_cpu(cpu) || rdp_spawn->nocb_kthread)
2437                 return;
2438
2439         /* If we didn't spawn the leader first, reorganize! */
2440         rdp_old_leader = rdp_spawn->nocb_leader;
2441         if (rdp_old_leader != rdp_spawn && !rdp_old_leader->nocb_kthread) {
2442                 rdp_last = NULL;
2443                 rdp = rdp_old_leader;
2444                 do {
2445                         rdp->nocb_leader = rdp_spawn;
2446                         if (rdp_last && rdp != rdp_spawn)
2447                                 rdp_last->nocb_next_follower = rdp;
2448                         if (rdp == rdp_spawn) {
2449                                 rdp = rdp->nocb_next_follower;
2450                         } else {
2451                                 rdp_last = rdp;
2452                                 rdp = rdp->nocb_next_follower;
2453                                 rdp_last->nocb_next_follower = NULL;
2454                         }
2455                 } while (rdp);
2456                 rdp_spawn->nocb_next_follower = rdp_old_leader;
2457         }
2458
2459         /* Spawn the kthread for this CPU and RCU flavor. */
2460         t = kthread_run(rcu_nocb_kthread, rdp_spawn,
2461                         "rcuo%c/%d", rsp->abbr, cpu);
2462         BUG_ON(IS_ERR(t));
2463         ACCESS_ONCE(rdp_spawn->nocb_kthread) = t;
2464 }
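
/*
 * Reorganization example (hypothetical): suppose CPUs 4-7 form one
 * group with designated leader 4, but CPU 6 is the first to spawn its
 * kthread.  The loop above walks the old chain 4->5->6->7, points each
 * ->nocb_leader at CPU 6, unlinks CPU 6 from the follower chain, and
 * finally prepends the remainder, leaving CPU 6 leading 4->5->7.
 */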
2465
2466 /*
2467  * If the specified CPU is a no-CBs CPU that does not already have its
2468  * rcuo kthreads, spawn them.
2469  */
2470 static void rcu_spawn_all_nocb_kthreads(int cpu)
2471 {
2472         struct rcu_state *rsp;
2473
2474         if (rcu_scheduler_fully_active)
2475                 for_each_rcu_flavor(rsp)
2476                         rcu_spawn_one_nocb_kthread(rsp, cpu);
2477 }
2478
2479 /*
2480  * Once the scheduler is running, spawn rcuo kthreads for all online
2481  * no-CBs CPUs.  This assumes that the early_initcall()s happen before
2482  * non-boot CPUs come online -- if this changes, we will need to add
2483  * some mutual exclusion.
2484  */
2485 static void __init rcu_spawn_nocb_kthreads(void)
2486 {
2487         int cpu;
2488
2489         for_each_online_cpu(cpu)
2490                 rcu_spawn_all_nocb_kthreads(cpu);
2491 }
2492
2493 /* How many follower CPU IDs per leader?  Default of -1 for sqrt(nr_cpu_ids). */
2494 static int rcu_nocb_leader_stride = -1;
2495 module_param(rcu_nocb_leader_stride, int, 0444);
2496
2497 /*
2498  * Initialize leader-follower relationships for all no-CBs CPUs.
2499  */
2500 static void __init rcu_organize_nocb_kthreads(struct rcu_state *rsp)
2501 {
2502         int cpu;
2503         int ls = rcu_nocb_leader_stride;
2504         int nl = 0;  /* Next leader. */
2505         struct rcu_data *rdp;
2506         struct rcu_data *rdp_leader = NULL;  /* Suppress misguided gcc warn. */
2507         struct rcu_data *rdp_prev = NULL;
2508
2509         if (!have_rcu_nocb_mask)
2510                 return;
2511         if (ls == -1) {
2512                 ls = int_sqrt(nr_cpu_ids);
2513                 rcu_nocb_leader_stride = ls;
2514         }
2515
2516         /*
2517          * Each pass through this loop sets up one rcu_data structure and
2518          * spawns one rcu_nocb_kthread().
2519          */
2520         for_each_cpu(cpu, rcu_nocb_mask) {
2521                 rdp = per_cpu_ptr(rsp->rda, cpu);
2522                 if (rdp->cpu >= nl) {
2523                         /* New leader, set up for followers & next leader. */
2524                         nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
2525                         rdp->nocb_leader = rdp;
2526                         rdp_leader = rdp;
2527                 } else {
2528                         /* Another follower, link to previous leader. */
2529                         rdp->nocb_leader = rdp_leader;
2530                         rdp_prev->nocb_next_follower = rdp;
2531                 }
2532                 rdp_prev = rdp;
2533         }
2534 }
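
/*
 * Worked example (illustrative): with nr_cpu_ids = 16 and all CPUs in
 * rcu_nocb_mask, ls = int_sqrt(16) = 4, so CPUs 0, 4, 8, and 12 become
 * leaders and each is followed by the next three CPUs in the mask.
 * Each leader batches its followers' callbacks through one grace
 * period and then hands them back on the followers' "done" lists.
 */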
2535
2536 /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */
2537 static bool init_nocb_callback_list(struct rcu_data *rdp)
2538 {
2539         if (!rcu_is_nocb_cpu(rdp->cpu))
2540                 return false;
2541
2542         rdp->nxttail[RCU_NEXT_TAIL] = NULL;
2543         return true;
2544 }
2545
2546 #else /* #ifdef CONFIG_RCU_NOCB_CPU */
2547
2548 static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
2549 {
2550         WARN_ON_ONCE(1); /* Should be dead code. */
2551         return false;
2552 }
2553
2554 static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
2555 {
2556 }
2557
2558 static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
2559 {
2560 }
2561
2562 static void rcu_init_one_nocb(struct rcu_node *rnp)
2563 {
2564 }
2565
2566 static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2567                             bool lazy, unsigned long flags)
2568 {
2569         return false;
2570 }
2571
2572 static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp,
2573                                                      struct rcu_data *rdp,
2574                                                      unsigned long flags)
2575 {
2576         return false;
2577 }
2578
2579 static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2580 {
2581 }
2582
2583 static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2584 {
2585         return false;
2586 }
2587
2588 static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2589 {
2590 }
2591
2592 static void rcu_spawn_all_nocb_kthreads(int cpu)
2593 {
2594 }
2595
2596 static void __init rcu_spawn_nocb_kthreads(void)
2597 {
2598 }
2599
2600 static bool init_nocb_callback_list(struct rcu_data *rdp)
2601 {
2602         return false;
2603 }
2604
2605 #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
2606
2607 /*
2608  * An adaptive-ticks CPU can potentially execute in kernel mode for an
2609  * arbitrarily long period of time with the scheduling-clock tick turned
2610  * off.  RCU will be paying attention to this CPU because it is in the
2611  * kernel, but the CPU cannot be guaranteed to be executing the RCU state
2612  * machine because the scheduling-clock tick has been disabled.  Therefore,
2613  * if an adaptive-ticks CPU is failing to respond to the current grace
2614  * period and has not been idle from an RCU perspective, kick it.
2615  */
2616 static void __maybe_unused rcu_kick_nohz_cpu(int cpu)
2617 {
2618 #ifdef CONFIG_NO_HZ_FULL
2619         if (tick_nohz_full_cpu(cpu))
2620                 smp_send_reschedule(cpu);
2621 #endif /* #ifdef CONFIG_NO_HZ_FULL */
2622 }
2623
2624
2625 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
2626
2627 static int full_sysidle_state;          /* Current system-idle state. */
2628 #define RCU_SYSIDLE_NOT         0       /* Some CPU is not idle. */
2629 #define RCU_SYSIDLE_SHORT       1       /* All CPUs idle for brief period. */
2630 #define RCU_SYSIDLE_LONG        2       /* All CPUs idle for long enough. */
2631 #define RCU_SYSIDLE_FULL        3       /* All CPUs idle, ready for sysidle. */
2632 #define RCU_SYSIDLE_FULL_NOTED  4       /* Actually entered sysidle state. */
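
/*
 * Informal transition sketch (summarizing the code below): NOT ->
 * SHORT when a force-quiescent-state scan first finds all
 * non-timekeeping CPUs idle, SHORT -> LONG -> FULL after successive
 * rcu_sysidle_delay() periods of continued idleness, and FULL ->
 * FULL_NOTED once the timekeeping CPU acts on the full-idle state.
 * A non-idle CPU observed in the LONG or deeper states drops the
 * state back to NOT (see rcu_sysidle_cancel()).
 */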
2633
2634 /*
2635  * Invoked to note exit from irq or task transition to idle.  Note that
2636  * usermode execution does -not- count as idle here!  After all, we want
2637  * to detect full-system idle states, not RCU quiescent states and grace
2638  * periods.  The caller must have disabled interrupts.
2639  */
2640 static void rcu_sysidle_enter(int irq)
2641 {
2642         unsigned long j;
2643         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2644
2645         /* If there are no nohz_full= CPUs, no need to track this. */
2646         if (!tick_nohz_full_enabled())
2647                 return;
2648
2649         /* Adjust nesting, check for fully idle. */
2650         if (irq) {
2651                 rdtp->dynticks_idle_nesting--;
2652                 WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2653                 if (rdtp->dynticks_idle_nesting != 0)
2654                         return;  /* Still not fully idle. */
2655         } else {
2656                 if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) ==
2657                     DYNTICK_TASK_NEST_VALUE) {
2658                         rdtp->dynticks_idle_nesting = 0;
2659                 } else {
2660                         rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE;
2661                         WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0);
2662                         return;  /* Still not fully idle. */
2663                 }
2664         }
2665
2666         /* Record start of fully idle period. */
2667         j = jiffies;
2668         ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j;
2669         smp_mb__before_atomic();
2670         atomic_inc(&rdtp->dynticks_idle);
2671         smp_mb__after_atomic();
2672         WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1);
2673 }
2674
2675 /*
2676  * Unconditionally force exit from full system-idle state.  This is
2677  * invoked when a normal CPU exits idle, but must be called separately
2678  * for the timekeeping CPU (tick_do_timer_cpu).  The reason for this
2679  * is that the timekeeping CPU is permitted to take scheduling-clock
2680  * interrupts while the system is in system-idle state, and of course
2681  * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock
2682  * interrupt from any other type of interrupt.
2683  */
2684 void rcu_sysidle_force_exit(void)
2685 {
2686         int oldstate = ACCESS_ONCE(full_sysidle_state);
2687         int newoldstate;
2688
2689         /*
2690          * Each pass through the following loop attempts to exit full
2691          * system-idle state.  If contention proves to be a problem,
2692          * a trylock-based contention tree could be used here.
2693          */
2694         while (oldstate > RCU_SYSIDLE_SHORT) {
2695                 newoldstate = cmpxchg(&full_sysidle_state,
2696                                       oldstate, RCU_SYSIDLE_NOT);
2697                 if (oldstate == newoldstate &&
2698                     oldstate == RCU_SYSIDLE_FULL_NOTED) {
2699                         rcu_kick_nohz_cpu(tick_do_timer_cpu);
2700                         return; /* We cleared it, done! */
2701                 }
2702                 oldstate = newoldstate;
2703         }
2704         smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */
2705 }
2706
2707 /*
2708  * Invoked to note entry to irq or task transition from idle.  Note that
2709  * usermode execution does -not- count as idle here!  The caller must
2710  * have disabled interrupts.
2711  */
2712 static void rcu_sysidle_exit(int irq)
2713 {
2714         struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
2715
2716         /* If there are no nohz_full= CPUs, no need to track this. */
2717         if (!tick_nohz_full_enabled())
2718                 return;
2719
2720         /* Adjust nesting, check for already non-idle. */
2721         if (irq) {
2722                 rdtp->dynticks_idle_nesting++;
2723                 WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2724                 if (rdtp->dynticks_idle_nesting != 1)
2725                         return; /* Already non-idle. */
2726         } else {
2727                 /*
2728                  * Allow for irq misnesting.  Yes, it really is possible
2729                  * to enter an irq handler then never leave it, and maybe
2730                  * also vice versa.  Handle both possibilities.
2731                  */
2732                 if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) {
2733                         rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE;
2734                         WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0);
2735                         return; /* Already non-idle. */
2736                 } else {
2737                         rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE;
2738                 }
2739         }
2740
2741         /* Record end of idle period. */
2742         smp_mb__before_atomic();
2743         atomic_inc(&rdtp->dynticks_idle);
2744         smp_mb__after_atomic();
2745         WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1));
2746
2747         /*
2748          * If we are the timekeeping CPU, we are permitted to be non-idle
2749          * during a system-idle state.  This must be the case, because
2750          * the timekeeping CPU has to take scheduling-clock interrupts
2751          * during the time that the system is transitioning to full
2752          * system-idle state.  This means that the timekeeping CPU must
2753          * invoke rcu_sysidle_force_exit() directly if it does anything
2754          * more than take a scheduling-clock interrupt.
2755          */
2756         if (smp_processor_id() == tick_do_timer_cpu)
2757                 return;
2758
2759         /* Update system-idle state: We are clearly no longer fully idle! */
2760         rcu_sysidle_force_exit();
2761 }
2762
2763 /*
2764  * Check to see if the current CPU is idle.  Note that usermode execution
2765  * does not count as idle.  The caller must have disabled interrupts.
2766  */
2767 static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
2768                                   unsigned long *maxj)
2769 {
2770         int cur;
2771         unsigned long j;
2772         struct rcu_dynticks *rdtp = rdp->dynticks;
2773
2774         /* If there are no nohz_full= CPUs, don't check system-wide idleness. */
2775         if (!tick_nohz_full_enabled())
2776                 return;
2777
2778         /*
2779          * If some other CPU has already reported non-idle, if this is
2780          * not the flavor of RCU that tracks sysidle state, or if this
2781          * not the flavor of RCU that tracks sysidle state, or if this
2782          * CPU is offline or is the timekeeping CPU, nothing to do.
2783         if (!*isidle || rdp->rsp != rcu_state_p ||
2784             cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
2785                 return;
2786         if (rcu_gp_in_progress(rdp->rsp))
2787                 WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
2788
2789         /* Pick up current idle and NMI-nesting counter and check. */
2790         cur = atomic_read(&rdtp->dynticks_idle);
2791         if (cur & 0x1) {
2792                 *isidle = false; /* We are not idle! */
2793                 return;
2794         }
2795         smp_mb(); /* Read counters before timestamps. */
2796
2797         /* Pick up timestamps. */
2798         j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies);
2799         /* If this CPU entered idle more recently, update maxj timestamp. */
2800         if (ULONG_CMP_LT(*maxj, j))
2801                 *maxj = j;
2802 }
2803
2804 /*
2805  * Is this the flavor of RCU that is handling full-system idle?
2806  */
2807 static bool is_sysidle_rcu_state(struct rcu_state *rsp)
2808 {
2809         return rsp == rcu_state_p;
2810 }
2811
2812 /*
2813  * Return a delay in jiffies based on the number of CPUs, rcu_node
2814  * leaf fanout, and jiffies tick rate.  The idea is to allow larger
2815  * systems more time to transition to full-idle state in order to
2816  * avoid the cache thrashing that would otherwise occur on the state variable.
2817  * Really small systems (fewer than a couple of tens of CPUs) should
2818  * instead use a single global atomically incremented counter, and later
2819  * versions of this will automatically reconfigure themselves accordingly.
2820  */
2821 static unsigned long rcu_sysidle_delay(void)
2822 {
2823         if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
2824                 return 0;
2825         return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000);
2826 }
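
/*
 * Worked example (illustrative, assuming HZ=1000 and the default
 * rcu_fanout_leaf of 16): a 64-CPU system gets a delay of
 * DIV_ROUND_UP(64 * 1000, 16 * 1000) = 4 jiffies per state
 * transition, while systems at or below
 * CONFIG_NO_HZ_FULL_SYSIDLE_SMALL take the zero-delay fast path.
 */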

/*
 * Advance the full-system-idle state.  This is invoked when all of
 * the non-timekeeping CPUs are idle.
 */
static void rcu_sysidle(unsigned long j)
{
        /* Check the current state. */
        switch (ACCESS_ONCE(full_sysidle_state)) {
        case RCU_SYSIDLE_NOT:

                /* First time all are idle, so note a short idle period. */
                ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT;
                break;

        case RCU_SYSIDLE_SHORT:

                /*
                 * Idle for a bit, time to advance to next state?
                 * cmpxchg failure means race with non-idle, let them win.
                 */
                if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
                        (void)cmpxchg(&full_sysidle_state,
                                      RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG);
                break;

        case RCU_SYSIDLE_LONG:

                /*
                 * Do an additional check pass before advancing to full.
                 * cmpxchg failure means race with non-idle, let them win.
                 */
                if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay()))
                        (void)cmpxchg(&full_sysidle_state,
                                      RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL);
                break;

        default:
                break;
        }
}
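
/*
 * State progression (illustrative summary):
 *
 *   RCU_SYSIDLE_NOT --> RCU_SYSIDLE_SHORT --> RCU_SYSIDLE_LONG -->
 *   RCU_SYSIDLE_FULL --> RCU_SYSIDLE_FULL_NOTED
 *
 * Each transition past _SHORT requires the system to have remained
 * idle for at least rcu_sysidle_delay() jiffies, the final transition
 * to _FULL_NOTED is made by rcu_sys_is_idle(), and a non-idle CPU
 * resets any state beyond RCU_SYSIDLE_SHORT back to RCU_SYSIDLE_NOT
 * via rcu_sysidle_cancel().
 */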

/*
 * Found a non-idle non-timekeeping CPU, so kick the system-idle state
 * back to the beginning.
 */
static void rcu_sysidle_cancel(void)
{
        smp_mb();  /* Order prior idleness scan before the state update. */
        if (full_sysidle_state > RCU_SYSIDLE_SHORT)
                ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT;
}

/*
 * Update the sysidle state based on the results of a force-quiescent-state
 * scan of the CPUs' dyntick-idle state.
 */
static void rcu_sysidle_report(struct rcu_state *rsp, int isidle,
                               unsigned long maxj, bool gpkt)
{
        if (rsp != rcu_state_p)
                return;  /* Wrong flavor, ignore. */
        if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL)
                return;  /* Running state machine from timekeeping CPU. */
        if (isidle)
                rcu_sysidle(maxj);    /* More idle! */
        else
                rcu_sysidle_cancel(); /* Idle is over. */
}

/*
 * Wrapper for rcu_sysidle_report() when called from the grace-period
 * kthread's context.
 */
static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
                                  unsigned long maxj)
{
        /* If there are no nohz_full= CPUs, no need to track this. */
        if (!tick_nohz_full_enabled())
                return;

        rcu_sysidle_report(rsp, isidle, maxj, true);
}

/* Callback and function for forcing an RCU grace period. */
struct rcu_sysidle_head {
        struct rcu_head rh;
        int inuse;
};

static void rcu_sysidle_cb(struct rcu_head *rhp)
{
        struct rcu_sysidle_head *rshp;

        /*
         * The following memory barrier is needed to replace the
         * memory barriers that would normally be in the memory
         * allocator.
         */
        smp_mb();  /* grace period precedes setting inuse. */

        rshp = container_of(rhp, struct rcu_sysidle_head, rh);
        ACCESS_ONCE(rshp->inuse) = 0;
}
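
/*
 * Illustrative sketch (not compiled): the ->inuse flag turns a single
 * static rcu_head into a reusable one-shot grace-period trigger.  The
 * generic claim/release pattern, with hypothetical names, is:
 */
#if 0
static struct rcu_sysidle_head one_shot;

static void maybe_force_gp(void)  /* hypothetical */
{
        /* Claim: only one CPU wins the 0 -> 1 transition. */
        if (!one_shot.inuse && xchg(&one_shot.inuse, 1) == 0)
                call_rcu(&one_shot.rh, rcu_sysidle_cb);
        /* Release happens in rcu_sysidle_cb() after the grace period. */
}
#endif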

/*
 * Check to see if the system is fully idle, other than the timekeeping CPU.
 * The caller must have disabled interrupts.  This is not intended to be
 * called unless tick_nohz_full_enabled().
 */
bool rcu_sys_is_idle(void)
{
        static struct rcu_sysidle_head rsh;
        int rss = ACCESS_ONCE(full_sysidle_state);

        if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu))
                return false;

        /* Handle small-system case by doing a full scan of CPUs. */
        if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) {
                int oldrss = rss - 1;

                /*
                 * One pass to advance to each state up to _FULL.
                 * Give up if any pass fails to advance the state.
                 */
                while (rss < RCU_SYSIDLE_FULL && oldrss < rss) {
                        int cpu;
                        bool isidle = true;
                        unsigned long maxj = jiffies - ULONG_MAX / 4;
                        struct rcu_data *rdp;

                        /* Scan all the CPUs looking for nonidle CPUs. */
                        for_each_possible_cpu(cpu) {
                                rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
                                rcu_sysidle_check_cpu(rdp, &isidle, &maxj);
                                if (!isidle)
                                        break;
                        }
                        rcu_sysidle_report(rcu_state_p, isidle, maxj, false);
                        oldrss = rss;
                        rss = ACCESS_ONCE(full_sysidle_state);
                }
        }

        /* If this is the first observation of an idle period, record it. */
        if (rss == RCU_SYSIDLE_FULL) {
                rss = cmpxchg(&full_sysidle_state,
                              RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED);
                return rss == RCU_SYSIDLE_FULL;
        }

        smp_mb(); /* ensure rss load happens before later caller actions. */

        /* If already fully idle, tell the caller (in case of races). */
        if (rss == RCU_SYSIDLE_FULL_NOTED)
                return true;

        /*
         * If we aren't there yet, and a grace period is not in flight,
         * initiate a grace period.  Either way, tell the caller that
         * we are not there yet.  We use an xchg() rather than an assignment
         * to make up for the memory barriers that would otherwise be
         * provided by the memory allocator.
         */
        if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL &&
            !rcu_gp_in_progress(rcu_state_p) &&
            !rsh.inuse && xchg(&rsh.inuse, 1) == 0)
                call_rcu(&rsh.rh, rcu_sysidle_cb);
        return false;
}
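
/*
 * Illustrative caller sketch (hypothetical, not compiled): the
 * timekeeping CPU is expected to poll rcu_sys_is_idle() with
 * interrupts disabled, for example from its tick handling:
 */
#if 0
static bool timekeeping_can_stop_tick(void)  /* hypothetical */
{
        bool sysidle;
        unsigned long flags;

        /* Must run on tick_do_timer_cpu with interrupts off. */
        local_irq_save(flags);
        sysidle = tick_nohz_full_enabled() && rcu_sys_is_idle();
        local_irq_restore(flags);
        return sysidle;  /* True iff all non-timekeeping CPUs are idle. */
}
#endif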

/*
 * Initialize dynticks sysidle state for CPUs coming online.
 */
static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
{
        rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE;
}

#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */

static void rcu_sysidle_enter(int irq)
{
}

static void rcu_sysidle_exit(int irq)
{
}

static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
                                  unsigned long *maxj)
{
}

static bool is_sysidle_rcu_state(struct rcu_state *rsp)
{
        return false;
}

static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle,
                                  unsigned long maxj)
{
}

static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp)
{
}

#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */

/*
 * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
 * grace-period kthread will do force_quiescent_state() processing?
 * The idea is to avoid waking up RCU core processing on such a
 * CPU unless the grace period has extended for too long.
 *
 * This code relies on the fact that all NO_HZ_FULL CPUs are also
 * CONFIG_RCU_NOCB_CPU CPUs.
 */
static bool rcu_nohz_full_cpu(struct rcu_state *rsp)
{
#ifdef CONFIG_NO_HZ_FULL
        if (tick_nohz_full_cpu(smp_processor_id()) &&
            (!rcu_gp_in_progress(rsp) ||
             ULONG_CMP_LT(jiffies, ACCESS_ONCE(rsp->gp_start) + HZ)))
                return true;
#endif /* #ifdef CONFIG_NO_HZ_FULL */
        return false;
}
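
/*
 * Note (illustrative): the "+ HZ" above gives a grace period one
 * second to complete on its own before RCU core processing starts
 * bothering a NO_HZ_FULL CPU, since HZ jiffies correspond to one
 * second regardless of the configured tick rate.
 */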

/*
 * Bind the grace-period kthread for the sysidle flavor of RCU to the
 * timekeeping CPU.
 */
static void rcu_bind_gp_kthread(void)
{
        int __maybe_unused cpu;

        if (!tick_nohz_full_enabled())
                return;
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
        cpu = tick_do_timer_cpu;
        if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
                set_cpus_allowed_ptr(current, cpumask_of(cpu));
#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
        if (!is_housekeeping_cpu(raw_smp_processor_id()))
                housekeeping_affine(current);
#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
}

/* Record the current task on dyntick-idle entry. */
static void rcu_dynticks_task_enter(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
        ACCESS_ONCE(current->rcu_tasks_idle_cpu) = smp_processor_id();
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}

/* Record no current task on dyntick-idle exit. */
static void rcu_dynticks_task_exit(void)
{
#if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL)
        ACCESS_ONCE(current->rcu_tasks_idle_cpu) = -1;
#endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */
}
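
/*
 * Note (illustrative): RCU-tasks consults ->rcu_tasks_idle_cpu when
 * scanning for holdout tasks; a non-negative value tells it that the
 * task is on a NO_HZ_FULL CPU in dyntick-idle state and therefore
 * need not block the RCU-tasks grace period.
 */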