rcu: Yet another fix for preemption and CPU hotplug
author Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Sun, 8 Mar 2015 21:52:27 +0000 (14:52 -0700)
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Fri, 20 Mar 2015 15:27:33 +0000 (08:27 -0700)
As noted earlier, the following sequence of events can occur when
running PREEMPT_RCU and HOTPLUG_CPU on a system with a multi-level
rcu_node combining tree:

1. A group of tasks block on CPUs corresponding to a given leaf
rcu_node structure while within RCU read-side critical sections.
2. All CPUs corresponding to that rcu_node structure go offline.
3. The next grace period starts, but because there are still tasks
blocked, the upper-level bits corresponding to this leaf rcu_node
structure remain set.
4. All the tasks exit their RCU read-side critical sections and
remove themselves from the leaf rcu_node structure's list,
leaving it empty.
5. But because there now is code to check for this condition at
force-quiescent-state time, the upper bits are cleared and the
grace period completes.

However, there is another complication that can occur following step 4 above:

4a. The grace period starts, and the leaf rcu_node structure's
gp_tasks pointer is set to NULL because there are no tasks
blocked on this structure.
4b. One of the CPUs corresponding to the leaf rcu_node structure
comes back online.
4c. An endless stream of tasks is preempted within RCU read-side
critical sections on this CPU, such that the ->blkd_tasks
list is always non-empty.

The grace period will never end.

This commit therefore makes the force-quiescent-state processing check only
for absence of tasks blocking the current grace period rather than absence
of tasks altogether.  This will cause a quiescent state to be reported if
the current leaf rcu_node structure is not blocking the current grace period
and its parent thinks that it is, regardless of how RCU managed to get
itself into this state.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org> # 4.0.x
Tested-by: Sasha Levin <sasha.levin@oracle.com>
kernel/rcu/tree.c

index 17b5abf999ca5efb4248d5df3986d164e99ffa39..b3684b2846770321420de82292e38fb51c0195e1 100644 (file)
@@ -2199,8 +2199,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
        unsigned long mask;
        struct rcu_node *rnp_p;
 
-       WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state);
-       if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+       if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
+           rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
                raw_spin_unlock_irqrestore(&rnp->lock, flags);
                return;  /* Still need more quiescent states! */
        }
@@ -2208,9 +2208,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
        rnp_p = rnp->parent;
        if (rnp_p == NULL) {
                /*
-                * Either there is only one rcu_node in the tree,
-                * or tasks were kicked up to root rcu_node due to
-                * CPUs going offline.
+                * Only one rcu_node structure in the tree, so don't
+                * try to report up to its nonexistent parent!
                 */
                rcu_report_qs_rsp(rsp, flags);
                return;
@@ -2713,8 +2712,29 @@ static void force_qs_rnp(struct rcu_state *rsp,
                        return;
                }
                if (rnp->qsmask == 0) {
-                       rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
-                       continue;
+                       if (rcu_state_p == &rcu_sched_state ||
+                           rsp != rcu_state_p ||
+                           rcu_preempt_blocked_readers_cgp(rnp)) {
+                               /*
+                                * No point in scanning bits because they
+                                * are all zero.  But we might need to
+                                * priority-boost blocked readers.
+                                */
+                               rcu_initiate_boost(rnp, flags);
+                               /* rcu_initiate_boost() releases rnp->lock */
+                               continue;
+                       }
+                       if (rnp->parent &&
+                           (rnp->parent->qsmask & rnp->grpmask)) {
+                               /*
+                                * Race between grace-period
+                                * initialization and task exiting RCU
+                                * read-side critical section: Report.
+                                */
+                               rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+                               /* rcu_report_unblock_qs_rnp() rlses ->lock */
+                               continue;
+                       }
                }
                cpu = rnp->grplo;
                bit = 1;
@@ -2729,15 +2749,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
                if (mask != 0) {
                        /* Idle/offline CPUs, report. */
                        rcu_report_qs_rnp(mask, rsp, rnp, flags);
-               } else if (rnp->parent &&
-                        list_empty(&rnp->blkd_tasks) &&
-                        !rnp->qsmask &&
-                        (rnp->parent->qsmask & rnp->grpmask)) {
-                       /*
-                        * Race between grace-period initialization and task
-                        * existing RCU read-side critical section, report.
-                        */
-                       rcu_report_unblock_qs_rnp(rsp, rnp, flags);
                } else {
                        /* Nothing to do here, so just drop the lock. */
                        raw_spin_unlock_irqrestore(&rnp->lock, flags);