Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--	kernel/rcutree.c	493
1 files changed, 342 insertions, 151 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 705f02ac743..53ae9598f79 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,20 +46,24 @@  #include <linux/cpu.h>  #include <linux/mutex.h>  #include <linux/time.h> +#include <linux/kernel_stat.h>  #include "rcutree.h"  /* Data structures. */ +static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; +  #define RCU_STATE_INITIALIZER(name) { \  	.level = { &name.node[0] }, \  	.levelcnt = { \  		NUM_RCU_LVL_0,  /* root of hierarchy. */ \  		NUM_RCU_LVL_1, \  		NUM_RCU_LVL_2, \ -		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \ +		NUM_RCU_LVL_3, \ +		NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \  	}, \ -	.signaled = RCU_SIGNAL_INIT, \ +	.signaled = RCU_GP_IDLE, \  	.gpnum = -300, \  	.completed = -300, \  	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ @@ -77,6 +81,8 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);  struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);  DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +static int rcu_scheduler_active __read_mostly; +  /*   * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s @@ -98,7 +104,7 @@ void rcu_sched_qs(int cpu)  	struct rcu_data *rdp;  	rdp = &per_cpu(rcu_sched_data, cpu); -	rdp->passed_quiesc_completed = rdp->completed; +	rdp->passed_quiesc_completed = rdp->gpnum - 1;  	barrier();  	rdp->passed_quiesc = 1;  	rcu_preempt_note_context_switch(cpu); @@ -109,7 +115,7 @@ void rcu_bh_qs(int cpu)  	struct rcu_data *rdp;  	rdp = &per_cpu(rcu_bh_data, cpu); -	rdp->passed_quiesc_completed = rdp->completed; +	rdp->passed_quiesc_completed = rdp->gpnum - 1;  	barrier();  	rdp->passed_quiesc = 1;  } @@ -335,28 +341,9 @@ void rcu_irq_exit(void)  		set_need_resched();  } -/* - * Record the specified "completed" value, which is later used to validate - * dynticks counter manipulations.  Specify "rsp->completed - 1" to - * unconditionally invalidate any future dynticks manipulations (which is - * useful at the beginning of a grace period). - */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ -	rsp->dynticks_completed = comp; -} -  #ifdef CONFIG_SMP  /* - * Recall the previously recorded value of the completion for dynticks. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ -	return rsp->dynticks_completed; -} - -/*   * Snapshot the specified CPU's dynticks counter so that we can later   * credit them with an implicit quiescent state.  Return 1 if this CPU   * is in dynticks idle mode, which is an extended quiescent state. @@ -419,24 +406,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)  #else /* #ifdef CONFIG_NO_HZ */ -static void dyntick_record_completed(struct rcu_state *rsp, long comp) -{ -} -  #ifdef CONFIG_SMP -/* - * If there are no dynticks, then the only way that a CPU can passively - * be in a quiescent state is to be offline.  Unlike dynticks idle, which - * is a point in time during the prior (already finished) grace period, - * an offline CPU is always in a quiescent state, and thus can be - * unconditionally applied.  So just return the current value of completed. - */ -static long dyntick_recall_completed(struct rcu_state *rsp) -{ -	return rsp->completed; -} -  static int dyntick_save_progress_counter(struct rcu_data *rdp)  {  	return 0; @@ -553,13 +524,33 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)  /*   * Update CPU-local rcu_data state to record the newly noticed grace period.   
* This is used both when we started the grace period and when we notice - * that someone else started the grace period. + * that someone else started the grace period.  The caller must hold the + * ->lock of the leaf rcu_node structure corresponding to the current CPU, + *  and must have irqs disabled.   */ +static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ +	if (rdp->gpnum != rnp->gpnum) { +		rdp->qs_pending = 1; +		rdp->passed_quiesc = 0; +		rdp->gpnum = rnp->gpnum; +	} +} +  static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp)  { -	rdp->qs_pending = 1; -	rdp->passed_quiesc = 0; -	rdp->gpnum = rsp->gpnum; +	unsigned long flags; +	struct rcu_node *rnp; + +	local_irq_save(flags); +	rnp = rdp->mynode; +	if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ +	    !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ +		local_irq_restore(flags); +		return; +	} +	__note_new_gpnum(rsp, rnp, rdp); +	spin_unlock_irqrestore(&rnp->lock, flags);  }  /* @@ -583,6 +574,79 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)  }  /* + * Advance this CPU's callbacks, but only if the current grace period + * has ended.  This may be called only from the CPU to whom the rdp + * belongs.  In addition, the corresponding leaf rcu_node structure's + * ->lock must be held by the caller, with irqs disabled. + */ +static void +__rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ +	/* Did another grace period end? */ +	if (rdp->completed != rnp->completed) { + +		/* Advance callbacks.  No harm if list empty. */ +		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; +		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; +		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + +		/* Remember that we saw this grace-period completion. */ +		rdp->completed = rnp->completed; +	} +} + +/* + * Advance this CPU's callbacks, but only if the current grace period + * has ended.  This may be called only from the CPU to whom the rdp + * belongs. + */ +static void +rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) +{ +	unsigned long flags; +	struct rcu_node *rnp; + +	local_irq_save(flags); +	rnp = rdp->mynode; +	if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ +	    !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ +		local_irq_restore(flags); +		return; +	} +	__rcu_process_gp_end(rsp, rnp, rdp); +	spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Do per-CPU grace-period initialization for running CPU.  The caller + * must hold the lock of the leaf rcu_node structure corresponding to + * this CPU. + */ +static void +rcu_start_gp_per_cpu(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) +{ +	/* Prior grace period ended, so advance callbacks for current CPU. */ +	__rcu_process_gp_end(rsp, rnp, rdp); + +	/* +	 * Because this CPU just now started the new grace period, we know +	 * that all of its callbacks will be covered by this upcoming grace +	 * period, even the ones that were registered arbitrarily recently. +	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. +	 * +	 * Other CPUs cannot be sure exactly when the grace period started. +	 * Therefore, their recently registered callbacks must pass through +	 * an additional RCU_NEXT_READY stage, so that they will be handled +	 * by the next RCU grace period. 
+	 */ +	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; +	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; + +	/* Set state so that this CPU will detect the next quiescent state. */ +	__note_new_gpnum(rsp, rnp, rdp); +} + +/*   * Start a new RCU grace period if warranted, re-initializing the hierarchy   * in preparation for detecting the next grace period.  The caller must hold   * the root node's ->lock, which is released before return.  Hard irqs must @@ -596,7 +660,23 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)  	struct rcu_node *rnp = rcu_get_root(rsp);  	if (!cpu_needs_another_gp(rsp, rdp)) { -		spin_unlock_irqrestore(&rnp->lock, flags); +		if (rnp->completed == rsp->completed) { +			spin_unlock_irqrestore(&rnp->lock, flags); +			return; +		} +		spin_unlock(&rnp->lock);	 /* irqs remain disabled. */ + +		/* +		 * Propagate new ->completed value to rcu_node structures +		 * so that other CPUs don't have to wait until the start +		 * of the next grace period to process their callbacks. +		 */ +		rcu_for_each_node_breadth_first(rsp, rnp) { +			spin_lock(&rnp->lock);	 /* irqs already disabled. */ +			rnp->completed = rsp->completed; +			spin_unlock(&rnp->lock); /* irqs remain disabled. */ +		} +		local_irq_restore(flags);  		return;  	} @@ -606,29 +686,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)  	rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */  	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;  	record_gp_stall_check_time(rsp); -	dyntick_record_completed(rsp, rsp->completed - 1); -	note_new_gpnum(rsp, rdp); - -	/* -	 * Because this CPU just now started the new grace period, we know -	 * that all of its callbacks will be covered by this upcoming grace -	 * period, even the ones that were registered arbitrarily recently. -	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL. -	 * -	 * Other CPUs cannot be sure exactly when the grace period started. -	 * Therefore, their recently registered callbacks must pass through -	 * an additional RCU_NEXT_READY stage, so that they will be handled -	 * by the next RCU grace period. -	 */ -	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; -	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];  	/* Special-case the common single-level case. */  	if (NUM_RCU_NODES == 1) {  		rcu_preempt_check_blocked_tasks(rnp);  		rnp->qsmask = rnp->qsmaskinit;  		rnp->gpnum = rsp->gpnum; +		rnp->completed = rsp->completed;  		rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ +		rcu_start_gp_per_cpu(rsp, rnp, rdp);  		spin_unlock_irqrestore(&rnp->lock, flags);  		return;  	} @@ -657,69 +723,50 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)  	 * irqs disabled.  	 */  	rcu_for_each_node_breadth_first(rsp, rnp) { -		spin_lock(&rnp->lock);	/* irqs already disabled. */ +		spin_lock(&rnp->lock);		/* irqs already disabled. */  		rcu_preempt_check_blocked_tasks(rnp);  		rnp->qsmask = rnp->qsmaskinit;  		rnp->gpnum = rsp->gpnum; -		spin_unlock(&rnp->lock);	/* irqs already disabled. */ +		rnp->completed = rsp->completed; +		if (rnp == rdp->mynode) +			rcu_start_gp_per_cpu(rsp, rnp, rdp); +		spin_unlock(&rnp->lock);	/* irqs remain disabled. */  	} +	rnp = rcu_get_root(rsp); +	spin_lock(&rnp->lock);			/* irqs already disabled. */  	rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ +	spin_unlock(&rnp->lock);		/* irqs remain disabled. 
*/  	spin_unlock_irqrestore(&rsp->onofflock, flags);  }  /* - * Advance this CPU's callbacks, but only if the current grace period - * has ended.  This may be called only from the CPU to whom the rdp - * belongs. - */ -static void -rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) -{ -	long completed_snap; -	unsigned long flags; - -	local_irq_save(flags); -	completed_snap = ACCESS_ONCE(rsp->completed);  /* outside of lock. */ - -	/* Did another grace period end? */ -	if (rdp->completed != completed_snap) { - -		/* Advance callbacks.  No harm if list empty. */ -		rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL]; -		rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL]; -		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - -		/* Remember that we saw this grace-period completion. */ -		rdp->completed = completed_snap; -	} -	local_irq_restore(flags); -} - -/* - * Clean up after the prior grace period and let rcu_start_gp() start up - * the next grace period if one is needed.  Note that the caller must - * hold rnp->lock, as required by rcu_start_gp(), which will release it. + * Report a full set of quiescent states to the specified rcu_state + * data structure.  This involves cleaning up after the prior grace + * period and letting rcu_start_gp() start up the next grace period + * if one is needed.  Note that the caller must hold rnp->lock, as + * required by rcu_start_gp(), which will release it.   */ -static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags) +static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)  	__releases(rcu_get_root(rsp)->lock)  {  	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));  	rsp->completed = rsp->gpnum; -	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]); +	rsp->signaled = RCU_GP_IDLE;  	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */  }  /* - * Similar to cpu_quiet(), for which it is a helper function.  Allows - * a group of CPUs to be quieted at one go, though all the CPUs in the - * group must be represented by the same leaf rcu_node structure. - * That structure's lock must be held upon entry, and it is released - * before return. + * Similar to rcu_report_qs_rdp(), for which it is a helper function. + * Allows quiescent states for a group of CPUs to be reported at one go + * to the specified rcu_node structure, though all the CPUs in the group + * must be represented by the same rcu_node structure (which need not be + * a leaf rcu_node structure, though it often will be).  That structure's + * lock must be held upon entry, and it is released before return.   */  static void -cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp, -	      unsigned long flags) +rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, +		  struct rcu_node *rnp, unsigned long flags)  	__releases(rnp->lock)  {  	struct rcu_node *rnp_c; @@ -755,21 +802,23 @@ cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp, struct rcu_node *rnp,  	/*  	 * Get here if we are the last CPU to pass through a quiescent -	 * state for this grace period.  Invoke cpu_quiet_msk_finish() +	 * state for this grace period.  Invoke rcu_report_qs_rsp()  	 * to clean up and start the next grace period if one is needed.  	 */ -	cpu_quiet_msk_finish(rsp, flags); /* releases rnp->lock. */ +	rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */  }  /* - * Record a quiescent state for the specified CPU, which must either be - * the current CPU.  
The lastcomp argument is used to make sure we are - * still in the grace period of interest.  We don't want to end the current - * grace period based on quiescent states detected in an earlier grace - * period! + * Record a quiescent state for the specified CPU to that CPU's rcu_data + * structure.  This must be either called from the specified CPU, or + * called when the specified CPU is known to be offline (and when it is + * also known that no other CPU is concurrently trying to help the offline + * CPU).  The lastcomp argument is used to make sure we are still in the + * grace period of interest.  We don't want to end the current grace period + * based on quiescent states detected in an earlier grace period!   */  static void -cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) +rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)  {  	unsigned long flags;  	unsigned long mask; @@ -777,15 +826,15 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)  	rnp = rdp->mynode;  	spin_lock_irqsave(&rnp->lock, flags); -	if (lastcomp != ACCESS_ONCE(rsp->completed)) { +	if (lastcomp != rnp->completed) {  		/*  		 * Someone beat us to it for this grace period, so leave.  		 * The race with GP start is resolved by the fact that we  		 * hold the leaf rcu_node lock, so that the per-CPU bits  		 * cannot yet be initialized -- so we would simply find our -		 * CPU's bit already cleared in cpu_quiet_msk() if this race -		 * occurred. +		 * CPU's bit already cleared in rcu_report_qs_rnp() if this +		 * race occurred.  		 */  		rdp->passed_quiesc = 0;	/* try again later! */  		spin_unlock_irqrestore(&rnp->lock, flags); @@ -803,7 +852,7 @@ cpu_quiet(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp)  		 */  		rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; -		cpu_quiet_msk(mask, rsp, rnp, flags); /* releases rnp->lock */ +		rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */  	}  } @@ -834,8 +883,11 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)  	if (!rdp->passed_quiesc)  		return; -	/* Tell RCU we are done (but cpu_quiet() will be the judge of that). */ -	cpu_quiet(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); +	/* +	 * Tell RCU we are done (but rcu_report_qs_rdp() will be the +	 * judge of that). +	 */ +	rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed);  }  #ifdef CONFIG_HOTPLUG_CPU @@ -895,8 +947,8 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)  static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)  {  	unsigned long flags; -	long lastcomp;  	unsigned long mask; +	int need_report = 0;  	struct rcu_data *rdp = rsp->rda[cpu];  	struct rcu_node *rnp; @@ -910,17 +962,32 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)  		spin_lock(&rnp->lock);		/* irqs already disabled. */  		rnp->qsmaskinit &= ~mask;  		if (rnp->qsmaskinit != 0) { -			spin_unlock(&rnp->lock); /* irqs remain disabled. */ +			if (rnp != rdp->mynode) +				spin_unlock(&rnp->lock); /* irqs remain disabled. */  			break;  		} -		rcu_preempt_offline_tasks(rsp, rnp, rdp); +		if (rnp == rdp->mynode) +			need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); +		else +			spin_unlock(&rnp->lock); /* irqs remain disabled. */  		mask = rnp->grpmask; -		spin_unlock(&rnp->lock);	/* irqs remain disabled. 
*/  		rnp = rnp->parent;  	} while (rnp != NULL); -	lastcomp = rsp->completed; -	spin_unlock_irqrestore(&rsp->onofflock, flags); +	/* +	 * We still hold the leaf rcu_node structure lock here, and +	 * irqs are still disabled.  The reason for this subterfuge is +	 * because invoking rcu_report_unblock_qs_rnp() with ->onofflock +	 * held leads to deadlock. +	 */ +	spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +	rnp = rdp->mynode; +	if (need_report & RCU_OFL_TASKS_NORM_GP) +		rcu_report_unblock_qs_rnp(rnp, flags); +	else +		spin_unlock_irqrestore(&rnp->lock, flags); +	if (need_report & RCU_OFL_TASKS_EXP_GP) +		rcu_report_exp_rnp(rsp, rnp);  	rcu_adopt_orphan_cbs(rsp);  } @@ -958,7 +1025,7 @@ static void rcu_offline_cpu(int cpu)   * Invoke any RCU callbacks that have made it to the end of their grace   * period.  Thottle as specified by rdp->blimit.   */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)  {  	unsigned long flags;  	struct rcu_head *next, *list, **tail; @@ -1011,6 +1078,13 @@ static void rcu_do_batch(struct rcu_data *rdp)  	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)  		rdp->blimit = blimit; +	/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ +	if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { +		rdp->qlen_last_fqs_check = 0; +		rdp->n_force_qs_snap = rsp->n_force_qs; +	} else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) +		rdp->qlen_last_fqs_check = rdp->qlen; +  	local_irq_restore(flags);  	/* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1085,7 +1159,7 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,  	rcu_for_each_leaf_node(rsp, rnp) {  		mask = 0;  		spin_lock_irqsave(&rnp->lock, flags); -		if (rsp->completed != lastcomp) { +		if (rnp->completed != lastcomp) {  			spin_unlock_irqrestore(&rnp->lock, flags);  			return 1;  		} @@ -1099,10 +1173,10 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,  			if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))  				mask |= bit;  		} -		if (mask != 0 && rsp->completed == lastcomp) { +		if (mask != 0 && rnp->completed == lastcomp) { -			/* cpu_quiet_msk() releases rnp->lock. */ -			cpu_quiet_msk(mask, rsp, rnp, flags); +			/* rcu_report_qs_rnp() releases rnp->lock. */ +			rcu_report_qs_rnp(mask, rsp, rnp, flags);  			continue;  		}  		spin_unlock_irqrestore(&rnp->lock, flags); @@ -1120,6 +1194,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)  	long lastcomp;  	struct rcu_node *rnp = rcu_get_root(rsp);  	u8 signaled; +	u8 forcenow;  	if (!rcu_gp_in_progress(rsp))  		return;  /* No grace period in progress, nothing to force. */ @@ -1132,19 +1207,20 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)  		goto unlock_ret; /* no emergency and done recently. */  	rsp->n_force_qs++;  	spin_lock(&rnp->lock); -	lastcomp = rsp->completed; +	lastcomp = rsp->gpnum - 1;  	signaled = rsp->signaled;  	rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; -	if (lastcomp == rsp->gpnum) { +	if(!rcu_gp_in_progress(rsp)) {  		rsp->n_force_qs_ngp++;  		spin_unlock(&rnp->lock);  		goto unlock_ret;  /* no GP in progress, time updated. */  	}  	spin_unlock(&rnp->lock);  	switch (signaled) { +	case RCU_GP_IDLE:  	case RCU_GP_INIT: -		break; /* grace period still initializing, ignore. */ +		break; /* grace period idle or initializing, ignore. 
*/  	case RCU_SAVE_DYNTICK: @@ -1155,20 +1231,29 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)  		if (rcu_process_dyntick(rsp, lastcomp,  					dyntick_save_progress_counter))  			goto unlock_ret; +		/* fall into next case. */ + +	case RCU_SAVE_COMPLETED:  		/* Update state, record completion counter. */ +		forcenow = 0;  		spin_lock(&rnp->lock); -		if (lastcomp == rsp->completed) { +		if (lastcomp + 1 == rsp->gpnum && +		    lastcomp == rsp->completed && +		    rsp->signaled == signaled) {  			rsp->signaled = RCU_FORCE_QS; -			dyntick_record_completed(rsp, lastcomp); +			rsp->completed_fqs = lastcomp; +			forcenow = signaled == RCU_SAVE_COMPLETED;  		}  		spin_unlock(&rnp->lock); -		break; +		if (!forcenow) +			break; +		/* fall into next case. */  	case RCU_FORCE_QS:  		/* Check dyntick-idle state, send IPI to laggarts. */ -		if (rcu_process_dyntick(rsp, dyntick_recall_completed(rsp), +		if (rcu_process_dyntick(rsp, rsp->completed_fqs,  					rcu_implicit_dynticks_qs))  			goto unlock_ret; @@ -1224,7 +1309,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)  	}  	/* If there are callbacks ready, invoke them. */ -	rcu_do_batch(rdp); +	rcu_do_batch(rsp, rdp);  }  /* @@ -1288,10 +1373,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),  		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */  	} -	/* Force the grace period if too many callbacks or too long waiting. */ -	if (unlikely(++rdp->qlen > qhimark)) { +	/* +	 * Force the grace period if too many callbacks or too long waiting. +	 * Enforce hysteresis, and don't invoke force_quiescent_state() +	 * if some other CPU has recently done so.  Also, don't bother +	 * invoking force_quiescent_state() if the newly enqueued callback +	 * is the only one waiting for a grace period to complete. +	 */ +	if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {  		rdp->blimit = LONG_MAX; -		force_quiescent_state(rsp, 0); +		if (rsp->n_force_qs == rdp->n_force_qs_snap && +		    *rdp->nxttail[RCU_DONE_TAIL] != head) +			force_quiescent_state(rsp, 0); +		rdp->n_force_qs_snap = rsp->n_force_qs; +		rdp->qlen_last_fqs_check = rdp->qlen;  	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)  		force_quiescent_state(rsp, 1);  	local_irq_restore(flags); @@ -1315,6 +1410,68 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))  }  EXPORT_SYMBOL_GPL(call_rcu_bh); +/** + * synchronize_sched - wait until an rcu-sched grace period has elapsed. + * + * Control will return to the caller some time after a full rcu-sched + * grace period has elapsed, in other words after all currently executing + * rcu-sched read-side critical sections have completed.   These read-side + * critical sections are delimited by rcu_read_lock_sched() and + * rcu_read_unlock_sched(), and may be nested.  Note that preempt_disable(), + * local_irq_disable(), and so on may be used in place of + * rcu_read_lock_sched(). + * + * This means that all preempt_disable code sequences, including NMI and + * hardware-interrupt handlers, in progress on entry will have completed + * before this primitive returns.  However, this does not guarantee that + * softirq handlers will have completed, since in some kernels, these + * handlers can run in process context, and can block. + * + * This primitive provides the guarantees made by the (now removed) + * synchronize_kernel() API.  
In contrast, synchronize_rcu() only + * guarantees that rcu_read_lock() sections will have completed. + * In "classic RCU", these two guarantees happen to be one and + * the same, but can differ in realtime RCU implementations. + */ +void synchronize_sched(void) +{ +	struct rcu_synchronize rcu; + +	if (rcu_blocking_is_gp()) +		return; + +	init_completion(&rcu.completion); +	/* Will wake me after RCU finished. */ +	call_rcu_sched(&rcu.head, wakeme_after_rcu); +	/* Wait for it. */ +	wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_sched); + +/** + * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. + * + * Control will return to the caller some time after a full rcu_bh grace + * period has elapsed, in other words after all currently executing rcu_bh + * read-side critical sections have completed.  RCU read-side critical + * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), + * and may be nested. + */ +void synchronize_rcu_bh(void) +{ +	struct rcu_synchronize rcu; + +	if (rcu_blocking_is_gp()) +		return; + +	init_completion(&rcu.completion); +	/* Will wake me after RCU finished. */ +	call_rcu_bh(&rcu.head, wakeme_after_rcu); +	/* Wait for it. */ +	wait_for_completion(&rcu.completion); +} +EXPORT_SYMBOL_GPL(synchronize_rcu_bh); +  /*   * Check to see if there is any immediate RCU-related work to be done   * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -1324,6 +1481,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);   */  static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)  { +	struct rcu_node *rnp = rdp->mynode; +  	rdp->n_rcu_pending++;  	/* Check for CPU stalls, if enabled. */ @@ -1348,13 +1507,13 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)  	}  	/* Has another RCU grace period completed?  */ -	if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ +	if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */  		rdp->n_rp_gp_completed++;  		return 1;  	}  	/* Has a new RCU grace period started? */ -	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ +	if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */  		rdp->n_rp_gp_started++;  		return 1;  	} @@ -1397,6 +1556,21 @@ int rcu_needs_cpu(int cpu)  	       rcu_preempt_needs_cpu(cpu);  } +/* + * This function is invoked towards the end of the scheduler's initialization + * process.  Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system).  After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. + */ +void rcu_scheduler_starting(void) +{ +	WARN_ON(num_online_cpus() != 1); +	WARN_ON(nr_context_switches() > 0); +	rcu_scheduler_active = 1; +} +  static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};  static atomic_t rcu_barrier_cpu_count;  static DEFINE_MUTEX(rcu_barrier_mutex); @@ -1508,21 +1682,18 @@ static void __cpuinit  rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)  {  	unsigned long flags; -	long lastcomp;  	unsigned long mask;  	struct rcu_data *rdp = rsp->rda[cpu];  	struct rcu_node *rnp = rcu_get_root(rsp);  	/* Set up local state, ensuring consistent view of global state. 
*/  	spin_lock_irqsave(&rnp->lock, flags); -	lastcomp = rsp->completed; -	rdp->completed = lastcomp; -	rdp->gpnum = lastcomp;  	rdp->passed_quiesc = 0;  /* We could be racing with new GP, */  	rdp->qs_pending = 1;	 /*  so set up to respond to current GP. */  	rdp->beenonline = 1;	 /* We have now been online. */  	rdp->preemptable = preemptable; -	rdp->passed_quiesc_completed = lastcomp - 1; +	rdp->qlen_last_fqs_check = 0; +	rdp->n_force_qs_snap = rsp->n_force_qs;  	rdp->blimit = blimit;  	spin_unlock(&rnp->lock);		/* irqs remain disabled. */ @@ -1542,6 +1713,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)  		spin_lock(&rnp->lock);	/* irqs already disabled. */  		rnp->qsmaskinit |= mask;  		mask = rnp->grpmask; +		if (rnp == rdp->mynode) { +			rdp->gpnum = rnp->completed; /* if GP in progress... */ +			rdp->completed = rnp->completed; +			rdp->passed_quiesc_completed = rnp->completed - 1; +		}  		spin_unlock(&rnp->lock); /* irqs already disabled. */  		rnp = rnp->parent;  	} while (rnp != NULL && !(rnp->qsmaskinit & mask)); @@ -1559,8 +1735,8 @@ static void __cpuinit rcu_online_cpu(int cpu)  /*   * Handle CPU online/offline notification events.   */ -int __cpuinit rcu_cpu_notify(struct notifier_block *self, -			     unsigned long action, void *hcpu) +static int __cpuinit rcu_cpu_notify(struct notifier_block *self, +				    unsigned long action, void *hcpu)  {  	long cpu = (long)hcpu; @@ -1647,8 +1823,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)  		cpustride *= rsp->levelspread[i];  		rnp = rsp->level[i];  		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { -			if (rnp != rcu_get_root(rsp)) -				spin_lock_init(&rnp->lock); +			spin_lock_init(&rnp->lock); +			lockdep_set_class(&rnp->lock, &rcu_node_class[i]);  			rnp->gpnum = 0;  			rnp->qsmask = 0;  			rnp->qsmaskinit = 0; @@ -1669,9 +1845,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)  			rnp->level = i;  			INIT_LIST_HEAD(&rnp->blocked_tasks[0]);  			INIT_LIST_HEAD(&rnp->blocked_tasks[1]); +			INIT_LIST_HEAD(&rnp->blocked_tasks[2]); +			INIT_LIST_HEAD(&rnp->blocked_tasks[3]);  		}  	} -	spin_lock_init(&rcu_get_root(rsp)->lock);  }  /* @@ -1697,16 +1874,30 @@ do { \  	} \  } while (0) -void __init __rcu_init(void) +void __init rcu_init(void)  { +	int i; +  	rcu_bootup_announce();  #ifdef CONFIG_RCU_CPU_STALL_DETECTOR  	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");  #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#if NUM_RCU_LVL_4 != 0 +	printk(KERN_INFO "Experimental four-level hierarchy is enabled.\n"); +#endif /* #if NUM_RCU_LVL_4 != 0 */  	RCU_INIT_FLAVOR(&rcu_sched_state, rcu_sched_data);  	RCU_INIT_FLAVOR(&rcu_bh_state, rcu_bh_data);  	__rcu_init_preempt();  	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); + +	/* +	 * We don't need protection against CPU-hotplug here because +	 * this is called early in boot, before either interrupts +	 * or the scheduler are operational. +	 */ +	cpu_notifier(rcu_cpu_notify, 0); +	for_each_online_cpu(i) +		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i);  }  #include "rcutree_plugin.h"  |
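
The callback handling in this patch (__rcu_process_gp_end(), rcu_start_gp_per_cpu(), rcu_do_batch()) never moves list elements; it only shuffles the four nxttail[] segment pointers of each CPU's singly linked callback list. Below is a minimal userspace sketch of that segmented list, intended only to illustrate the pointer shuffling. The cblist type and the cblist_*() helpers are invented for the illustration, and the sketch omits the locking, the per-CPU layout, and the acceleration by which rcu_start_gp_per_cpu() and rcu_report_qs_rdp() move RCU_NEXT_TAIL callbacks directly into the WAIT or NEXT_READY segments when that is known to be safe.

#include <stdio.h>
#include <stddef.h>

/* Segment indices, mirroring RCU_DONE_TAIL..RCU_NEXT_TAIL in the patch. */
enum { SEG_DONE, SEG_WAIT, SEG_NEXT_READY, SEG_NEXT, NSEGS };

struct cb {
	struct cb *next;
	const char *name;
};

/* Simplified per-CPU callback list: one head and NSEGS tail pointers. */
struct cblist {
	struct cb *head;
	struct cb **tail[NSEGS];
};

static void cblist_init(struct cblist *cl)
{
	int i;

	cl->head = NULL;
	for (i = 0; i < NSEGS; i++)
		cl->tail[i] = &cl->head;
}

/* New callbacks land in the NEXT segment, as in __call_rcu(). */
static void cblist_enqueue(struct cblist *cl, struct cb *cb)
{
	cb->next = NULL;
	*cl->tail[SEG_NEXT] = cb;
	cl->tail[SEG_NEXT] = &cb->next;
}

/*
 * Model of __rcu_process_gp_end(): a grace period has completed, so each
 * segment absorbs the one after it and waiting callbacks move toward DONE.
 */
static void cblist_advance(struct cblist *cl)
{
	cl->tail[SEG_DONE] = cl->tail[SEG_WAIT];
	cl->tail[SEG_WAIT] = cl->tail[SEG_NEXT_READY];
	cl->tail[SEG_NEXT_READY] = cl->tail[SEG_NEXT];
}

/* Detach and invoke the DONE segment, roughly as rcu_do_batch() does. */
static void cblist_invoke_done(struct cblist *cl)
{
	struct cb **donetail = cl->tail[SEG_DONE];
	struct cb *cb = cl->head;
	struct cb *next;
	int i;

	if (donetail == &cl->head)
		return;			/* DONE segment is empty. */

	cl->head = *donetail;		/* detach callbacks up to donetail */
	*donetail = NULL;
	for (i = 0; i < NSEGS; i++)	/* tails at the detached boundary */
		if (cl->tail[i] == donetail)
			cl->tail[i] = &cl->head;

	for (; cb; cb = next) {
		next = cb->next;
		printf("invoking callback %s\n", cb->name);
	}
}

int main(void)
{
	struct cb a = { .next = NULL, .name = "a" };
	struct cblist cl;

	cblist_init(&cl);
	cblist_enqueue(&cl, &a);	/* a sits in NEXT */
	cblist_advance(&cl);		/* a: NEXT -> NEXT_READY */
	cblist_advance(&cl);		/* a: NEXT_READY -> WAIT */
	cblist_advance(&cl);		/* a: WAIT -> DONE */
	cblist_invoke_done(&cl);	/* prints "invoking callback a" */
	return 0;
}

Each cblist_advance() models one observed grace-period completion: every segment absorbs the one that follows it, so a callback ratchets from NEXT toward DONE and is invoked only after a full grace period has elapsed on its behalf. In the kernel the intermediate steps are often skipped by the accelerations noted above.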
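
The kernel-doc added for synchronize_sched() and synchronize_rcu_bh() states the guarantee from the caller's side: the call returns only after every read-side critical section of the corresponding flavor that was already in progress has completed. The fragment below is a hedged sketch of the usual update pattern built on that guarantee, using only long-standing RCU primitives; struct my_conf, cur_conf, and the my_*() functions are invented for the example, and updaters are assumed to be serialized by the caller.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

/* Illustrative only: a small object published through an RCU-protected pointer. */
struct my_conf {
	int threshold;
	struct rcu_head rcu;		/* used by the call_rcu_sched() variant */
};

static struct my_conf *cur_conf;	/* updates serialized by the caller */

/* Reader side: disabling preemption delimits an rcu-sched critical section. */
static int my_over_threshold(int value)
{
	struct my_conf *c;
	int ret = 0;

	rcu_read_lock_sched();
	c = rcu_dereference(cur_conf);
	if (c)
		ret = value > c->threshold;
	rcu_read_unlock_sched();
	return ret;
}

/* Synchronous update: publish, wait for pre-existing readers, then free. */
static void my_replace_conf_sync(struct my_conf *newc)
{
	struct my_conf *old = cur_conf;

	rcu_assign_pointer(cur_conf, newc);
	synchronize_sched();	/* all readers that could see 'old' are done */
	kfree(old);
}

/* Asynchronous update: let a grace-period callback do the freeing. */
static void my_conf_reclaim(struct rcu_head *head)
{
	kfree(container_of(head, struct my_conf, rcu));
}

static void my_replace_conf_async(struct my_conf *newc)
{
	struct my_conf *old = cur_conf;

	rcu_assign_pointer(cur_conf, newc);
	if (old)
		call_rcu_sched(&old->rcu, my_conf_reclaim);
}

The synchronous variant sleeps in wait_for_completion() via the machinery shown in the patch, so contexts that cannot block should use the call_rcu_sched() form instead.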