Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--	kernel/rcutree.c	491
1 file changed, 316 insertions(+), 175 deletions(-)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 3b0f1337f75..f280e542e3e 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -60,36 +60,44 @@
 
 /* Data structures. */
 
-static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
+static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
 
-#define RCU_STATE_INITIALIZER(structname) { \
-	.level = { &structname##_state.node[0] }, \
-	.levelcnt = { \
-		NUM_RCU_LVL_0,  /* root of hierarchy. */ \
-		NUM_RCU_LVL_1, \
-		NUM_RCU_LVL_2, \
-		NUM_RCU_LVL_3, \
-		NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
-	}, \
+#define RCU_STATE_INITIALIZER(sname, cr) { \
+	.level = { &sname##_state.node[0] }, \
+	.call = cr, \
 	.fqs_state = RCU_GP_IDLE, \
 	.gpnum = -300, \
 	.completed = -300, \
-	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
-	.orphan_nxttail = &structname##_state.orphan_nxtlist, \
-	.orphan_donetail = &structname##_state.orphan_donelist, \
-	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
-	.n_force_qs = 0, \
-	.n_force_qs_ngp = 0, \
-	.name = #structname, \
+	.onofflock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.onofflock), \
+	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
+	.orphan_donetail = &sname##_state.orphan_donelist, \
+	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
+	.fqslock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.fqslock), \
+	.name = #sname, \
 }
 
-struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
+struct rcu_state rcu_sched_state =
+	RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched);
 DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, call_rcu_bh);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 static struct rcu_state *rcu_state;
+LIST_HEAD(rcu_struct_flavors);
+
+/* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */
+static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF;
+module_param(rcu_fanout_leaf, int, 0);
+int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
+static int num_rcu_lvl[] = {  /* Number of rcu_nodes at specified level. */
+	NUM_RCU_LVL_0,
+	NUM_RCU_LVL_1,
+	NUM_RCU_LVL_2,
+	NUM_RCU_LVL_3,
+	NUM_RCU_LVL_4,
+};
+int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 
 /*
  * The rcu_scheduler_active variable transitions from zero to one just
@@ -147,13 +155,6 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
 
-/* State information for rcu_barrier() and friends. */
-
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
-
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
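The new rcu_struct_flavors list introduced above is what lets the later hunks collapse per-flavor boilerplate into loops. The for_each_rcu_flavor() iterator itself lives in rcutree.h and is not part of this diff; the stand-alone, user-space toy below only sketches the same register-and-iterate pattern (hand-rolled singly linked list instead of the kernel's list_head).

#include <stdio.h>

/* Toy version of the pattern: each flavor registers itself on a global
 * list at init time, and common code iterates instead of naming flavors. */
struct flavor {
	const char *name;
	struct flavor *next;
};

static struct flavor *flavor_list;

#define for_each_flavor(f) \
	for ((f) = flavor_list; (f) != NULL; (f) = (f)->next)

static void register_flavor(struct flavor *f)
{
	f->next = flavor_list;
	flavor_list = f;
}

static struct flavor rcu_sched_flavor = { .name = "rcu_sched" };
static struct flavor rcu_bh_flavor = { .name = "rcu_bh" };

int main(void)
{
	struct flavor *f;

	register_flavor(&rcu_sched_flavor);
	register_flavor(&rcu_bh_flavor);

	/* What used to be several hard-coded calls becomes one loop. */
	for_each_flavor(f)
		printf("processing callbacks for %s\n", f->name);
	return 0;
}

With the flavors self-registering at init time, call sites such as rcu_process_callbacks(), rcu_pending(), and the CPU-hotplug notifier in the hunks below no longer need to name rcu_sched, rcu_bh, and rcu_preempt individually.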
@@ -201,6 +202,7 @@ void rcu_note_context_switch(int cpu)
 {
 	trace_rcu_utilization("Start context switch");
 	rcu_sched_qs(cpu);
+	rcu_preempt_note_context_switch(cpu);
 	trace_rcu_utilization("End context switch");
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
@@ -357,7 +359,7 @@ static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
 		struct task_struct *idle = idle_task(smp_processor_id());
 
 		trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
-		ftrace_dump(DUMP_ALL);
+		ftrace_dump(DUMP_ORIG);
 		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
@@ -467,7 +469,7 @@ static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
 
 		trace_rcu_dyntick("Error on exit: not idle task",
 				  oldval, rdtp->dynticks_nesting);
-		ftrace_dump(DUMP_ALL);
+		ftrace_dump(DUMP_ORIG);
 		WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
 			  current->pid, current->comm,
 			  idle->pid, idle->comm); /* must be idle task! */
@@ -584,8 +586,6 @@ void rcu_nmi_exit(void)
 	WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
 }
 
-#ifdef CONFIG_PROVE_RCU
-
 /**
  * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
  *
@@ -603,7 +603,7 @@ int rcu_is_cpu_idle(void)
 }
 EXPORT_SYMBOL(rcu_is_cpu_idle);
 
-#ifdef CONFIG_HOTPLUG_CPU
+#if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU)
 
 /*
  * Is the current CPU online?  Disable preemption to avoid false positives
@@ -644,9 +644,7 @@ bool rcu_lockdep_current_cpu_online(void)
 }
 EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
 
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
-#endif /* #ifdef CONFIG_PROVE_RCU */
+#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */
 
 /**
  * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
@@ -732,7 +730,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	int cpu;
 	long delta;
 	unsigned long flags;
-	int ndetected;
+	int ndetected = 0;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Only let one CPU complain about others per time interval. */
@@ -773,7 +771,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	 */
 	rnp = rcu_get_root(rsp);
 	raw_spin_lock_irqsave(&rnp->lock, flags);
-	ndetected = rcu_print_task_stall(rnp);
+	ndetected += rcu_print_task_stall(rnp);
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	print_cpu_stall_info_end();
@@ -859,9 +857,10 @@ static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr)
  */
 void rcu_cpu_stall_reset(void)
 {
-	rcu_sched_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-	rcu_bh_state.jiffies_stall = jiffies + ULONG_MAX / 2;
-	rcu_preempt_stall_reset();
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp)
+		rsp->jiffies_stall = jiffies + ULONG_MAX / 2;
 }
 
 static struct notifier_block rcu_panic_block = {
@@ -893,8 +892,9 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
 		if (rnp->qsmask & rdp->grpmask) {
 			rdp->qs_pending = 1;
 			rdp->passed_quiesce = 0;
-		} else
+		} else {
 			rdp->qs_pending = 0;
+		}
 		zero_cpu_stall_ticks(rdp);
 	}
 }
@@ -936,6 +936,18 @@ check_for_new_grace_period(struct rcu_state *rsp, struct rcu_data *rdp)
 }
 
 /*
+ * Initialize the specified rcu_data structure's callback list to empty.
+ */
+static void init_callback_list(struct rcu_data *rdp)
+{
+	int i;
+
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+}
+
+/*
  * Advance this CPU's callbacks, but only if the current grace period
  * has ended.  This may be called only from the CPU to whom the rdp
  * belongs.  In addition, the corresponding leaf rcu_node structure's
@@ -1327,8 +1339,6 @@ static void
 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 			  struct rcu_node *rnp, struct rcu_data *rdp)
 {
-	int i;
-
 	/*
 	 * Orphan the callbacks.  First adjust the counts.  This is safe
 	 * because ->onofflock excludes _rcu_barrier()'s adoption of
@@ -1339,7 +1349,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 		rsp->qlen += rdp->qlen;
 		rdp->n_cbs_orphaned += rdp->qlen;
 		rdp->qlen_lazy = 0;
-		rdp->qlen = 0;
+		ACCESS_ONCE(rdp->qlen) = 0;
 	}
 
 	/*
@@ -1368,9 +1378,7 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 	}
 
 	/* Finally, initialize the rcu_data structure's list to empty.  */
-	rdp->nxtlist = NULL;
-	for (i = 0; i < RCU_NEXT_SIZE; i++)
-		rdp->nxttail[i] = &rdp->nxtlist;
+	init_callback_list(rdp);
 }
 
 /*
@@ -1504,6 +1512,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	if (need_report & RCU_OFL_TASKS_EXP_GP)
 		rcu_report_exp_rnp(rsp, rnp, true);
+	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
+		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
+		  cpu, rdp->qlen, rdp->nxtlist);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1530,7 +1541,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list, **tail;
-	int bl, count, count_lazy;
+	int bl, count, count_lazy, i;
 
 	/* If no callbacks are ready, just return.*/
 	if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
@@ -1553,9 +1564,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
 	*rdp->nxttail[RCU_DONE_TAIL] = NULL;
 	tail = rdp->nxttail[RCU_DONE_TAIL];
-	for (count = RCU_NEXT_SIZE - 1; count >= 0; count--)
-		if (rdp->nxttail[count] == rdp->nxttail[RCU_DONE_TAIL])
-			rdp->nxttail[count] = &rdp->nxtlist;
+	for (i = RCU_NEXT_SIZE - 1; i >= 0; i--)
+		if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL])
+			rdp->nxttail[i] = &rdp->nxtlist;
 	local_irq_restore(flags);
 
 	/* Invoke callbacks. */
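init_callback_list() resets the segmented callback list that the surrounding hunks manipulate: ->nxtlist is one singly linked list of callbacks, and ->nxttail[] holds a pointer to the tail pointer of each segment, all of which point back at ->nxtlist itself when the list is empty. The fragment below is a simplified stand-alone illustration with demo types (not the kernel's struct rcu_data); the enqueue helper mirrors how a callback is appended at the RCU_NEXT_TAIL segment.

#include <stddef.h>

struct rcu_head_demo { struct rcu_head_demo *next; };

#define RCU_DONE_TAIL  0
#define RCU_NEXT_TAIL  3
#define RCU_NEXT_SIZE  4

struct rcu_data_demo {
	struct rcu_head_demo *nxtlist;                  /* head of the whole list */
	struct rcu_head_demo **nxttail[RCU_NEXT_SIZE];  /* tail of each segment */
};

/* Empty list: every segment tail points at the head pointer itself. */
static void init_callback_list_demo(struct rcu_data_demo *rdp)
{
	int i;

	rdp->nxtlist = NULL;
	for (i = 0; i < RCU_NEXT_SIZE; i++)
		rdp->nxttail[i] = &rdp->nxtlist;
}

/* Enqueue appends at the last segment and advances its tail pointer. */
static void enqueue_demo(struct rcu_data_demo *rdp, struct rcu_head_demo *head)
{
	head->next = NULL;
	*rdp->nxttail[RCU_NEXT_TAIL] = head;
	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
}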
@@ -1583,15 +1594,15 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 	if (list != NULL) {
 		*tail = rdp->nxtlist;
 		rdp->nxtlist = list;
-		for (count = 0; count < RCU_NEXT_SIZE; count++)
-			if (&rdp->nxtlist == rdp->nxttail[count])
-				rdp->nxttail[count] = tail;
+		for (i = 0; i < RCU_NEXT_SIZE; i++)
+			if (&rdp->nxtlist == rdp->nxttail[i])
+				rdp->nxttail[i] = tail;
 			else
 				break;
 	}
 	smp_mb(); /* List handling before counting for rcu_barrier(). */
 	rdp->qlen_lazy -= count_lazy;
-	rdp->qlen -= count;
+	ACCESS_ONCE(rdp->qlen) -= count;
 	rdp->n_cbs_invoked += count;
 
 	/* Reinstate batch limit if we have worked down the excess. */
@@ -1604,6 +1615,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 		rdp->n_force_qs_snap = rsp->n_force_qs;
 	} else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
 		rdp->qlen_last_fqs_check = rdp->qlen;
+	WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0));
 
 	local_irq_restore(flags);
 
@@ -1744,8 +1756,6 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 		break; /* grace period idle or initializing, ignore. */
 
 	case RCU_SAVE_DYNTICK:
-		if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK)
-			break; /* So gcc recognizes the dead code. */
 
 		raw_spin_unlock(&rnp->lock);  /* irqs remain disabled */
 
@@ -1787,9 +1797,10 @@ unlock_fqs_ret:
  * whom the rdp belongs.
  */
 static void
-__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
+__rcu_process_callbacks(struct rcu_state *rsp)
 {
 	unsigned long flags;
+	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
 	WARN_ON_ONCE(rdp->beenonline == 0);
 
@@ -1825,11 +1836,11 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static void rcu_process_callbacks(struct softirq_action *unused)
 {
+	struct rcu_state *rsp;
+
 	trace_rcu_utilization("Start RCU core");
-	__rcu_process_callbacks(&rcu_sched_state,
-				&__get_cpu_var(rcu_sched_data));
-	__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
-	rcu_preempt_process_callbacks();
+	for_each_rcu_flavor(rsp)
+		__rcu_process_callbacks(rsp);
 	trace_rcu_utilization("End RCU core");
 }
 
@@ -1856,6 +1867,56 @@ static void invoke_rcu_core(void)
 	raise_softirq(RCU_SOFTIRQ);
 }
 
+/*
+ * Handle any core-RCU processing required by a call_rcu() invocation.
+ */
+static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
+			    struct rcu_head *head, unsigned long flags)
+{
+	/*
+	 * If called from an extended quiescent state, invoke the RCU
+	 * core in order to force a re-evaluation of RCU's idleness.
+	 */
+	if (rcu_is_cpu_idle() && cpu_online(smp_processor_id()))
+		invoke_rcu_core();
+
+	/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
+	if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
+		return;
+
+	/*
+	 * Force the grace period if too many callbacks or too long waiting.
+	 * Enforce hysteresis, and don't invoke force_quiescent_state()
+	 * if some other CPU has recently done so.  Also, don't bother
+	 * invoking force_quiescent_state() if the newly enqueued callback
+	 * is the only one waiting for a grace period to complete.
+	 */
+	if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+
+		/* Are we ignoring a completed grace period? */
+		rcu_process_gp_end(rsp, rdp);
+		check_for_new_grace_period(rsp, rdp);
+
+		/* Start a new grace period if one not already started. */
+		if (!rcu_gp_in_progress(rsp)) {
+			unsigned long nestflag;
+			struct rcu_node *rnp_root = rcu_get_root(rsp);
+
+			raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
+			rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
+		} else {
+			/* Give the grace period a kick. */
+			rdp->blimit = LONG_MAX;
+			if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+			    *rdp->nxttail[RCU_DONE_TAIL] != head)
+				force_quiescent_state(rsp, 0);
+			rdp->n_force_qs_snap = rsp->n_force_qs;
+			rdp->qlen_last_fqs_check = rdp->qlen;
+		}
+	} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
+		force_quiescent_state(rsp, 1);
+}
+
 static void
 __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	   struct rcu_state *rsp, bool lazy)
@@ -1880,7 +1941,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	rdp = this_cpu_ptr(rsp->rda);
 
 	/* Add the callback to our list. */
-	rdp->qlen++;
+	ACCESS_ONCE(rdp->qlen)++;
 	if (lazy)
 		rdp->qlen_lazy++;
 	else
@@ -1895,43 +1956,8 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	else
 		trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen);
 
-	/* If interrupts were disabled, don't dive into RCU core. */
-	if (irqs_disabled_flags(flags)) {
-		local_irq_restore(flags);
-		return;
-	}
-
-	/*
-	 * Force the grace period if too many callbacks or too long waiting.
-	 * Enforce hysteresis, and don't invoke force_quiescent_state()
-	 * if some other CPU has recently done so.  Also, don't bother
-	 * invoking force_quiescent_state() if the newly enqueued callback
-	 * is the only one waiting for a grace period to complete.
-	 */
-	if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
-
-		/* Are we ignoring a completed grace period? */
-		rcu_process_gp_end(rsp, rdp);
-		check_for_new_grace_period(rsp, rdp);
-
-		/* Start a new grace period if one not already started. */
-		if (!rcu_gp_in_progress(rsp)) {
-			unsigned long nestflag;
-			struct rcu_node *rnp_root = rcu_get_root(rsp);
-
-			raw_spin_lock_irqsave(&rnp_root->lock, nestflag);
-			rcu_start_gp(rsp, nestflag);  /* rlses rnp_root->lock */
-		} else {
-			/* Give the grace period a kick. */
-			rdp->blimit = LONG_MAX;
-			if (rsp->n_force_qs == rdp->n_force_qs_snap &&
-			    *rdp->nxttail[RCU_DONE_TAIL] != head)
-				force_quiescent_state(rsp, 0);
-			rdp->n_force_qs_snap = rsp->n_force_qs;
-			rdp->qlen_last_fqs_check = rdp->qlen;
-		}
-	} else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies))
-		force_quiescent_state(rsp, 1);
+	/* Go handle any RCU core processing required. */
+	__call_rcu_core(rsp, rdp, head, flags);
 
 	local_irq_restore(flags);
 }
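The conversions of ->qlen updates to ACCESS_ONCE() above matter because _rcu_barrier(), further down, now samples ->qlen from another CPU without holding any lock. ACCESS_ONCE() is essentially the kernel's volatile-cast idiom from <linux/compiler.h>; the fragment below is a stand-alone illustration of that idiom with a demo structure, not kernel code.

/* Roughly the kernel definition: force each marked access to be a single,
 * un-optimized volatile load or store, so the compiler cannot fuse, reorder,
 * or re-read it behind our back. */
#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

struct demo_rcu_data {
	unsigned long qlen;
};

/* Updater side: counts are adjusted only by the owning CPU... */
static void demo_enqueue(struct demo_rcu_data *rdp)
{
	ACCESS_ONCE(rdp->qlen)++;	/* emitted as one load and one store */
}

/* ...but a remote reader (e.g. _rcu_barrier()) may sample it at any time. */
static int demo_has_callbacks(struct demo_rcu_data *rdp)
{
	return ACCESS_ONCE(rdp->qlen) != 0;
}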
@@ -1961,28 +1987,16 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
  * occasionally incorrectly indicate that there are multiple CPUs online
  * when there was in fact only one the whole time, as this just adds
  * some overhead: RCU still operates correctly.
- *
- * Of course, sampling num_online_cpus() with preemption enabled can
- * give erroneous results if there are concurrent CPU-hotplug operations.
- * For example, given a demonic sequence of preemptions in num_online_cpus()
- * and CPU-hotplug operations, there could be two or more CPUs online at
- * all times, but num_online_cpus() might well return one (or even zero).
- *
- * However, all such demonic sequences require at least one CPU-offline
- * operation.  Furthermore, rcu_blocking_is_gp() giving the wrong answer
- * is only a problem if there is an RCU read-side critical section executing
- * throughout.  But RCU-sched and RCU-bh read-side critical sections
- * disable either preemption or bh, which prevents a CPU from going offline.
- * Therefore, the only way that rcu_blocking_is_gp() can incorrectly return
- * that there is only one CPU when in fact there was more than one throughout
- * is when there were no RCU readers in the system.  If there are no
- * RCU readers, the grace period by definition can be of zero length,
- * regardless of the number of online CPUs.
  */
 static inline int rcu_blocking_is_gp(void)
 {
+	int ret;
+
 	might_sleep();  /* Check for RCU read-side critical section. */
-	return num_online_cpus() <= 1;
+	preempt_disable();
+	ret = num_online_cpus() <= 1;
+	preempt_enable();
+	return ret;
 }
 
 /**
@@ -2117,9 +2131,9 @@ void synchronize_sched_expedited(void)
 		put_online_cpus();
 
 		/* No joy, try again later.  Or just synchronize_sched(). */
-		if (trycount++ < 10)
+		if (trycount++ < 10) {
 			udelay(trycount * num_online_cpus());
-		else {
+		} else {
 			synchronize_sched();
 			return;
 		}
@@ -2240,9 +2254,12 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static int rcu_pending(int cpu)
 {
-	return __rcu_pending(&rcu_sched_state, &per_cpu(rcu_sched_data, cpu)) ||
-	       __rcu_pending(&rcu_bh_state, &per_cpu(rcu_bh_data, cpu)) ||
-	       rcu_preempt_pending(cpu);
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp)
+		if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu)))
+			return 1;
+	return 0;
 }
 
 /*
@@ -2252,20 +2269,41 @@ static int rcu_pending(int cpu)
  */
 static int rcu_cpu_has_callbacks(int cpu)
 {
+	struct rcu_state *rsp;
+
 	/* RCU callbacks either ready or pending? */
-	return per_cpu(rcu_sched_data, cpu).nxtlist ||
-	       per_cpu(rcu_bh_data, cpu).nxtlist ||
-	       rcu_preempt_cpu_has_callbacks(cpu);
+	for_each_rcu_flavor(rsp)
+		if (per_cpu_ptr(rsp->rda, cpu)->nxtlist)
+			return 1;
+	return 0;
+}
+
+/*
+ * Helper function for _rcu_barrier() tracing.  If tracing is disabled,
+ * the compiler is expected to optimize this away.
+ */
+static void _rcu_barrier_trace(struct rcu_state *rsp, char *s,
+			       int cpu, unsigned long done)
+{
+	trace_rcu_barrier(rsp->name, s, cpu,
+			  atomic_read(&rsp->barrier_cpu_count), done);
 }
 
 /*
  * RCU callback function for _rcu_barrier().  If we are last, wake
 * up the task executing _rcu_barrier().
 */
-static void rcu_barrier_callback(struct rcu_head *notused)
+static void rcu_barrier_callback(struct rcu_head *rhp)
 {
-	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
-		complete(&rcu_barrier_completion);
+	struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head);
+	struct rcu_state *rsp = rdp->rsp;
+
+	if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
+		_rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done);
+		complete(&rsp->barrier_completion);
+	} else {
+		_rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done);
+	}
 }
 
 /*
@@ -2273,35 +2311,63 @@ static void rcu_barrier_callback(struct rcu_head *notused)
  */
 static void rcu_barrier_func(void *type)
 {
-	int cpu = smp_processor_id();
-	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
-	void (*call_rcu_func)(struct rcu_head *head,
-			      void (*func)(struct rcu_head *head));
+	struct rcu_state *rsp = type;
+	struct rcu_data *rdp = __this_cpu_ptr(rsp->rda);
 
-	atomic_inc(&rcu_barrier_cpu_count);
-	call_rcu_func = type;
-	call_rcu_func(head, rcu_barrier_callback);
+	_rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done);
+	atomic_inc(&rsp->barrier_cpu_count);
+	rsp->call(&rdp->barrier_head, rcu_barrier_callback);
 }
 
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
  * RCU callbacks of the specified type to complete.
  */
-static void _rcu_barrier(struct rcu_state *rsp,
-			 void (*call_rcu_func)(struct rcu_head *head,
-					       void (*func)(struct rcu_head *head)))
+static void _rcu_barrier(struct rcu_state *rsp)
 {
 	int cpu;
 	unsigned long flags;
 	struct rcu_data *rdp;
-	struct rcu_head rh;
+	struct rcu_data rd;
+	unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done);
+	unsigned long snap_done;
 
-	init_rcu_head_on_stack(&rh);
+	init_rcu_head_on_stack(&rd.barrier_head);
+	_rcu_barrier_trace(rsp, "Begin", -1, snap);
 
 	/* Take mutex to serialize concurrent rcu_barrier() requests. */
-	mutex_lock(&rcu_barrier_mutex);
+	mutex_lock(&rsp->barrier_mutex);
+
+	/*
+	 * Ensure that all prior references, including to ->n_barrier_done,
+	 * are ordered before the _rcu_barrier() machinery.
+	 */
+	smp_mb();  /* See above block comment. */
+
+	/*
+	 * Recheck ->n_barrier_done to see if others did our work for us.
+	 * This means checking ->n_barrier_done for an even-to-odd-to-even
+	 * transition.  The "if" expression below therefore rounds the old
+	 * value up to the next even number and adds two before comparing.
+	 */
+	snap_done = ACCESS_ONCE(rsp->n_barrier_done);
+	_rcu_barrier_trace(rsp, "Check", -1, snap_done);
+	if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) {
+		_rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done);
+		smp_mb(); /* caller's subsequent code after above check. */
+		mutex_unlock(&rsp->barrier_mutex);
+		return;
+	}
 
-	smp_mb();  /* Prevent any prior operations from leaking in. */
+	/*
+	 * Increment ->n_barrier_done to avoid duplicate work.  Use
+	 * ACCESS_ONCE() to prevent the compiler from speculating
+	 * the increment to precede the early-exit check.
	 */
+	ACCESS_ONCE(rsp->n_barrier_done)++;
+	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1);
+	_rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done);
+	smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */
 
 	/*
 	 * Initialize the count to one rather than to zero in order to
@@ -2320,8 +2386,8 @@ static void _rcu_barrier(struct rcu_state *rsp,
 	 * 6.	Both rcu_barrier_callback() callbacks are invoked, awakening
 	 *	us -- but before CPU 1's orphaned callbacks are invoked!!!
 	 */
-	init_completion(&rcu_barrier_completion);
-	atomic_set(&rcu_barrier_cpu_count, 1);
+	init_completion(&rsp->barrier_completion);
+	atomic_set(&rsp->barrier_cpu_count, 1);
 	raw_spin_lock_irqsave(&rsp->onofflock, flags);
 	rsp->rcu_barrier_in_progress = current;
 	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
@@ -2337,14 +2403,19 @@ static void _rcu_barrier(struct rcu_state *rsp,
 		preempt_disable();
 		rdp = per_cpu_ptr(rsp->rda, cpu);
 		if (cpu_is_offline(cpu)) {
+			_rcu_barrier_trace(rsp, "Offline", cpu,
+					   rsp->n_barrier_done);
 			preempt_enable();
 			while (cpu_is_offline(cpu) && ACCESS_ONCE(rdp->qlen))
 				schedule_timeout_interruptible(1);
 		} else if (ACCESS_ONCE(rdp->qlen)) {
-			smp_call_function_single(cpu, rcu_barrier_func,
-						 (void *)call_rcu_func, 1);
+			_rcu_barrier_trace(rsp, "OnlineQ", cpu,
+					   rsp->n_barrier_done);
+			smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
 			preempt_enable();
 		} else {
+			_rcu_barrier_trace(rsp, "OnlineNQ", cpu,
+					   rsp->n_barrier_done);
 			preempt_enable();
 		}
 	}
@@ -2361,24 +2432,32 @@ static void _rcu_barrier(struct rcu_state *rsp,
 	rcu_adopt_orphan_cbs(rsp);
 	rsp->rcu_barrier_in_progress = NULL;
 	raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
-	atomic_inc(&rcu_barrier_cpu_count);
+	atomic_inc(&rsp->barrier_cpu_count);
 	smp_mb__after_atomic_inc(); /* Ensure atomic_inc() before callback. */
-	call_rcu_func(&rh, rcu_barrier_callback);
+	rd.rsp = rsp;
+	rsp->call(&rd.barrier_head, rcu_barrier_callback);
 
 	/*
 	 * Now that we have an rcu_barrier_callback() callback on each
 	 * CPU, and thus each counted, remove the initial count.
 	 */
-	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
-		complete(&rcu_barrier_completion);
+	if (atomic_dec_and_test(&rsp->barrier_cpu_count))
+		complete(&rsp->barrier_completion);
+
+	/* Increment ->n_barrier_done to prevent duplicate work. */
+	smp_mb(); /* Keep increment after above mechanism. */
+	ACCESS_ONCE(rsp->n_barrier_done)++;
+	WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0);
+	_rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done);
+	smp_mb(); /* Keep increment before caller's subsequent code. */
 
 	/* Wait for all rcu_barrier_callback() callbacks to be invoked. */
-	wait_for_completion(&rcu_barrier_completion);
+	wait_for_completion(&rsp->barrier_completion);
 
 	/* Other rcu_barrier() invocations can now safely proceed. */
-	mutex_unlock(&rcu_barrier_mutex);
+	mutex_unlock(&rsp->barrier_mutex);
 
-	destroy_rcu_head_on_stack(&rh);
+	destroy_rcu_head_on_stack(&rd.barrier_head);
 }
 
 /**
@@ -2386,7 +2465,7 @@ static void _rcu_barrier(struct rcu_state *rsp,
  */
 void rcu_barrier_bh(void)
 {
-	_rcu_barrier(&rcu_bh_state, call_rcu_bh);
+	_rcu_barrier(&rcu_bh_state);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 
@@ -2395,7 +2474,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
  */
 void rcu_barrier_sched(void)
 {
-	_rcu_barrier(&rcu_sched_state, call_rcu_sched);
+	_rcu_barrier(&rcu_sched_state);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
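The ->n_barrier_done counter driving the early exit above is even when no rcu_barrier() is in flight and odd while one is running, so a caller whose snapshot is followed by a complete even-to-odd-to-even transition can return without doing any work. Below is a stand-alone sketch of that arithmetic (plain C, simplified from the ULONG_CMP_GE() check in _rcu_barrier(); not kernel code).

#include <stdio.h>

/* True if "done" has advanced past a full barrier begun after "snap". */
static int barrier_done_since(unsigned long snap, unsigned long done)
{
	/* Round snap up to the next even value, then require one more
	 * complete begin/end pair (begin makes it odd, end makes it even). */
	return (long)(done - (((snap + 1) & ~0x1UL) + 2)) >= 0;
}

int main(void)
{
	/* snap taken at 4 (even, idle): a barrier that starts (5) and
	 * finishes (6) after the snapshot satisfies the check. */
	printf("%d\n", barrier_done_since(4, 5));	/* 0: still in progress */
	printf("%d\n", barrier_done_since(4, 6));	/* 1: a full barrier completed */
	/* snap taken at 5 (odd, in progress): that barrier must finish (6)
	 * and a later one must also complete (8) before we may skip work. */
	printf("%d\n", barrier_done_since(5, 6));	/* 0 */
	printf("%d\n", barrier_done_since(5, 8));	/* 1 */
	return 0;
}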
@@ -2406,18 +2485,15 @@ static void __init
 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	int i;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
-	rdp->nxtlist = NULL;
-	for (i = 0; i < RCU_NEXT_SIZE; i++)
-		rdp->nxttail[i] = &rdp->nxtlist;
+	init_callback_list(rdp);
 	rdp->qlen_lazy = 0;
-	rdp->qlen = 0;
+	ACCESS_ONCE(rdp->qlen) = 0;
 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
 	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
 	WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
@@ -2491,9 +2567,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
 
 static void __cpuinit rcu_prepare_cpu(int cpu)
 {
-	rcu_init_percpu_data(cpu, &rcu_sched_state, 0);
-	rcu_init_percpu_data(cpu, &rcu_bh_state, 0);
-	rcu_preempt_init_percpu_data(cpu);
+	struct rcu_state *rsp;
+
+	for_each_rcu_flavor(rsp)
+		rcu_init_percpu_data(cpu, rsp,
+				     strcmp(rsp->name, "rcu_preempt") == 0);
 }
 
 /*
@@ -2505,6 +2583,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	long cpu = (long)hcpu;
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;
+	struct rcu_state *rsp;
 
 	trace_rcu_utilization("Start CPU hotplug");
 	switch (action) {
@@ -2529,18 +2608,16 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 		 * touch any data without introducing corruption. We send the
 		 * dying CPU's callbacks to an arbitrarily chosen online CPU.
 		 */
-		rcu_cleanup_dying_cpu(&rcu_bh_state);
-		rcu_cleanup_dying_cpu(&rcu_sched_state);
-		rcu_preempt_cleanup_dying_cpu();
+		for_each_rcu_flavor(rsp)
+			rcu_cleanup_dying_cpu(rsp);
 		rcu_cleanup_after_idle(cpu);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
-		rcu_cleanup_dead_cpu(cpu, &rcu_bh_state);
-		rcu_cleanup_dead_cpu(cpu, &rcu_sched_state);
-		rcu_preempt_cleanup_dead_cpu(cpu);
+		for_each_rcu_flavor(rsp)
+			rcu_cleanup_dead_cpu(cpu, rsp);
 		break;
 	default:
 		break;
@@ -2573,9 +2650,9 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
 	int i;
 
-	for (i = NUM_RCU_LVLS - 1; i > 0; i--)
+	for (i = rcu_num_lvls - 1; i > 0; i--)
 		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-	rsp->levelspread[0] = CONFIG_RCU_FANOUT_LEAF;
+	rsp->levelspread[0] = rcu_fanout_leaf;
 }
 #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
@@ -2585,7 +2662,7 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp)
 	int i;
 
 	cprv = NR_CPUS;
-	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+	for (i = rcu_num_lvls - 1; i >= 0; i--) {
 		ccur = rsp->levelcnt[i];
 		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
 		cprv = ccur;
@@ -2612,13 +2689,15 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 
 	/* Initialize the level-tracking arrays. */
 
-	for (i = 1; i < NUM_RCU_LVLS; i++)
+	for (i = 0; i < rcu_num_lvls; i++)
+		rsp->levelcnt[i] = num_rcu_lvl[i];
+	for (i = 1; i < rcu_num_lvls; i++)
 		rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1];
 	rcu_init_levelspread(rsp);
 
 	/* Initialize the elements themselves, starting from the leaves. */
-	for (i = NUM_RCU_LVLS - 1; i >= 0; i--) {
+	for (i = rcu_num_lvls - 1; i >= 0; i--) {
 		cpustride *= rsp->levelspread[i];
 		rnp = rsp->level[i];
 		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
@@ -2648,13 +2727,74 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 	}
 
 	rsp->rda = rda;
-	rnp = rsp->level[NUM_RCU_LVLS - 1];
+	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {
 		while (i > rnp->grphi)
 			rnp++;
 		per_cpu_ptr(rsp->rda, i)->mynode = rnp;
 		rcu_boot_init_percpu_data(i, rsp);
 	}
+	list_add(&rsp->flavors, &rcu_struct_flavors);
+}
+
+/*
+ * Compute the rcu_node tree geometry from kernel parameters.  This cannot
+ * replace the definitions in rcutree.h because those are needed to size
+ * the ->node array in the rcu_state structure.
+ */
+static void __init rcu_init_geometry(void)
+{
+	int i;
+	int j;
+	int n = nr_cpu_ids;
+	int rcu_capacity[MAX_RCU_LVLS + 1];
+
+	/* If the compile-time values are accurate, just leave. */
+	if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF)
+		return;
+
+	/*
+	 * Compute number of nodes that can be handled an rcu_node tree
+	 * with the given number of levels.  Setting rcu_capacity[0] makes
+	 * some of the arithmetic easier.
+	 */
+	rcu_capacity[0] = 1;
+	rcu_capacity[1] = rcu_fanout_leaf;
+	for (i = 2; i <= MAX_RCU_LVLS; i++)
+		rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;
+
+	/*
+	 * The boot-time rcu_fanout_leaf parameter is only permitted
+	 * to increase the leaf-level fanout, not decrease it.  Of course,
+	 * the leaf-level fanout cannot exceed the number of bits in
+	 * the rcu_node masks.  Finally, the tree must be able to accommodate
+	 * the configured number of CPUs.  Complain and fall back to the
+	 * compile-time values if these limits are exceeded.
+	 */
+	if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF ||
+	    rcu_fanout_leaf > sizeof(unsigned long) * 8 ||
+	    n > rcu_capacity[MAX_RCU_LVLS]) {
+		WARN_ON(1);
+		return;
+	}
+
+	/* Calculate the number of rcu_nodes at each level of the tree. */
+	for (i = 1; i <= MAX_RCU_LVLS; i++)
+		if (n <= rcu_capacity[i]) {
+			for (j = 0; j <= i; j++)
+				num_rcu_lvl[j] =
+					DIV_ROUND_UP(n, rcu_capacity[i - j]);
+			rcu_num_lvls = i;
+			for (j = i + 1; j <= MAX_RCU_LVLS; j++)
+				num_rcu_lvl[j] = 0;
+			break;
+		}
+
+	/* Calculate the total number of rcu_node structures. */
+	rcu_num_nodes = 0;
+	for (i = 0; i <= MAX_RCU_LVLS; i++)
+		rcu_num_nodes += num_rcu_lvl[i];
+	rcu_num_nodes -= n;
 }
 
 void __init rcu_init(void)
@@ -2662,6 +2802,7 @@ void __init rcu_init(void)
 	int cpu;
 
 	rcu_bootup_announce();
+	rcu_init_geometry();
 	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
 	__rcu_init_preempt();
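rcu_init_geometry() only recomputes the tree when the rcu_fanout_leaf boot parameter differs from CONFIG_RCU_FANOUT_LEAF. The user-space rework below reproduces its arithmetic for illustration; the constants (CONFIG_RCU_FANOUT=64, the sample CPU counts) are example values, not taken from this patch.

#include <stdio.h>

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define MAX_RCU_LVLS		4
#define CONFIG_RCU_FANOUT	64	/* example value only */

/* User-space rework of rcu_init_geometry()'s arithmetic, for illustration. */
static void compute_geometry(int n, int rcu_fanout_leaf)
{
	int rcu_capacity[MAX_RCU_LVLS + 1];
	int num_rcu_lvl[MAX_RCU_LVLS + 1] = { 0 };
	int rcu_num_lvls = 0, rcu_num_nodes = 0;
	int i, j;

	/* Capacity of an i-level tree with the given leaf fanout. */
	rcu_capacity[0] = 1;
	rcu_capacity[1] = rcu_fanout_leaf;
	for (i = 2; i <= MAX_RCU_LVLS; i++)
		rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT;

	/* Pick the smallest tree that covers n CPUs, then size each level. */
	for (i = 1; i <= MAX_RCU_LVLS; i++)
		if (n <= rcu_capacity[i]) {
			for (j = 0; j <= i; j++)
				num_rcu_lvl[j] =
					DIV_ROUND_UP(n, rcu_capacity[i - j]);
			rcu_num_lvls = i;
			break;
		}

	for (i = 0; i <= MAX_RCU_LVLS; i++)
		rcu_num_nodes += num_rcu_lvl[i];
	rcu_num_nodes -= n;	/* num_rcu_lvl[rcu_num_lvls] counted the CPUs */

	printf("n=%d leaf=%d -> levels=%d nodes=%d (root=%d, leaves=%d)\n",
	       n, rcu_fanout_leaf, rcu_num_lvls, rcu_num_nodes,
	       num_rcu_lvl[0], num_rcu_lvl[rcu_num_lvls - 1]);
}

int main(void)
{
	compute_geometry(4096, 16);	/* 4096 CPUs, rcu_fanout_leaf=16 */
	compute_geometry(256, 64);	/* fits in two levels */
	return 0;
}

For 4096 CPUs with a leaf fanout of 16 this yields a three-level tree of 261 rcu_node structures (1 root, 4 interior, 256 leaves), which is the kind of result the boot-time parameter is meant to let administrators tune without rebuilding the kernel.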