Diffstat (limited to 'net/sched/sch_api.c')
-rw-r--r--	net/sched/sch_api.c	1296
1 file changed, 1296 insertions, 0 deletions
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
new file mode 100644
index 00000000000..4323a74eea3
--- /dev/null
+++ b/net/sched/sch_api.c
@@ -0,0 +1,1296 @@
+/*
+ * net/sched/sch_api.c	Packet scheduler API.
+ *
+ *		This program is free software; you can redistribute it and/or
+ *		modify it under the terms of the GNU General Public License
+ *		as published by the Free Software Foundation; either version
+ *		2 of the License, or (at your option) any later version.
+ *
+ * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
+ *
+ * Fixes:
+ *
+ * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
+ * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
+ * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
+ */
+
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/string.h>
+#include <linux/mm.h>
+#include <linux/socket.h>
+#include <linux/sockios.h>
+#include <linux/in.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kmod.h>
+#include <linux/list.h>
+#include <linux/bitops.h>
+
+#include <net/sock.h>
+#include <net/pkt_sched.h>
+
+#include <asm/processor.h>
+#include <asm/uaccess.h>
+#include <asm/system.h>
+
+static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid,
+			struct Qdisc *old, struct Qdisc *new);
+static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
+			 struct Qdisc *q, unsigned long cl, int event);
+
+/*
+
+   Short review.
+   -------------
+
+   This file consists of two interrelated parts:
+
+   1. The queueing disciplines manager frontend.
+   2. The traffic classes manager frontend.
+
+   Generally, a queueing discipline ("qdisc") is a black box
+   which is able to enqueue packets and to dequeue them (when
+   the device is ready to send something) in the order and at the
+   times determined by the algorithm hidden inside it.
+
+   qdiscs are divided into two categories:
+   - "queues", which have no internal structure visible from outside.
+   - "schedulers", which split all packets into "traffic classes"
+     using "packet classifiers" (see cls_api.c).
+
+   In turn, classes may have child qdiscs (as a rule, queues)
+   attached to them, and so on recursively.
+
+   The goal of the routines in this file is to translate the
+   handle-based information supplied by the user into a form
+   intelligible to the kernel, to perform sanity checks and the
+   work common to all qdiscs, and to provide rtnetlink notifications.
+
+   All the real intelligent work is done inside the qdisc modules.
+
+
+
+   Every discipline has two major routines: enqueue and dequeue.
+
+   ---dequeue
+
+   dequeue usually returns a skb to send. It is allowed to return NULL,
+   but that does not mean the queue is empty; it only means that the
+   discipline does not want to send anything at this time.
+   The queue is really empty if q->q.qlen == 0.
+   For complicated disciplines with multiple queues q->q is not the
+   real packet queue, but q->q.qlen must nevertheless be valid.
+
+   ---enqueue
+
+   enqueue returns 0 if the packet was enqueued successfully.
+   If a packet (this one or another one) was dropped, it returns
+   a non-zero error code:
+
+   NET_XMIT_DROP	- this packet was dropped.
+     Expected action: do not back off, but wait until the queue clears.
+   NET_XMIT_CN		- this packet was probably enqueued, but another one was dropped.
+     Expected action: back off or ignore.
+   NET_XMIT_POLICED	- dropped by a policer.
+     Expected action: back off or report an error to real-time apps.
+
+   Auxiliary routines:
+
+   ---requeue
+
+   requeues a previously dequeued packet. It is used for non-standard
+   or just buggy devices which can defer output even if dev->tbusy==0.
+
+   ---reset
+
+   returns the qdisc to its initial state: purge all buffers, clear all
+   timers and counters (except statistics), etc.
+
+   ---init
+
+   initializes a newly created qdisc.
+
+   ---destroy
+
+   destroys the resources allocated by init and during the lifetime of the qdisc.
+
+   ---change
+
+   changes qdisc parameters.
+ */
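The enqueue/dequeue contract described above is easy to demonstrate outside the kernel. Below is a minimal userspace sketch, not kernel code: a bounded FIFO that honours the return-code convention. The NET_XMIT_* values match linux/netdevice.h of this era; struct toy_fifo and both functions are invented for illustration.

	#include <stdio.h>

	/* Return codes as in linux/netdevice.h of this era. */
	#define NET_XMIT_SUCCESS 0
	#define NET_XMIT_DROP    1	/* this packet was dropped           */
	#define NET_XMIT_CN      2	/* enqueued, but another was dropped */
	#define NET_XMIT_POLICED 3	/* dropped by a policer              */

	/* Toy single-queue discipline: a bounded FIFO. */
	struct toy_fifo {
		int q[8];	/* packet ids           */
		int qlen;	/* must always be valid */
		int limit;
	};

	static int toy_enqueue(struct toy_fifo *f, int pkt)
	{
		if (f->qlen >= f->limit)
			return NET_XMIT_DROP;	/* caller should not back off */
		f->q[f->qlen++] = pkt;
		return NET_XMIT_SUCCESS;
	}

	/* May return -1 ("nothing to send") even when qlen > 0;
	 * only qlen == 0 means the queue is really empty. */
	static int toy_dequeue(struct toy_fifo *f)
	{
		if (f->qlen == 0)
			return -1;
		int pkt = f->q[0];
		for (int i = 1; i < f->qlen; i++)
			f->q[i - 1] = f->q[i];
		f->qlen--;
		return pkt;
	}

	int main(void)
	{
		struct toy_fifo f = { .qlen = 0, .limit = 2 };
		printf("%d %d %d\n", toy_enqueue(&f, 1), toy_enqueue(&f, 2),
		       toy_enqueue(&f, 3));	/* 0 0 1: third is dropped */
		printf("%d %d\n", toy_dequeue(&f), toy_dequeue(&f));
		return 0;
	}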
+
+/* Protects the list of registered TC modules. It is a pure SMP lock. */
+static DEFINE_RWLOCK(qdisc_mod_lock);
+
+
+/************************************************
+ *	Queueing disciplines manipulation.	*
+ ************************************************/
+
+
+/* The list of all installed queueing disciplines. */
+
+static struct Qdisc_ops *qdisc_base;
+
+/* Register/unregister queueing discipline */
+
+int register_qdisc(struct Qdisc_ops *qops)
+{
+	struct Qdisc_ops *q, **qp;
+	int rc = -EEXIST;
+
+	write_lock(&qdisc_mod_lock);
+	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+		if (!strcmp(qops->id, q->id))
+			goto out;
+
+	if (qops->enqueue == NULL)
+		qops->enqueue = noop_qdisc_ops.enqueue;
+	if (qops->requeue == NULL)
+		qops->requeue = noop_qdisc_ops.requeue;
+	if (qops->dequeue == NULL)
+		qops->dequeue = noop_qdisc_ops.dequeue;
+
+	qops->next = NULL;
+	*qp = qops;
+	rc = 0;
+out:
+	write_unlock(&qdisc_mod_lock);
+	return rc;
+}
+
+int unregister_qdisc(struct Qdisc_ops *qops)
+{
+	struct Qdisc_ops *q, **qp;
+	int err = -ENOENT;
+
+	write_lock(&qdisc_mod_lock);
+	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
+		if (q == qops)
+			break;
+	if (q) {
+		*qp = q->next;
+		q->next = NULL;
+		err = 0;
+	}
+	write_unlock(&qdisc_mod_lock);
+	return err;
+}
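Both functions use the classic pointer-to-pointer list walk, which appends at the tail and unlinks a node without tracking a separate prev pointer or special-casing the list head. A self-contained userspace sketch of the same idiom; struct ops, reg() and unreg() are invented stand-ins for struct Qdisc_ops and the two functions above.

	#include <stdio.h>
	#include <string.h>

	struct ops {			/* stand-in for struct Qdisc_ops */
		const char *id;
		struct ops *next;
	};

	static struct ops *base;	/* stand-in for qdisc_base */

	static int reg(struct ops *o)
	{
		struct ops *q, **qp;

		/* Walk to the tail, refusing duplicate ids on the way. */
		for (qp = &base; (q = *qp) != NULL; qp = &q->next)
			if (!strcmp(o->id, q->id))
				return -1;	/* -EEXIST */
		o->next = NULL;
		*qp = o;			/* works for an empty list too */
		return 0;
	}

	static int unreg(struct ops *o)
	{
		struct ops *q, **qp;

		for (qp = &base; (q = *qp) != NULL; qp = &q->next)
			if (q == o) {
				*qp = q->next;	/* unlink, no prev pointer needed */
				q->next = NULL;
				return 0;
			}
		return -1;			/* -ENOENT */
	}

	int main(void)
	{
		struct ops a = { .id = "pfifo" }, b = { .id = "bfifo" };
		printf("%d %d %d\n", reg(&a), reg(&b), reg(&a));	/* 0 0 -1 */
		printf("%d %d\n", unreg(&a), unreg(&a));		/* 0 -1  */
		return 0;
	}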
+
+/* We know the handle. Find the qdisc among all qdiscs attached to the
+   device (the root qdisc, all its children, children of children, etc.)
+ */
+
+struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
+{
+	struct Qdisc *q;
+
+	read_lock_bh(&qdisc_tree_lock);
+	list_for_each_entry(q, &dev->qdisc_list, list) {
+		if (q->handle == handle) {
+			read_unlock_bh(&qdisc_tree_lock);
+			return q;
+		}
+	}
+	read_unlock_bh(&qdisc_tree_lock);
+	return NULL;
+}
+
+static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
+{
+	unsigned long cl;
+	struct Qdisc *leaf;
+	struct Qdisc_class_ops *cops = p->ops->cl_ops;
+
+	if (cops == NULL)
+		return NULL;
+	cl = cops->get(p, classid);
+
+	if (cl == 0)
+		return NULL;
+	leaf = cops->leaf(p, cl);
+	cops->put(p, cl);
+	return leaf;
+}
+
+/* Find a queueing discipline by name */
+
+static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind)
+{
+	struct Qdisc_ops *q = NULL;
+
+	if (kind) {
+		read_lock(&qdisc_mod_lock);
+		for (q = qdisc_base; q; q = q->next) {
+			if (rtattr_strcmp(kind, q->id) == 0) {
+				if (!try_module_get(q->owner))
+					q = NULL;
+				break;
+			}
+		}
+		read_unlock(&qdisc_mod_lock);
+	}
+	return q;
+}
+
+static struct qdisc_rate_table *qdisc_rtab_list;
+
+struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab)
+{
+	struct qdisc_rate_table *rtab;
+
+	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
+		if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
+			rtab->refcnt++;
+			return rtab;
+		}
+	}
+
+	if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024)
+		return NULL;
+
+	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
+	if (rtab) {
+		rtab->rate = *r;
+		rtab->refcnt = 1;
+		memcpy(rtab->data, RTA_DATA(tab), 1024);
+		rtab->next = qdisc_rtab_list;
+		qdisc_rtab_list = rtab;
+	}
+	return rtab;
+}
+
+void qdisc_put_rtab(struct qdisc_rate_table *tab)
+{
+	struct qdisc_rate_table *rtab, **rtabp;
+
+	if (!tab || --tab->refcnt)
+		return;
+
+	for (rtabp = &qdisc_rtab_list; (rtab = *rtabp) != NULL; rtabp = &rtab->next) {
+		if (rtab == tab) {
+			*rtabp = rtab->next;
+			kfree(rtab);
+			return;
+		}
+	}
+}
+
+
+/* Allocate a unique handle from the space managed by the kernel */
+
+static u32 qdisc_alloc_handle(struct net_device *dev)
+{
+	int i = 0x10000;
+	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
+
+	do {
+		autohandle += TC_H_MAKE(0x10000U, 0);
+		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
+			autohandle = TC_H_MAKE(0x80000000U, 0);
+	} while (qdisc_lookup(dev, autohandle) && --i > 0);
+
+	return i > 0 ? autohandle : 0;
+}
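Qdisc handles pack a 16-bit major and a 16-bit minor number into one u32; qdisc_alloc_handle() hands out majors in the 0x8000-0xffff range and skips the value that would collide with the TC_H_ROOT pseudo-handle. A standalone sketch of that arithmetic; the TC_H_* macros mirror linux/pkt_sched.h, the loop body mirrors the function above.

	#include <stdio.h>

	/* Handle layout, as in linux/pkt_sched.h: 16-bit major, 16-bit minor. */
	#define TC_H_MAJ(h)		((h) & 0xFFFF0000U)
	#define TC_H_MIN(h)		((h) & 0x0000FFFFU)
	#define TC_H_MAKE(maj,min)	(TC_H_MAJ(maj) | TC_H_MIN(min))
	#define TC_H_ROOT		0xFFFFFFFFU

	int main(void)
	{
		unsigned int autohandle = TC_H_MAKE(0x80000000U, 0);

		/* qdisc_alloc_handle() steps the major by one each try,
		 * wrapping back to 8000:0 when it would reach ffff:0,
		 * i.e. TC_H_MAKE(TC_H_ROOT, 0). */
		for (int i = 0; i < 3; i++) {
			autohandle += TC_H_MAKE(0x10000U, 0);
			if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
				autohandle = TC_H_MAKE(0x80000000U, 0);
			printf("candidate %x:%x\n",
			       TC_H_MAJ(autohandle) >> 16, TC_H_MIN(autohandle));
		}
		return 0;
	}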
+
+/* Attach toplevel qdisc to device dev */
+
+static struct Qdisc *
+dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc)
+{
+	struct Qdisc *oqdisc;
+
+	if (dev->flags & IFF_UP)
+		dev_deactivate(dev);
+
+	qdisc_lock_tree(dev);
+	if (qdisc && qdisc->flags&TCQ_F_INGRESS) {
+		oqdisc = dev->qdisc_ingress;
+		/* Prune old scheduler */
+		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) {
+			/* delete */
+			qdisc_reset(oqdisc);
+			dev->qdisc_ingress = NULL;
+		} else {  /* new */
+			dev->qdisc_ingress = qdisc;
+		}
+
+	} else {
+
+		oqdisc = dev->qdisc_sleeping;
+
+		/* Prune old scheduler */
+		if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
+			qdisc_reset(oqdisc);
+
+		/* ... and graft new one */
+		if (qdisc == NULL)
+			qdisc = &noop_qdisc;
+		dev->qdisc_sleeping = qdisc;
+		dev->qdisc = &noop_qdisc;
+	}
+
+	qdisc_unlock_tree(dev);
+
+	if (dev->flags & IFF_UP)
+		dev_activate(dev);
+
+	return oqdisc;
+}
+
+
+/* Graft qdisc "new" to class "classid" of qdisc "parent" or
+   to device "dev".
+
+   The old qdisc is not destroyed, but returned in *old.
+ */
+
+static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
+		       u32 classid,
+		       struct Qdisc *new, struct Qdisc **old)
+{
+	int err = 0;
+	struct Qdisc *q = *old;
+
+	if (parent == NULL) {
+		if (q && q->flags&TCQ_F_INGRESS) {
+			*old = dev_graft_qdisc(dev, q);
+		} else {
+			*old = dev_graft_qdisc(dev, new);
+		}
+	} else {
+		struct Qdisc_class_ops *cops = parent->ops->cl_ops;
+
+		err = -EINVAL;
+
+		if (cops) {
+			unsigned long cl = cops->get(parent, classid);
+			if (cl) {
+				err = cops->graft(parent, cl, new, old);
+				if (new)
+					new->parent = classid;
+				cops->put(parent, cl);
+			}
+		}
+	}
+	return err;
+}
+
+/*
+   Allocate and initialize a new qdisc.
+
+   Parameters are passed via opt.
+ */
+
+static struct Qdisc *
+qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp)
+{
+	int err;
+	struct rtattr *kind = tca[TCA_KIND-1];
+	void *p = NULL;
+	struct Qdisc *sch;
+	struct Qdisc_ops *ops;
+	int size;
+
+	ops = qdisc_lookup_ops(kind);
+#ifdef CONFIG_KMOD
+	if (ops == NULL && kind != NULL) {
+		char name[IFNAMSIZ];
+		if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
+			/* We dropped the RTNL semaphore in order to
+			 * perform the module load.  So, even if we
+			 * succeeded in loading the module we have to
+			 * tell the caller to replay the request.  We
+			 * indicate this using -EAGAIN.
+			 * We replay the request because the device may
+			 * go away in the meantime.
+			 */
+			rtnl_unlock();
+			request_module("sch_%s", name);
+			rtnl_lock();
+			ops = qdisc_lookup_ops(kind);
+			if (ops != NULL) {
+				/* We will try qdisc_lookup_ops again,
+				 * so don't keep a reference.
+				 */
+				module_put(ops->owner);
+				err = -EAGAIN;
+				goto err_out;
+			}
+		}
+	}
+#endif
+
+	err = -EINVAL;
+	if (ops == NULL)
+		goto err_out;
+
+	/* ensure that the Qdisc and the private data are 32-byte aligned */
+	size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST);
+	size += ops->priv_size + QDISC_ALIGN_CONST;
+
+	p = kmalloc(size, GFP_KERNEL);
+	err = -ENOBUFS;
+	if (!p)
+		goto err_out2;
+	memset(p, 0, size);
+	sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST)
+	                       & ~QDISC_ALIGN_CONST);
+	sch->padded = (char *)sch - (char *)p;
+
+	INIT_LIST_HEAD(&sch->list);
+	skb_queue_head_init(&sch->q);
+
+	if (handle == TC_H_INGRESS)
+		sch->flags |= TCQ_F_INGRESS;
+
+	sch->ops = ops;
+	sch->enqueue = ops->enqueue;
+	sch->dequeue = ops->dequeue;
+	sch->dev = dev;
+	dev_hold(dev);
+	atomic_set(&sch->refcnt, 1);
+	sch->stats_lock = &dev->queue_lock;
+	if (handle == 0) {
+		handle = qdisc_alloc_handle(dev);
+		err = -ENOMEM;
+		if (handle == 0)
+			goto err_out3;
+	}
+
+	if (handle == TC_H_INGRESS)
+		sch->handle = TC_H_MAKE(TC_H_INGRESS, 0);
+	else
+		sch->handle = handle;
+
+	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) {
+		qdisc_lock_tree(dev);
+		list_add_tail(&sch->list, &dev->qdisc_list);
+		qdisc_unlock_tree(dev);
+
+#ifdef CONFIG_NET_ESTIMATOR
+		if (tca[TCA_RATE-1])
+			gen_new_estimator(&sch->bstats, &sch->rate_est,
+				sch->stats_lock, tca[TCA_RATE-1]);
+#endif
+		return sch;
+	}
+err_out3:
+	dev_put(dev);
+err_out2:
+	module_put(ops->owner);
+err_out:
+	*errp = err;
+	if (p)
+		kfree(p);
+	return NULL;
+}
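qdisc_create() aligns the Qdisc and its private area by over-allocating, rounding the object pointer up, and remembering the offset in sch->padded so the raw pointer can be recovered on free. A userspace sketch of the same pattern, assuming QDISC_ALIGN_CONST means 32-byte alignment as the comment above states; struct obj and the constant name are invented here.

	#include <stdio.h>
	#include <stdlib.h>

	#define ALIGN_CONST ((size_t)31)	/* stand-in for QDISC_ALIGN_CONST */

	struct obj {
		int padded;		/* offset back to the raw allocation */
		char payload[100];
	};

	int main(void)
	{
		/* Same pattern as qdisc_create(): pad the size, round the
		 * pointer up, keep the offset so free() can recover the
		 * original allocation (kfree() in the kernel). */
		size_t size = (sizeof(struct obj) + ALIGN_CONST) & ~ALIGN_CONST;
		void *p = malloc(size + ALIGN_CONST);
		if (!p)
			return 1;
		struct obj *o = (struct obj *)(((unsigned long)p + ALIGN_CONST)
					       & ~ALIGN_CONST);
		o->padded = (char *)o - (char *)p;
		printf("raw=%p aligned=%p padded=%d\n", p, (void *)o, o->padded);
		free((char *)o - o->padded);	/* recover the raw pointer */
		return 0;
	}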
+
+static int qdisc_change(struct Qdisc *sch, struct rtattr **tca)
+{
+	if (tca[TCA_OPTIONS-1]) {
+		int err;
+
+		if (sch->ops->change == NULL)
+			return -EINVAL;
+		err = sch->ops->change(sch, tca[TCA_OPTIONS-1]);
+		if (err)
+			return err;
+	}
+#ifdef CONFIG_NET_ESTIMATOR
+	if (tca[TCA_RATE-1])
+		gen_replace_estimator(&sch->bstats, &sch->rate_est,
+			sch->stats_lock, tca[TCA_RATE-1]);
+#endif
+	return 0;
+}
+
+struct check_loop_arg
+{
+	struct qdisc_walker 	w;
+	struct Qdisc		*p;
+	int			depth;
+};
+
+static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
+
+static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
+{
+	struct check_loop_arg	arg;
+
+	if (q->ops->cl_ops == NULL)
+		return 0;
+
+	arg.w.stop = arg.w.skip = arg.w.count = 0;
+	arg.w.fn = check_loop_fn;
+	arg.depth = depth;
+	arg.p = p;
+	q->ops->cl_ops->walk(q, &arg.w);
+	return arg.w.stop ? -ELOOP : 0;
+}
+
+static int
+check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
+{
+	struct Qdisc *leaf;
+	struct Qdisc_class_ops *cops = q->ops->cl_ops;
+	struct check_loop_arg *arg = (struct check_loop_arg *)w;
+
+	leaf = cops->leaf(q, cl);
+	if (leaf) {
+		if (leaf == arg->p || arg->depth > 7)
+			return -ELOOP;
+		return check_loop(leaf, arg->p, arg->depth + 1);
+	}
+	return 0;
+}
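check_loop() relies on the walker convention used throughout this file: a small struct qdisc_walker is embedded as the first member of a caller-specific argument block, and the callback casts the base pointer back to the enclosing struct. A standalone sketch of that embedding idiom; struct walker, visit() and walk() are invented stand-ins.

	#include <stdio.h>

	/* Base struct passed to an iterator, embedded as the FIRST member
	 * of a larger argument block, as check_loop_arg embeds
	 * struct qdisc_walker. */
	struct walker {
		int stop;
		int (*fn)(int item, struct walker *w);
	};

	struct loop_arg {
		struct walker w;	/* must stay first: the callback downcasts */
		int depth;
	};

	static int visit(int item, struct walker *w)
	{
		struct loop_arg *arg = (struct loop_arg *)w;	/* safe: w is first */

		printf("item %d at depth %d\n", item, arg->depth);
		if (item == 3)
			w->stop = 1;	/* ask the iterator to bail out (-ELOOP) */
		return w->stop ? -1 : 0;
	}

	static void walk(int n, struct walker *w)
	{
		for (int i = 0; i < n && !w->stop; i++)
			if (w->fn(i, w) < 0)
				return;
	}

	int main(void)
	{
		struct loop_arg arg = { .w = { .stop = 0, .fn = visit }, .depth = 7 };
		walk(10, &arg.w);
		return arg.w.stop ? 1 : 0;
	}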
+
+/*
+ * Delete/get qdisc.
+ */
+
+static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct rtattr **tca = arg;
+	struct net_device *dev;
+	u32 clid = tcm->tcm_parent;
+	struct Qdisc *q = NULL;
+	struct Qdisc *p = NULL;
+	int err;
+
+	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
+		return -ENODEV;
+
+	if (clid) {
+		if (clid != TC_H_ROOT) {
+			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
+				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+					return -ENOENT;
+				q = qdisc_leaf(p, clid);
+			} else { /* ingress */
+				q = dev->qdisc_ingress;
+			}
+		} else {
+			q = dev->qdisc_sleeping;
+		}
+		if (!q)
+			return -ENOENT;
+
+		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
+			return -EINVAL;
+	} else {
+		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+			return -ENOENT;
+	}
+
+	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+		return -EINVAL;
+
+	if (n->nlmsg_type == RTM_DELQDISC) {
+		if (!clid)
+			return -EINVAL;
+		if (q->handle == 0)
+			return -ENOENT;
+		if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0)
+			return err;
+		if (q) {
+			qdisc_notify(skb, n, clid, q, NULL);
+			spin_lock_bh(&dev->queue_lock);
+			qdisc_destroy(q);
+			spin_unlock_bh(&dev->queue_lock);
+		}
+	} else {
+		qdisc_notify(skb, n, clid, NULL, q);
+	}
+	return 0;
+}
+
+/*
+   Create/change qdisc.
+ */
+
+static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct tcmsg *tcm;
+	struct rtattr **tca;
+	struct net_device *dev;
+	u32 clid;
+	struct Qdisc *q, *p;
+	int err;
+
+replay:
+	/* Reinit, just in case something touches this. */
+	tcm = NLMSG_DATA(n);
+	tca = arg;
+	clid = tcm->tcm_parent;
+	q = p = NULL;
+
+	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
+		return -ENODEV;
+
+	if (clid) {
+		if (clid != TC_H_ROOT) {
+			if (clid != TC_H_INGRESS) {
+				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
+					return -ENOENT;
+				q = qdisc_leaf(p, clid);
+			} else { /* ingress */
+				q = dev->qdisc_ingress;
+			}
+		} else {
+			q = dev->qdisc_sleeping;
+		}
+
+		/* It may be the default qdisc, ignore it */
+		if (q && q->handle == 0)
+			q = NULL;
+
+		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
+			if (tcm->tcm_handle) {
+				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
+					return -EEXIST;
+				if (TC_H_MIN(tcm->tcm_handle))
+					return -EINVAL;
+				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
+					goto create_n_graft;
+				if (n->nlmsg_flags&NLM_F_EXCL)
+					return -EEXIST;
+				if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+					return -EINVAL;
+				if (q == p ||
+				    (p && check_loop(q, p, 0)))
+					return -ELOOP;
+				atomic_inc(&q->refcnt);
+				goto graft;
+			} else {
+				if (q == NULL)
+					goto create_n_graft;
+
+				/* This magic test requires explanation.
+				 *
+				 *   We know that some child q is already
+				 *   attached to this parent and have a choice:
+				 *   either to change it or to create/graft a new one.
+				 *
+				 *   1. We are allowed to create/graft only
+				 *   if CREATE and REPLACE flags are set.
+				 *
+				 *   2. If EXCL is set, the requestor wanted to say
+				 *   that qdisc tcm_handle is not expected
+				 *   to exist, so we choose create/graft too.
+				 *
+				 *   3. The last case is when no flags are set.
+				 *   Alas, this is a sort of hole in the API; we
+				 *   cannot decide what to do unambiguously.
+				 *   For now we select create/graft if the
+				 *   user gave a KIND which does not match the
+				 *   existing one.
+				 */
+				if ((n->nlmsg_flags&NLM_F_CREATE) &&
+				    (n->nlmsg_flags&NLM_F_REPLACE) &&
+				    ((n->nlmsg_flags&NLM_F_EXCL) ||
+				     (tca[TCA_KIND-1] &&
+				      rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))))
+					goto create_n_graft;
+			}
+		}
+	} else {
+		if (!tcm->tcm_handle)
+			return -EINVAL;
+		q = qdisc_lookup(dev, tcm->tcm_handle);
+	}
+
+	/* Change qdisc parameters */
+	if (q == NULL)
+		return -ENOENT;
+	if (n->nlmsg_flags&NLM_F_EXCL)
+		return -EEXIST;
+	if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id))
+		return -EINVAL;
+	err = qdisc_change(q, tca);
+	if (err == 0)
+		qdisc_notify(skb, n, clid, NULL, q);
+	return err;
+
+create_n_graft:
+	if (!(n->nlmsg_flags&NLM_F_CREATE))
+		return -ENOENT;
+	if (clid == TC_H_INGRESS)
+		q = qdisc_create(dev, tcm->tcm_parent, tca, &err);
+	else
+		q = qdisc_create(dev, tcm->tcm_handle, tca, &err);
+	if (q == NULL) {
+		if (err == -EAGAIN)
+			goto replay;
+		return err;
+	}
+
+graft:
+	if (1) {
+		struct Qdisc *old_q = NULL;
+		err = qdisc_graft(dev, p, clid, q, &old_q);
+		if (err) {
+			if (q) {
+				spin_lock_bh(&dev->queue_lock);
+				qdisc_destroy(q);
+				spin_unlock_bh(&dev->queue_lock);
+			}
+			return err;
+		}
+		qdisc_notify(skb, n, clid, old_q, q);
+		if (old_q) {
+			spin_lock_bh(&dev->queue_lock);
+			qdisc_destroy(old_q);
+			spin_unlock_bh(&dev->queue_lock);
+		}
+	}
+	return 0;
+}
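The "magic test" in the comment above condenses to a predicate over the netlink flags plus a KIND comparison. A tiny table-driven sketch; the flag values match linux/netlink.h, while decide() and kind_differs are invented names (kind_differs stands for a non-zero rtattr_strcmp(TCA_KIND, q->ops->id)).

	#include <stdio.h>

	#define NLM_F_REPLACE 0x100	/* values from linux/netlink.h */
	#define NLM_F_EXCL    0x200
	#define NLM_F_CREATE  0x400

	/* Mirror of the "magic test": a child qdisc is already attached,
	 * choose between changing it in place and create/graft. */
	static const char *decide(int flags, int kind_differs)
	{
		if ((flags & NLM_F_CREATE) && (flags & NLM_F_REPLACE) &&
		    ((flags & NLM_F_EXCL) || kind_differs))
			return "create_n_graft";
		return "change in place";
	}

	int main(void)
	{
		printf("%s\n", decide(NLM_F_CREATE | NLM_F_REPLACE | NLM_F_EXCL, 0));
		printf("%s\n", decide(NLM_F_CREATE | NLM_F_REPLACE, 1));
		printf("%s\n", decide(NLM_F_CREATE | NLM_F_REPLACE, 0));
		printf("%s\n", decide(0, 1));
		return 0;
	}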
+
+static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
+			 u32 pid, u32 seq, unsigned flags, int event)
+{
+	struct tcmsg *tcm;
+	struct nlmsghdr  *nlh;
+	unsigned char	 *b = skb->tail;
+	struct gnet_dump d;
+
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
+	nlh->nlmsg_flags = flags;
+	tcm = NLMSG_DATA(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = q->dev->ifindex;
+	tcm->tcm_parent = clid;
+	tcm->tcm_handle = q->handle;
+	tcm->tcm_info = atomic_read(&q->refcnt);
+	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
+	if (q->ops->dump && q->ops->dump(q, skb) < 0)
+		goto rtattr_failure;
+	q->qstats.qlen = q->q.qlen;
+
+	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
+			TCA_XSTATS, q->stats_lock, &d) < 0)
+		goto rtattr_failure;
+
+	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
+		goto rtattr_failure;
+
+	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
+#ifdef CONFIG_NET_ESTIMATOR
+	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
+#endif
+	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
+		goto rtattr_failure;
+
+	if (gnet_stats_finish_copy(&d) < 0)
+		goto rtattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n,
+			u32 clid, struct Qdisc *old, struct Qdisc *new)
+{
+	struct sk_buff *skb;
+	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	if (old && old->handle) {
+		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
+			goto err_out;
+	}
+	if (new) {
+		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq,
+				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
+			goto err_out;
+	}
+
+	if (skb->len)
+		return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+
+err_out:
+	kfree_skb(skb);
+	return -EINVAL;
+}
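tc_fill_qdisc() remembers skb->tail before writing and, if any attribute fails to fit, trims back to that mark so a half-built message is never sent. A toy userspace version of that mark/rollback pattern; struct buf, emit() and fill_msg() are invented.

	#include <stdio.h>
	#include <string.h>

	struct buf { char data[16]; int len; };	/* toy skb */

	static int emit(struct buf *b, const char *s)
	{
		size_t n = strlen(s);

		if (b->len + (int)n > (int)sizeof(b->data))
			return -1;
		memcpy(b->data + b->len, s, n);
		b->len += (int)n;
		return 0;
	}

	/* Mirrors tc_fill_qdisc(): record the mark, append attributes,
	 * roll back to the mark if anything fails. */
	static int fill_msg(struct buf *b, const char *a1, const char *a2)
	{
		int mark = b->len;		/* like: b = skb->tail            */

		if (emit(b, a1) < 0 || emit(b, a2) < 0) {
			b->len = mark;		/* like: skb_trim(skb, b - data) */
			return -1;
		}
		return 0;
	}

	int main(void)
	{
		struct buf b = { .len = 0 };
		printf("%d len=%d\n", fill_msg(&b, "abcdef", "ghijkl"), b.len);	/* 0 len=12  */
		printf("%d len=%d\n", fill_msg(&b, "mn", "opqrstu"), b.len);	/* -1 len=12 */
		return 0;
	}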
+
+static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx, q_idx;
+	int s_idx, s_q_idx;
+	struct net_device *dev;
+	struct Qdisc *q;
+
+	s_idx = cb->args[0];
+	s_q_idx = q_idx = cb->args[1];
+	read_lock(&dev_base_lock);
+	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
+		if (idx < s_idx)
+			continue;
+		if (idx > s_idx)
+			s_q_idx = 0;
+		read_lock_bh(&qdisc_tree_lock);
+		q_idx = 0;
+		list_for_each_entry(q, &dev->qdisc_list, list) {
+			if (q_idx < s_q_idx) {
+				q_idx++;
+				continue;
+			}
+			if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
+					  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) {
+				read_unlock_bh(&qdisc_tree_lock);
+				goto done;
+			}
+			q_idx++;
+		}
+		read_unlock_bh(&qdisc_tree_lock);
+	}
+
+done:
+	read_unlock(&dev_base_lock);
+
+	cb->args[0] = idx;
+	cb->args[1] = q_idx;
+
+	return skb->len;
+}
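tc_dump_qdisc() fills one bounded skb per callback invocation and must resume where the previous call stopped; cb->args[0] and cb->args[1] carry the device and qdisc indices across calls. A userspace sketch of that resumable two-level dump; struct cb, dump(), the 3x2 universe and the budget are all invented.

	#include <stdio.h>

	struct cb { long args[2]; };	/* stand-in for netlink_callback state */

	/* Emit at most `budget` entries per call, remembering where to
	 * resume, like tc_dump_qdisc() does when the skb fills up. */
	static int dump(struct cb *cb, int budget)
	{
		int emitted = 0;
		long s_idx = cb->args[0], s_q_idx = cb->args[1];

		for (long idx = s_idx; idx < 3; idx++) {
			if (idx > s_idx)
				s_q_idx = 0;	/* new device: restart inner index */
			for (long q_idx = s_q_idx; q_idx < 2; q_idx++) {
				if (emitted == budget) {
					cb->args[0] = idx;	/* resume point */
					cb->args[1] = q_idx;
					return emitted;
				}
				printf("dev %ld qdisc %ld\n", idx, q_idx);
				emitted++;
			}
		}
		cb->args[0] = 3;	/* past the end: done */
		cb->args[1] = 0;
		return emitted;
	}

	int main(void)
	{
		struct cb cb = { { 0, 0 } };

		while (dump(&cb, 2) == 2)	/* a short batch means "done" */
			puts("-- skb full, next call --");
		return 0;
	}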
+
+
+
+/************************************************
+ *	Traffic classes manipulation.		*
+ ************************************************/
+
+
+
+static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
+{
+	struct tcmsg *tcm = NLMSG_DATA(n);
+	struct rtattr **tca = arg;
+	struct net_device *dev;
+	struct Qdisc *q = NULL;
+	struct Qdisc_class_ops *cops;
+	unsigned long cl = 0;
+	unsigned long new_cl;
+	u32 pid = tcm->tcm_parent;
+	u32 clid = tcm->tcm_handle;
+	u32 qid = TC_H_MAJ(clid);
+	int err;
+
+	if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL)
+		return -ENODEV;
+
+	/*
+	   parent == TC_H_UNSPEC - unspecified parent.
+	   parent == TC_H_ROOT   - class is root, which has no parent.
+	   parent == X:0	 - parent is root class.
+	   parent == X:Y	 - parent is a node in hierarchy.
+	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.
+
+	   handle == 0:0	 - generate handle from kernel pool.
+	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
+	   handle == X:Y	 - self-explanatory.
+	   handle == X:0	 - root class.
+	 */
+
+	/* Step 1. Determine qdisc handle X:0 */
+
+	if (pid != TC_H_ROOT) {
+		u32 qid1 = TC_H_MAJ(pid);
+
+		if (qid && qid1) {
+			/* If both majors are known, they must be identical. */
+			if (qid != qid1)
+				return -EINVAL;
+		} else if (qid1) {
+			qid = qid1;
+		} else if (qid == 0)
+			qid = dev->qdisc_sleeping->handle;
+
+		/* Now qid is a genuine qdisc handle, consistent with
+		   both parent and child.
+
+		   TC_H_MAJ(pid) may still be unspecified, complete it now.
+		 */
+		if (pid)
+			pid = TC_H_MAKE(qid, pid);
+	} else {
+		if (qid == 0)
+			qid = dev->qdisc_sleeping->handle;
+	}
+
+	/* OK. Locate the qdisc */
+	if ((q = qdisc_lookup(dev, qid)) == NULL)
+		return -ENOENT;
+
+	/* And check that it supports classes */
+	cops = q->ops->cl_ops;
+	if (cops == NULL)
+		return -EINVAL;
+
+	/* Now try to get the class */
+	if (clid == 0) {
+		if (pid == TC_H_ROOT)
+			clid = qid;
+	} else
+		clid = TC_H_MAKE(qid, clid);
+
+	if (clid)
+		cl = cops->get(q, clid);
+
+	if (cl == 0) {
+		err = -ENOENT;
+		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
+			goto out;
+	} else {
+		switch (n->nlmsg_type) {
+		case RTM_NEWTCLASS:
+			err = -EEXIST;
+			if (n->nlmsg_flags&NLM_F_EXCL)
+				goto out;
+			break;
+		case RTM_DELTCLASS:
+			err = cops->delete(q, cl);
+			if (err == 0)
+				tclass_notify(skb, n, q, cl, RTM_DELTCLASS);
+			goto out;
+		case RTM_GETTCLASS:
+			err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS);
+			goto out;
+		default:
+			err = -EINVAL;
+			goto out;
+		}
+	}
+
+	new_cl = cl;
+	err = cops->change(q, clid, pid, tca, &new_cl);
+	if (err == 0)
+		tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS);
+
+out:
+	if (cl)
+		cops->put(q, cl);
+
+	return err;
+}
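Step 1 above reconciles the majors of tcm_parent and tcm_handle into a single qdisc handle. A standalone sketch of that resolution; resolve_qid() and root_handle are invented (root_handle stands for dev->qdisc_sleeping->handle), while the TC_H_* macros mirror linux/pkt_sched.h.

	#include <stdio.h>

	#define TC_H_MAJ(h)		((h) & 0xFFFF0000U)
	#define TC_H_MIN(h)		((h) & 0x0000FFFFU)
	#define TC_H_MAKE(maj,min)	(TC_H_MAJ(maj) | TC_H_MIN(min))
	#define TC_H_ROOT		0xFFFFFFFFU

	/* Condensed Step 1 of tc_ctl_tclass(): returns 0 on conflicting
	 * majors (-EINVAL in the kernel). */
	static unsigned int resolve_qid(unsigned int pid, unsigned int clid,
					unsigned int root_handle)
	{
		unsigned int qid = TC_H_MAJ(clid);

		if (pid != TC_H_ROOT) {
			unsigned int qid1 = TC_H_MAJ(pid);

			if (qid && qid1 && qid != qid1)
				return 0;	/* both known, must be identical */
			if (qid1)
				qid = qid1;
		}
		if (qid == 0)
			qid = TC_H_MAJ(root_handle);	/* fall back to root qdisc */
		return qid;
	}

	int main(void)
	{
		unsigned int root = TC_H_MAKE(0x10000U, 0);	/* 1:0 */

		printf("%x\n", resolve_qid(TC_H_MAKE(0x10000U, 1), 0, root));	/* 10000 */
		printf("%x\n", resolve_qid(TC_H_ROOT, 0, root));		/* 10000 */
		printf("%x\n", resolve_qid(TC_H_MAKE(0x10000U, 1),
					   TC_H_MAKE(0x20000U, 2), root));	/* 0: conflict */
		return 0;
	}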
+
+
+static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
+			  unsigned long cl,
+			  u32 pid, u32 seq, unsigned flags, int event)
+{
+	struct tcmsg *tcm;
+	struct nlmsghdr  *nlh;
+	unsigned char	 *b = skb->tail;
+	struct gnet_dump d;
+	struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
+
+	nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm));
+	nlh->nlmsg_flags = flags;
+	tcm = NLMSG_DATA(nlh);
+	tcm->tcm_family = AF_UNSPEC;
+	tcm->tcm_ifindex = q->dev->ifindex;
+	tcm->tcm_parent = q->handle;
+	tcm->tcm_handle = q->handle;
+	tcm->tcm_info = 0;
+	RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id);
+	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
+		goto rtattr_failure;
+
+	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
+			TCA_XSTATS, q->stats_lock, &d) < 0)
+		goto rtattr_failure;
+
+	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
+		goto rtattr_failure;
+
+	if (gnet_stats_finish_copy(&d) < 0)
+		goto rtattr_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n,
+			  struct Qdisc *q, unsigned long cl, int event)
+{
+	struct sk_buff *skb;
+	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return -ENOBUFS;
+
+	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+
+	return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
+}
+
+struct qdisc_dump_args
+{
+	struct qdisc_walker w;
+	struct sk_buff *skb;
+	struct netlink_callback *cb;
+};
+
+static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
+{
+	struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
+
+	return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
+			      a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
+}
+
+static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int t;
+	int s_t;
+	struct net_device *dev;
+	struct Qdisc *q;
+	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
+	struct qdisc_dump_args arg;
+
+	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
+		return 0;
+	if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL)
+		return 0;
+
+	s_t = cb->args[0];
+	t = 0;
+
+	read_lock_bh(&qdisc_tree_lock);
+	list_for_each_entry(q, &dev->qdisc_list, list) {
+		if (t < s_t || !q->ops->cl_ops ||
+		    (tcm->tcm_parent &&
+		     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
+			t++;
+			continue;
+		}
+		if (t > s_t)
+			memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
+		arg.w.fn = qdisc_class_dump;
+		arg.skb = skb;
+		arg.cb = cb;
+		arg.w.stop = 0;
+		arg.w.skip = cb->args[1];
+		arg.w.count = 0;
+		q->ops->cl_ops->walk(q, &arg.w);
+		cb->args[1] = arg.w.count;
+		if (arg.w.stop)
+			break;
+		t++;
+	}
+	read_unlock_bh(&qdisc_tree_lock);
+
+	cb->args[0] = t;
+
+	dev_put(dev);
+	return skb->len;
+}
+
+/* Main classifier routine: scans the classifier chain attached
+   to this qdisc, (optionally) tests for protocol and asks
+   specific classifiers.
+ */
+int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
+	struct tcf_result *res)
+{
+	int err = 0;
+	u32 protocol = skb->protocol;
+#ifdef CONFIG_NET_CLS_ACT
+	struct tcf_proto *otp = tp;
+reclassify:
+#endif
+	protocol = skb->protocol;
+
+	for ( ; tp; tp = tp->next) {
+		if ((tp->protocol == protocol ||
+		     tp->protocol == __constant_htons(ETH_P_ALL)) &&
+		    (err = tp->classify(skb, tp, res)) >= 0) {
+#ifdef CONFIG_NET_CLS_ACT
+			if (TC_ACT_RECLASSIFY == err) {
+				__u32 verd = (__u32)G_TC_VERD(skb->tc_verd);
+				tp = otp;
+
+				if (MAX_REC_LOOP < verd++) {
+					printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n",
+					       tp->prio&0xffff, ntohs(tp->protocol));
+					return TC_ACT_SHOT;
+				}
+				skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
+				goto reclassify;
+			} else {
+				if (skb->tc_verd)
+					skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
+				return err;
+			}
+#else
+			return err;
+#endif
+		}
+	}
+	return -1;
+}
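On TC_ACT_RECLASSIFY the packet is sent back to the head of the filter chain, with a per-packet verdict counter bounding the recursion. A userspace skeleton of that loop; classify_once() and MAX_LOOP are invented, while the TC_ACT_* values match linux/pkt_cls.h.

	#include <stdio.h>

	#define TC_ACT_RECLASSIFY 1	/* values as in linux/pkt_cls.h */
	#define TC_ACT_SHOT       2
	#define MAX_LOOP          4	/* stand-in for MAX_REC_LOOP    */

	/* Made-up classifier that asks for two extra passes. */
	static int classify_once(unsigned int pass)
	{
		return pass < 2 ? TC_ACT_RECLASSIFY : 0;	/* 0: final verdict */
	}

	int main(void)
	{
		unsigned int verd = 0;	/* stand-in for G_TC_VERD(skb->tc_verd) */
		int res;

	reclassify:
		res = classify_once(verd);
		if (res == TC_ACT_RECLASSIFY) {
			if (MAX_LOOP < verd++) {	/* bound the recursion */
				puts("reclassify loop: packet dropped");
				return TC_ACT_SHOT;
			}
			goto reclassify;
		}
		printf("classified after %u extra passes\n", verd);
		return res;
	}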
+
+static int psched_us_per_tick = 1;
+static int psched_tick_per_us = 1;
+
+#ifdef CONFIG_PROC_FS
+static int psched_show(struct seq_file *seq, void *v)
+{
+	seq_printf(seq, "%08x %08x %08x %08x\n",
+		   psched_tick_per_us, psched_us_per_tick,
+		   1000000, HZ);
+
+	return 0;
+}
+
+static int psched_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, psched_show, PDE(inode)->data);
+}
+
+static struct file_operations psched_fops = {
+	.owner = THIS_MODULE,
+	.open = psched_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+#endif
+
+#ifdef CONFIG_NET_SCH_CLK_CPU
+psched_tdiff_t psched_clock_per_hz;
+int psched_clock_scale;
+EXPORT_SYMBOL(psched_clock_per_hz);
+EXPORT_SYMBOL(psched_clock_scale);
+
+psched_time_t psched_time_base;
+cycles_t psched_time_mark;
+EXPORT_SYMBOL(psched_time_mark);
+EXPORT_SYMBOL(psched_time_base);
+
+/*
+ * Periodically adjust psched_time_base to avoid overflow
+ * with 32-bit get_cycles(). Safe up to 4GHz CPU.
+ */
+static void psched_tick(unsigned long);
+static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0);
+
+static void psched_tick(unsigned long dummy)
+{
+	if (sizeof(cycles_t) == sizeof(u32)) {
+		psched_time_t dummy_stamp;
+		PSCHED_GET_TIME(dummy_stamp);
+		psched_timer.expires = jiffies + 1*HZ;
+		add_timer(&psched_timer);
+	}
+}
+
+int __init psched_calibrate_clock(void)
+{
+	psched_time_t stamp, stamp1;
+	struct timeval tv, tv1;
+	psched_tdiff_t delay;
+	long rdelay;
+	unsigned long stop;
+
+	psched_tick(0);
+	stop = jiffies + HZ/10;
+	PSCHED_GET_TIME(stamp);
+	do_gettimeofday(&tv);
+	while (time_before(jiffies, stop)) {
+		barrier();
+		cpu_relax();
+	}
+	PSCHED_GET_TIME(stamp1);
+	do_gettimeofday(&tv1);
+
+	delay = PSCHED_TDIFF(stamp1, stamp);
+	rdelay = tv1.tv_usec - tv.tv_usec;
+	rdelay += (tv1.tv_sec - tv.tv_sec)*1000000;
+	if (rdelay > delay)
+		return -1;
+	delay /= rdelay;
+	psched_tick_per_us = delay;
+	while ((delay >>= 1) != 0)
+		psched_clock_scale++;
+	psched_us_per_tick = 1<<psched_clock_scale;
+	psched_clock_per_hz = (psched_tick_per_us*(1000000/HZ))>>psched_clock_scale;
+	return 0;
+}
+#endif
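psched_calibrate_clock() busy-waits roughly HZ/10, divides the timestamp delta by the elapsed microseconds, and derives a power-of-two scale from the quotient. The arithmetic in its last few lines, worked through with invented numbers (a hypothetical 800 MHz clock source and HZ=1000):

	#include <stdio.h>

	int main(void)
	{
		long delay = 80000000;	/* ticks counted in the busy-wait */
		long rdelay = 100000;	/* ~HZ/10 worth of microseconds   */
		int scale = 0;

		delay /= rdelay;		/* ticks per microsecond: 800 */
		long tick_per_us = delay;
		while ((delay >>= 1) != 0)	/* floor(log2(800)) = 9       */
			scale++;
		long us_per_tick = 1L << scale;	/* 512                        */
		long clock_per_hz = (tick_per_us * (1000000 / 1000)) >> scale;

		printf("tick/us=%ld scale=%d us/tick=%ld clock/HZ=%ld\n",
		       tick_per_us, scale, us_per_tick, clock_per_hz);
		return 0;
	}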
+
+static int __init pktsched_init(void)
+{
+	struct rtnetlink_link *link_p;
+
+#ifdef CONFIG_NET_SCH_CLK_CPU
+	if (psched_calibrate_clock() < 0)
+		return -1;
+#elif defined(CONFIG_NET_SCH_CLK_JIFFIES)
+	psched_tick_per_us = HZ<<PSCHED_JSCALE;
+	psched_us_per_tick = 1000000;
+#endif
+
+	link_p = rtnetlink_links[PF_UNSPEC];
+
+	/* Setup rtnetlink links. It is done here to avoid
+	   exporting a large number of public symbols.
+	 */
+
+	if (link_p) {
+		link_p[RTM_NEWQDISC-RTM_BASE].doit = tc_modify_qdisc;
+		link_p[RTM_DELQDISC-RTM_BASE].doit = tc_get_qdisc;
+		link_p[RTM_GETQDISC-RTM_BASE].doit = tc_get_qdisc;
+		link_p[RTM_GETQDISC-RTM_BASE].dumpit = tc_dump_qdisc;
+		link_p[RTM_NEWTCLASS-RTM_BASE].doit = tc_ctl_tclass;
+		link_p[RTM_DELTCLASS-RTM_BASE].doit = tc_ctl_tclass;
+		link_p[RTM_GETTCLASS-RTM_BASE].doit = tc_ctl_tclass;
+		link_p[RTM_GETTCLASS-RTM_BASE].dumpit = tc_dump_tclass;
+	}
+
+	register_qdisc(&pfifo_qdisc_ops);
+	register_qdisc(&bfifo_qdisc_ops);
+	proc_net_fops_create("psched", 0, &psched_fops);
+
+	return 0;
+}
+
+subsys_initcall(pktsched_init);
+
+EXPORT_SYMBOL(qdisc_get_rtab);
+EXPORT_SYMBOL(qdisc_put_rtab);
+EXPORT_SYMBOL(register_qdisc);
+EXPORT_SYMBOL(unregister_qdisc);
+EXPORT_SYMBOL(tc_classify);