Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
[firefly-linux-kernel-4.4.55.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
   qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)

   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.
67
   The goal of the routines in this file is to translate
   information supplied by the user in the form of handles
   into a form more intelligible to the kernel, to perform some
   sanity checks and the part of the work common to all qdiscs,
   and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
   For complicated disciplines with multiple queues q->q is not
   a real packet queue; however, q->q.qlen must be valid.
88
89    ---enqueue
90
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
/* Register a queueing discipline on the global qdisc_base list.
 *
 * Trivial qdiscs may leave enqueue/dequeue/peek NULL; the noop
 * equivalents are filled in here.  A qdisc supplying its own dequeue
 * must also supply peek.  If class ops are present, the mandatory
 * get/put/walk/leaf hooks must all be set, and a classifier chain
 * additionally requires both bind_tcf and unbind_tcf.
 *
 * Returns 0 on success, -EEXIST if a qdisc with the same id is
 * already registered, or -EINVAL on an inconsistent ops vector.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* Walk to the tail; bail out if the id is already taken. */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;	/* custom dequeue needs a real peek */
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		/* Classful qdiscs must implement the core class hooks. */
		if (!(cops->get && cops->put && cops->walk && cops->leaf))
			goto out_einval;

		/* A filter chain needs both bind and unbind. */
		if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	/* Append at the tail; qp points at the terminating NULL link. */
	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
/* Get default qdisc if not otherwise specified */
/* Copy the id of the current default qdisc into @name (at most @len
 * bytes, NUL-terminated by strlcpy).  The read lock keeps the id
 * stable against a concurrent qdisc_set_default().
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
210
211 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
212 {
213         struct Qdisc_ops *q = NULL;
214
215         for (q = qdisc_base; q; q = q->next) {
216                 if (!strcmp(name, q->id)) {
217                         if (!try_module_get(q->owner))
218                                 q = NULL;
219                         break;
220                 }
221         }
222
223         return q;
224 }
225
/* Set new default qdisc to use */
/* Replace the system-wide default qdisc by name.  Requires
 * CAP_NET_ADMIN.  Holds a module reference on the new default and
 * drops the one held on the old default.  Returns 0 or -ENOENT if no
 * such qdisc could be found or loaded.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		/* Re-scan after reacquiring the lock; the module load may
		 * have registered the qdisc in the meantime.
		 */
		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256    (root qdisc, all its children, children of children etc.)
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!(root->flags & TCQ_F_BUILTIN) &&
264             root->handle == handle)
265                 return root;
266
267         list_for_each_entry(q, &root->list, list) {
268                 if (q->handle == handle)
269                         return q;
270         }
271         return NULL;
272 }
273
274 void qdisc_list_add(struct Qdisc *q)
275 {
276         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
277                 struct Qdisc *root = qdisc_dev(q)->qdisc;
278
279                 WARN_ON_ONCE(root == &noop_qdisc);
280                 list_add_tail(&q->list, &root->list);
281         }
282 }
283 EXPORT_SYMBOL(qdisc_list_add);
284
285 void qdisc_list_del(struct Qdisc *q)
286 {
287         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
288                 list_del(&q->list);
289 }
290 EXPORT_SYMBOL(qdisc_list_del);
291
292 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
293 {
294         struct Qdisc *q;
295
296         q = qdisc_match_from_root(dev->qdisc, handle);
297         if (q)
298                 goto out;
299
300         if (dev_ingress_queue(dev))
301                 q = qdisc_match_from_root(
302                         dev_ingress_queue(dev)->qdisc_sleeping,
303                         handle);
304 out:
305         return q;
306 }
307
308 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
309 {
310         unsigned long cl;
311         struct Qdisc *leaf;
312         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
313
314         if (cops == NULL)
315                 return NULL;
316         cl = cops->get(p, classid);
317
318         if (cl == 0)
319                 return NULL;
320         leaf = cops->leaf(p, cl);
321         cops->put(p, cl);
322         return leaf;
323 }
324
325 /* Find queueing discipline by name */
326
327 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
328 {
329         struct Qdisc_ops *q = NULL;
330
331         if (kind) {
332                 read_lock(&qdisc_mod_lock);
333                 for (q = qdisc_base; q; q = q->next) {
334                         if (nla_strcmp(kind, q->id) == 0) {
335                                 if (!try_module_get(q->owner))
336                                         q = NULL;
337                                 break;
338                         }
339                 }
340                 read_unlock(&qdisc_mod_lock);
341         }
342         return q;
343 }
344
345 /* The linklayer setting were not transferred from iproute2, in older
346  * versions, and the rate tables lookup systems have been dropped in
347  * the kernel. To keep backward compatible with older iproute2 tc
348  * utils, we detect the linklayer setting by detecting if the rate
349  * table were modified.
350  *
351  * For linklayer ATM table entries, the rate table will be aligned to
352  * 48 bytes, thus some table entries will contain the same value.  The
353  * mpu (min packet unit) is also encoded into the old rate table, thus
354  * starting from the mpu, we find low and high table entries for
355  * mapping this cell.  If these entries contain the same value, when
356  * the rate tables have been modified for linklayer ATM.
357  *
358  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
359  * and then roundup to the next cell, calc the table entry one below,
360  * and compare.
361  */
362 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
363 {
364         int low       = roundup(r->mpu, 48);
365         int high      = roundup(low+1, 48);
366         int cell_low  = low >> r->cell_log;
367         int cell_high = (high >> r->cell_log) - 1;
368
369         /* rtab is too inaccurate at rates > 100Mbit/s */
370         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
371                 pr_debug("TC linklayer: Giving up ATM detection\n");
372                 return TC_LINKLAYER_ETHERNET;
373         }
374
375         if ((cell_high > cell_low) && (cell_high < 256)
376             && (rtab[cell_low] == rtab[cell_high])) {
377                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
378                          cell_low, cell_high, rtab[cell_high]);
379                 return TC_LINKLAYER_ATM;
380         }
381         return TC_LINKLAYER_ETHERNET;
382 }
383
384 static struct qdisc_rate_table *qdisc_rtab_list;
385
/* Look up (or create) a shared, reference-counted rate table matching
 * ratespec @r and the raw table bytes in netlink attribute @tab.
 * Returns NULL on invalid input or allocation failure.
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
{
	struct qdisc_rate_table *rtab;

	/* A usable table needs a rate, a cell_log, and exactly
	 * TC_RTAB_SIZE bytes of data.
	 */
	if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
	    nla_len(tab) != TC_RTAB_SIZE)
		return NULL;

	/* Share an existing table when both spec and data match. */
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		/* Old iproute2 did not fill in linklayer; reconstruct it
		 * from the shape of the table (see __detect_linklayer).
		 */
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
415
416 void qdisc_put_rtab(struct qdisc_rate_table *tab)
417 {
418         struct qdisc_rate_table *rtab, **rtabp;
419
420         if (!tab || --tab->refcnt)
421                 return;
422
423         for (rtabp = &qdisc_rtab_list;
424              (rtab = *rtabp) != NULL;
425              rtabp = &rtab->next) {
426                 if (rtab == tab) {
427                         *rtabp = rtab->next;
428                         kfree(rtab);
429                         return;
430                 }
431         }
432 }
433 EXPORT_SYMBOL(qdisc_put_rtab);
434
435 static LIST_HEAD(qdisc_stab_list);
436 static DEFINE_SPINLOCK(qdisc_stab_lock);
437
438 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
439         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
440         [TCA_STAB_DATA] = { .type = NLA_BINARY },
441 };
442
443 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
444 {
445         struct nlattr *tb[TCA_STAB_MAX + 1];
446         struct qdisc_size_table *stab;
447         struct tc_sizespec *s;
448         unsigned int tsize = 0;
449         u16 *tab = NULL;
450         int err;
451
452         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
453         if (err < 0)
454                 return ERR_PTR(err);
455         if (!tb[TCA_STAB_BASE])
456                 return ERR_PTR(-EINVAL);
457
458         s = nla_data(tb[TCA_STAB_BASE]);
459
460         if (s->tsize > 0) {
461                 if (!tb[TCA_STAB_DATA])
462                         return ERR_PTR(-EINVAL);
463                 tab = nla_data(tb[TCA_STAB_DATA]);
464                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
465         }
466
467         if (tsize != s->tsize || (!tab && tsize > 0))
468                 return ERR_PTR(-EINVAL);
469
470         spin_lock(&qdisc_stab_lock);
471
472         list_for_each_entry(stab, &qdisc_stab_list, list) {
473                 if (memcmp(&stab->szopts, s, sizeof(*s)))
474                         continue;
475                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
476                         continue;
477                 stab->refcnt++;
478                 spin_unlock(&qdisc_stab_lock);
479                 return stab;
480         }
481
482         spin_unlock(&qdisc_stab_lock);
483
484         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
485         if (!stab)
486                 return ERR_PTR(-ENOMEM);
487
488         stab->refcnt = 1;
489         stab->szopts = *s;
490         if (tsize > 0)
491                 memcpy(stab->data, tab, tsize * sizeof(u16));
492
493         spin_lock(&qdisc_stab_lock);
494         list_add_tail(&stab->list, &qdisc_stab_list);
495         spin_unlock(&qdisc_stab_lock);
496
497         return stab;
498 }
499
500 static void stab_kfree_rcu(struct rcu_head *head)
501 {
502         kfree(container_of(head, struct qdisc_size_table, rcu));
503 }
504
/* Drop one reference to a size table.  On the last reference, unlink
 * it from qdisc_stab_list and free it after an RCU-bh grace period
 * (via call_rcu_bh), so concurrent readers can finish safely.
 * A NULL @tab is a no-op.
 */
void qdisc_put_stab(struct qdisc_size_table *tab)
{
	if (!tab)
		return;

	spin_lock(&qdisc_stab_lock);

	if (--tab->refcnt == 0) {
		list_del(&tab->list);
		call_rcu_bh(&tab->rcu, stab_kfree_rcu);
	}

	spin_unlock(&qdisc_stab_lock);
}
EXPORT_SYMBOL(qdisc_put_stab);
520
/* Emit a nested TCA_STAB attribute describing @stab into @skb.  Only
 * the sizespec is dumped; the raw table data is not echoed back to
 * userspace.  Returns the new skb length, or -1 if the attribute did
 * not fit.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
537
/* Compute the size-table-adjusted packet length for @skb and store it
 * in the skb's qdisc cb.  The wire length plus configured overhead is
 * mapped through the stab; lengths beyond the table are extrapolated
 * linearly from the last entry.  The result is never below 1.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* cell_align may be negative; clamp the slot index at zero. */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Past the end of the table: whole multiples of the last
		 * entry plus the remainder looked up in the table.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
565
566 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
567 {
568         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
569                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
570                         txt, qdisc->ops->id, qdisc->handle >> 16);
571                 qdisc->flags |= TCQ_F_WARN_NONWC;
572         }
573 }
574 EXPORT_SYMBOL(qdisc_warn_nonwc);
575
/* hrtimer callback: the watchdog's deadline has passed, so clear the
 * qdisc's throttled state and reschedule the root qdisc for transmit.
 * The timer is one-shot; schedule_ns re-arms it as needed.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	qdisc_unthrottled(wd->qdisc);
	__netif_schedule(qdisc_root(wd->qdisc));

	return HRTIMER_NORESTART;
}
586
587 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
588 {
589         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
590         wd->timer.function = qdisc_watchdog;
591         wd->qdisc = qdisc;
592 }
593 EXPORT_SYMBOL(qdisc_watchdog_init);
594
/* Mark the qdisc throttled and arm its watchdog to fire at absolute
 * time @expires (nanoseconds, CLOCK_MONOTONIC).  Does nothing while
 * the root qdisc is being deactivated, so a dying device does not get
 * rescheduled.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	qdisc_throttled(wd->qdisc);

	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
608
/* Cancel a pending watchdog (hrtimer_cancel waits for a running
 * callback to finish first) and clear the qdisc's throttled state.
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
	qdisc_unthrottled(wd->qdisc);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
615
616 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
617 {
618         unsigned int size = n * sizeof(struct hlist_head), i;
619         struct hlist_head *h;
620
621         if (size <= PAGE_SIZE)
622                 h = kmalloc(size, GFP_KERNEL);
623         else
624                 h = (struct hlist_head *)
625                         __get_free_pages(GFP_KERNEL, get_order(size));
626
627         if (h != NULL) {
628                 for (i = 0; i < n; i++)
629                         INIT_HLIST_HEAD(&h[i]);
630         }
631         return h;
632 }
633
634 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
635 {
636         unsigned int size = n * sizeof(struct hlist_head);
637
638         if (size <= PAGE_SIZE)
639                 kfree(h);
640         else
641                 free_pages((unsigned long)h, get_order(size));
642 }
643
/* Double the bucket count of @clhash once its load factor exceeds
 * 0.75.  The new table is allocated before taking the qdisc tree lock;
 * only the rehash itself runs under sch_tree_lock(), so lookups never
 * see a half-built table.  Allocation failure is harmless - the old,
 * smaller table is simply kept.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket under the wider mask. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
679
680 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
681 {
682         unsigned int size = 4;
683
684         clhash->hash = qdisc_class_hash_alloc(size);
685         if (clhash->hash == NULL)
686                 return -ENOMEM;
687         clhash->hashsize  = size;
688         clhash->hashmask  = size - 1;
689         clhash->hashelems = 0;
690         return 0;
691 }
692 EXPORT_SYMBOL(qdisc_class_hash_init);
693
/* Free the bucket array allocated by qdisc_class_hash_init() (or grown
 * later by qdisc_class_hash_grow()).
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
699
700 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
701                              struct Qdisc_class_common *cl)
702 {
703         unsigned int h;
704
705         INIT_HLIST_NODE(&cl->hnode);
706         h = qdisc_class_hash(cl->classid, clhash->hashmask);
707         hlist_add_head(&cl->hnode, &clhash->hash[h]);
708         clhash->hashelems++;
709 }
710 EXPORT_SYMBOL(qdisc_class_hash_insert);
711
712 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
713                              struct Qdisc_class_common *cl)
714 {
715         hlist_del(&cl->hnode);
716         clhash->hashelems--;
717 }
718 EXPORT_SYMBOL(qdisc_class_hash_remove);
719
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	/* Rotating cursor shared across all devices; uniqueness is only
	 * checked per device via qdisc_lookup().
	 */
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Wrapped past FFFF:0000 - restart at 8000:0000. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	/* All 0x8000 candidates are taken on this device. */
	return 0;
}
739
/* Propagate a queue-length decrease of @n packets from @sch up through
 * every ancestor qdisc, giving each classful parent a chance to react
 * (via qlen_notify, e.g. to deactivate a now-empty class), and account
 * the decrease as drops in each ancestor's qstats.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	int drops;

	if (n == 0)
		return;
	drops = max_t(int, n, 0);
	while ((parentid = sch->parent)) {
		/* Ingress qdiscs hang off a pseudo parent; stop there. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			/* Tell the parent class its child lost packets. */
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
		__qdisc_qstats_drop(sch, drops);
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
770
771 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
772                                struct nlmsghdr *n, u32 clid,
773                                struct Qdisc *old, struct Qdisc *new)
774 {
775         if (new || old)
776                 qdisc_notify(net, skb, n, clid, old, new);
777
778         if (old)
779                 qdisc_destroy(old);
780 }
781
782 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
783  * to device "dev".
784  *
785  * When appropriate send a netlink notification using 'skb'
786  * and "n".
787  *
788  * On success, destroy old qdisc.
789  */
790
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		/* Grafting a root (or ingress) qdisc directly on the device. */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			/* Ingress uses a single dedicated queue. */
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		/* Quiesce the device while qdiscs are being swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* mq-style qdiscs attach per-queue children themselves, so
		 * skip the per-queue graft loop below.
		 */
		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per TX queue beyond the first. */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			/* Extra reference for dev->qdisc itself (attach-style
			 * qdiscs already hold theirs).
			 */
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing classful qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
863
864 /* lockdep annotation is needed for ingress; egress gets it only for name */
865 static struct lock_class_key qdisc_tx_lock;
866 static struct lock_class_key qdisc_rx_lock;
867
868 /*
869    Allocate and initialize new qdisc.
870
871    Parameters are passed via opt.
872  */
873
874 static struct Qdisc *
875 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
876              struct Qdisc *p, u32 parent, u32 handle,
877              struct nlattr **tca, int *errp)
878 {
879         int err;
880         struct nlattr *kind = tca[TCA_KIND];
881         struct Qdisc *sch;
882         struct Qdisc_ops *ops;
883         struct qdisc_size_table *stab;
884
885         ops = qdisc_lookup_ops(kind);
886 #ifdef CONFIG_MODULES
887         if (ops == NULL && kind != NULL) {
888                 char name[IFNAMSIZ];
889                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
890                         /* We dropped the RTNL semaphore in order to
891                          * perform the module load.  So, even if we
892                          * succeeded in loading the module we have to
893                          * tell the caller to replay the request.  We
894                          * indicate this using -EAGAIN.
895                          * We replay the request because the device may
896                          * go away in the mean time.
897                          */
898                         rtnl_unlock();
899                         request_module("sch_%s", name);
900                         rtnl_lock();
901                         ops = qdisc_lookup_ops(kind);
902                         if (ops != NULL) {
903                                 /* We will try again qdisc_lookup_ops,
904                                  * so don't keep a reference.
905                                  */
906                                 module_put(ops->owner);
907                                 err = -EAGAIN;
908                                 goto err_out;
909                         }
910                 }
911         }
912 #endif
913
914         err = -ENOENT;
915         if (ops == NULL)
916                 goto err_out;
917
918         sch = qdisc_alloc(dev_queue, ops);
919         if (IS_ERR(sch)) {
920                 err = PTR_ERR(sch);
921                 goto err_out2;
922         }
923
924         sch->parent = parent;
925
926         if (handle == TC_H_INGRESS) {
927                 sch->flags |= TCQ_F_INGRESS;
928                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
929                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
930         } else {
931                 if (handle == 0) {
932                         handle = qdisc_alloc_handle(dev);
933                         err = -ENOMEM;
934                         if (handle == 0)
935                                 goto err_out3;
936                 }
937                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
938                 if (!netif_is_multiqueue(dev))
939                         sch->flags |= TCQ_F_ONETXQUEUE;
940         }
941
942         sch->handle = handle;
943
944         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
945                 if (qdisc_is_percpu_stats(sch)) {
946                         sch->cpu_bstats =
947                                 alloc_percpu(struct gnet_stats_basic_cpu);
948                         if (!sch->cpu_bstats)
949                                 goto err_out4;
950
951                         sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
952                         if (!sch->cpu_qstats)
953                                 goto err_out4;
954                 }
955
956                 if (tca[TCA_STAB]) {
957                         stab = qdisc_get_stab(tca[TCA_STAB]);
958                         if (IS_ERR(stab)) {
959                                 err = PTR_ERR(stab);
960                                 goto err_out4;
961                         }
962                         rcu_assign_pointer(sch->stab, stab);
963                 }
964                 if (tca[TCA_RATE]) {
965                         spinlock_t *root_lock;
966
967                         err = -EOPNOTSUPP;
968                         if (sch->flags & TCQ_F_MQROOT)
969                                 goto err_out4;
970
971                         if ((sch->parent != TC_H_ROOT) &&
972                             !(sch->flags & TCQ_F_INGRESS) &&
973                             (!p || !(p->flags & TCQ_F_MQROOT)))
974                                 root_lock = qdisc_root_sleeping_lock(sch);
975                         else
976                                 root_lock = qdisc_lock(sch);
977
978                         err = gen_new_estimator(&sch->bstats,
979                                                 sch->cpu_bstats,
980                                                 &sch->rate_est,
981                                                 root_lock,
982                                                 tca[TCA_RATE]);
983                         if (err)
984                                 goto err_out4;
985                 }
986
987                 qdisc_list_add(sch);
988
989                 return sch;
990         }
991 err_out3:
992         dev_put(dev);
993         kfree((char *) sch - sch->padded);
994 err_out2:
995         module_put(ops->owner);
996 err_out:
997         *errp = err;
998         return NULL;
999
1000 err_out4:
1001         free_percpu(sch->cpu_bstats);
1002         free_percpu(sch->cpu_qstats);
1003         /*
1004          * Any broken qdiscs that would require a ops->reset() here?
1005          * The qdisc was never in action so it shouldn't be necessary.
1006          */
1007         qdisc_put_stab(rtnl_dereference(sch->stab));
1008         if (ops->destroy)
1009                 ops->destroy(sch);
1010         goto err_out3;
1011 }
1012
/* Reconfigure an existing qdisc from netlink attributes: new options
 * (TCA_OPTIONS), a new size table (TCA_STAB) and/or a replacement rate
 * estimator (TCA_RATE).  Called with RTNL held; @sch stays attached to
 * its device throughout.
 *
 * Returns 0 on success or a negative errno.  Note that once ops->change()
 * has succeeded the remaining steps are not undone on partial failure.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		/* Reconfiguration support is optional per qdisc type. */
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new stab (possibly NULL) and drop our reference to
	 * the old one.  RCU readers may still be using ostab; qdisc_put_stab
	 * is responsible for deferring the actual release.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1050
/* State carried through a class walk while searching for grafting loops;
 * embeds the generic walker so check_loop_fn() can downcast it. */
struct check_loop_arg {
	struct qdisc_walker	w;	/* generic walker (must be first) */
	struct Qdisc		*p;	/* qdisc we must not encounter below q */
	int			depth;	/* current recursion depth */
};
1056
1057 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
1058
1059 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1060 {
1061         struct check_loop_arg   arg;
1062
1063         if (q->ops->cl_ops == NULL)
1064                 return 0;
1065
1066         arg.w.stop = arg.w.skip = arg.w.count = 0;
1067         arg.w.fn = check_loop_fn;
1068         arg.depth = depth;
1069         arg.p = p;
1070         q->ops->cl_ops->walk(q, &arg.w);
1071         return arg.w.stop ? -ELOOP : 0;
1072 }
1073
1074 static int
1075 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1076 {
1077         struct Qdisc *leaf;
1078         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1079         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1080
1081         leaf = cops->leaf(q, cl);
1082         if (leaf) {
1083                 if (leaf == arg->p || arg->depth > 7)
1084                         return -ELOOP;
1085                 return check_loop(leaf, arg->p, arg->depth + 1);
1086         }
1087         return 0;
1088 }
1089
1090 /*
1091  * Delete/get qdisc.
1092  */
1093
/* Handle RTM_DELQDISC and RTM_GETQDISC netlink requests.
 *
 * Locates the target qdisc either through its parent (tcm_parent / clid)
 * or directly by handle (tcm_handle), then deletes it via qdisc_graft()
 * or just notifies the requester.  Called with RTNL held.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deletion is privileged; plain GET is allowed for everyone. */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* Parent is a class: resolve its qdisc and
				 * then the leaf attached to that class. */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		/* If the caller also gave a handle, it must match. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	/* Optional kind check against the qdisc's ops id. */
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* handle == 0 means a builtin/default qdisc: not deletable. */
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1158
1159 /*
1160  * Create/change qdisc.
1161  */
1162
/* Handle RTM_NEWQDISC: create, replace or change a qdisc.
 *
 * Resolves the target position from tcm_parent/tcm_handle and then either
 * reconfigures the existing qdisc in place (qdisc_change) or creates a new
 * instance and grafts it into the tree (qdisc_create + qdisc_graft).
 * qdisc_create() may drop RTNL to load a scheduler module and then returns
 * -EAGAIN, in which case the whole request is replayed from the top because
 * the device may have disappeared in the meantime.  Called with RTNL held.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				/* Parent is a class: find its qdisc, then
				 * the current leaf of that class. */
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				/* An explicit handle was given that differs
				 * from what currently sits at this position. */
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Moving q below p must not create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				/* qdisc_graft() below takes over this ref. */
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		/* No parent given: address the qdisc directly by handle. */
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the tx queue if it can. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		/* -EAGAIN: RTNL was dropped for a module load; retry. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1316
/* Fill one netlink message describing qdisc @q into @skb.
 *
 * Emits the tcmsg header, TCA_KIND, the qdisc's own dump, the size table
 * and the statistics blocks.  On any failure the partially written data is
 * trimmed back off the skb and -1 is returned; on success returns skb->len.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the reference count for qdisc dumps. */
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	/* Per-cpu counters are only present for NOLOCK-style qdiscs. */
	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Undo everything written since @b so the skb stays consistent. */
	nlmsg_trim(skb, b);
	return -1;
}
1379
1380 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1381 {
1382         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1383 }
1384
/* Send an RTNLGRP_TC notification describing a qdisc change: a
 * RTM_DELQDISC record for @old and/or a RTM_NEWQDISC record for @new.
 * Builtin qdiscs are skipped; if nothing was filled in, the message is
 * discarded and -EINVAL is returned.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* Empty skb (both qdiscs ignored) deliberately falls through to
	 * the error path below. */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1415
/* Dump @root and every qdisc on its ->list into @skb for a netlink dump.
 *
 * @q_idx_p is the running per-device qdisc index (updated on return) and
 * @s_q_idx is the index to resume from after a previous partial dump.
 * Returns 0 when done, -1 when the skb filled up and the dump must be
 * continued in a later callback.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	/* The root qdisc itself occupies index slot q_idx. */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	/* Then every other qdisc hanging off the root's list. */
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1455
/* Netlink dump callback for RTM_GETQDISC: walk every device in the
 * namespace and dump its tx-root and ingress qdisc hierarchies.
 *
 * cb->args[0] holds the device index cursor and cb->args[1] the per-device
 * qdisc index cursor, so an interrupted dump resumes where it stopped.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();
	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Only the device we stopped on keeps its qdisc cursor;
		 * later devices restart from qdisc 0. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1496
1497
1498
1499 /************************************************
1500  *      Traffic classes manipulation.           *
1501  ************************************************/
1502
1503
1504
/* Handle RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS requests.
 *
 * Resolves the owning qdisc from the (possibly partial) parent and handle
 * majors, looks the class up through the qdisc's class ops, and then
 * dispatches: get/delete an existing class, or create/change one via
 * cops->change().  Called with RTNL held.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only GET is unprivileged. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		/* Class does not exist: only NEWTCLASS with NLM_F_CREATE
		 * may proceed (to create it below). */
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create a new class or change the existing one. */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	/* Balance the cops->get() reference taken above. */
	if (cl)
		cops->put(q, cl);

	return err;
}
1636
1637
/* Fill one netlink message describing class @cl of qdisc @q into @skb.
 *
 * Writes the tcmsg header, TCA_KIND and the class ops' own dump and
 * statistics.  On failure the partial data is trimmed off and -1 is
 * returned; on success returns skb->len.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	/* The class ops' dump callback overwrites tcm_handle/tcm_info
	 * with class-specific values. */
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Undo everything written since @b. */
	nlmsg_trim(skb, b);
	return -1;
}
1683
1684 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1685                          struct nlmsghdr *n, struct Qdisc *q,
1686                          unsigned long cl, int event)
1687 {
1688         struct sk_buff *skb;
1689         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1690
1691         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1692         if (!skb)
1693                 return -ENOBUFS;
1694
1695         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1696                 kfree_skb(skb);
1697                 return -EINVAL;
1698         }
1699
1700         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1701                               n->nlmsg_flags & NLM_F_ECHO);
1702 }
1703
/* State threaded through a class walk while dumping classes to netlink;
 * embeds the generic walker so qdisc_class_dump() can downcast it. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* generic walker (must be first) */
	struct sk_buff		*skb;	/* dump target */
	struct netlink_callback *cb;	/* dump context (portid, seq, args) */
};
1709
1710 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1711 {
1712         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1713
1714         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1715                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1716 }
1717
/* Dump every class of qdisc @q into @skb for an RTM_GETTCLASS dump.
 *
 * @t_p counts qdiscs visited so far and @s_t is the qdisc index to resume
 * from; cb->args[1] carries the class-walk skip count within a qdisc.
 * Returns 0 to continue with the next qdisc, -1 when the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip builtins, already-dumped qdiscs, classless qdiscs and
	 * qdiscs filtered out by an explicit tcm_parent. */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Past the resume point: any stale class cursor belongs to an
	 * earlier qdisc, so clear it. */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	/* Remember how far we got for the next dump callback. */
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1746
1747 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1748                                struct tcmsg *tcm, struct netlink_callback *cb,
1749                                int *t_p, int s_t)
1750 {
1751         struct Qdisc *q;
1752
1753         if (!root)
1754                 return 0;
1755
1756         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1757                 return -1;
1758
1759         list_for_each_entry(q, &root->list, list) {
1760                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1761                         return -1;
1762         }
1763
1764         return 0;
1765 }
1766
1767 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1768 {
1769         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1770         struct net *net = sock_net(skb->sk);
1771         struct netdev_queue *dev_queue;
1772         struct net_device *dev;
1773         int t, s_t;
1774
1775         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
1776                 return 0;
1777         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1778         if (!dev)
1779                 return 0;
1780
1781         s_t = cb->args[0];
1782         t = 0;
1783
1784         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1785                 goto done;
1786
1787         dev_queue = dev_ingress_queue(dev);
1788         if (dev_queue &&
1789             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1790                                 &t, s_t) < 0)
1791                 goto done;
1792
1793 done:
1794         cb->args[0] = t;
1795
1796         dev_put(dev);
1797         return skb->len;
1798 }
1799
1800 /* Main classifier routine: scans classifier chain attached
1801  * to this qdisc, (optionally) tests for protocol and asks
1802  * specific classifiers.
1803  */
1804 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1805                        struct tcf_result *res)
1806 {
1807         __be16 protocol = skb->protocol;
1808         int err;
1809
1810         for (; tp; tp = rcu_dereference_bh(tp->next)) {
1811                 if (tp->protocol != protocol &&
1812                     tp->protocol != htons(ETH_P_ALL))
1813                         continue;
1814                 err = tp->classify(skb, tp, res);
1815
1816                 if (err >= 0) {
1817 #ifdef CONFIG_NET_CLS_ACT
1818                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1819                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1820 #endif
1821                         return err;
1822                 }
1823         }
1824         return -1;
1825 }
1826 EXPORT_SYMBOL(tc_classify_compat);
1827
/* Classify a packet, honouring TC_ACT_RECLASSIFY: when a classifier asks
 * for reclassification, restart from the head of the chain with an
 * incremented loop counter carried in skb->tc_verd.  After MAX_REC_LOOP
 * restarts the packet is dropped (TC_ACT_SHOT) to break classifier loops.
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
#ifdef CONFIG_NET_CLS_ACT
        const struct tcf_proto *otp = tp;       /* head of chain, for restarts */
reclassify:
#endif

        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                u32 verd = G_TC_VERD(skb->tc_verd);     /* restart count so far */
                tp = otp;

                if (verd++ >= MAX_REC_LOOP) {
                        net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
                                               tp->q->ops->id,
                                               tp->prio & 0xffff,
                                               ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }
                /* Record the incremented count and run the chain again. */
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
1857
/* Free one classifier: tear down its type-specific state, drop the
 * module reference pinned while it was attached, then free the
 * tcf_proto itself after an RCU grace period (readers may still be
 * walking the chain).  The statement order is significant.
 */
void tcf_destroy(struct tcf_proto *tp)
{
        tp->ops->destroy(tp);
        module_put(tp->ops->owner);
        kfree_rcu(tp, rcu);
}
1864
1865 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1866 {
1867         struct tcf_proto *tp;
1868
1869         while ((tp = rtnl_dereference(*fl)) != NULL) {
1870                 RCU_INIT_POINTER(*fl, tp->next);
1871                 tcf_destroy(tp);
1872         }
1873 }
1874 EXPORT_SYMBOL(tcf_destroy_chain);
1875
#ifdef CONFIG_PROC_FS
/* seq_file show routine for /proc/net/psched: four hex words describing
 * the packet-scheduler clock — ns per psched microsecond, ns per psched
 * tick, the constant 1000000, and the hrtimer resolution expressed as
 * ticks per second.  NOTE(review): the exact field semantics are
 * historical ABI presumably parsed by tc(8) — do not change the format.
 */
static int psched_show(struct seq_file *seq, void *v)
{
        struct timespec ts;

        hrtimer_get_res(CLOCK_MONOTONIC, &ts);
        seq_printf(seq, "%08x %08x %08x %08x\n",
                   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
                   1000000,
                   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

        return 0;
}

/* open() hook: single_open() so psched_show() renders the whole file. */
static int psched_open(struct inode *inode, struct file *file)
{
        return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
        .owner = THIS_MODULE,
        .open = psched_open,
        .read  = seq_read,
        .llseek = seq_lseek,
        .release = single_release,
};

/* Create /proc/net/psched in each network namespace. */
static int __net_init psched_net_init(struct net *net)
{
        struct proc_dir_entry *e;

        e = proc_create("psched", 0, net->proc_net, &psched_fops);
        if (e == NULL)
                return -ENOMEM;

        return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
        remove_proc_entry("psched", net->proc_net);
}
#else
/* Without procfs there is nothing to create or remove. */
static int __net_init psched_net_init(struct net *net)
{
        return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif
1928
/* Per-network-namespace hooks: create/remove /proc/net/psched
 * (both are no-ops when CONFIG_PROC_FS is disabled).
 */
static struct pernet_operations psched_net_ops = {
        .init = psched_net_init,
        .exit = psched_net_exit,
};
1933
1934 static int __init pktsched_init(void)
1935 {
1936         int err;
1937
1938         err = register_pernet_subsys(&psched_net_ops);
1939         if (err) {
1940                 pr_err("pktsched_init: "
1941                        "cannot initialize per netns operations\n");
1942                 return err;
1943         }
1944
1945         register_qdisc(&pfifo_fast_ops);
1946         register_qdisc(&pfifo_qdisc_ops);
1947         register_qdisc(&bfifo_qdisc_ops);
1948         register_qdisc(&pfifo_head_drop_qdisc_ops);
1949         register_qdisc(&mq_qdisc_ops);
1950
1951         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1952         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1953         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1954         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1955         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1956         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1957
1958         return 0;
1959 }
1960
1961 subsys_initcall(pktsched_init);