1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, a queueing discipline ("qdisc") is a black box
56    that is able to enqueue packets and to dequeue them (when the
57    device is ready to send something) in the order and at the times
58    determined by the algorithm hidden inside it.
59
60    qdiscs are divided into two categories:
61    - "queues", which have no internal structure visible from outside.
62    - "schedulers", which split packets into "traffic classes"
63      using "packet classifiers" (see cls_api.c).
64
65    In turn, classes may have child qdiscs (as a rule, queues)
66    attached to them, and so on.
67
68    The goal of the routines in this file is to translate
69    the information supplied by the user in the form of handles
70    into a form more intelligible to the kernel, to perform sanity
71    checks and the part of the work that is common to all qdiscs,
72    and to provide rtnetlink notifications.
73
74    All the real intelligent work is done inside the qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns an skb to send. It is allowed to return NULL,
83    but that does not mean the queue is empty; it only means that the
84    discipline does not want to send anything this time.
85    The queue is really empty only if q->q.qlen == 0.
86    For complicated disciplines with multiple queues, q->q is not the
87    real packet queue, but q->q.qlen must still be valid.
88
89    ---enqueue
90
91    enqueue returns 0 if the packet was enqueued successfully.
92    If a packet (this one or another one) was dropped, it returns a
93    non-zero error code:
94    NET_XMIT_DROP        - this packet was dropped.
95      Expected action: do not back off, but wait until the queue clears.
96    NET_XMIT_CN          - this packet was probably enqueued, but another one was dropped.
97      Expected action: back off or ignore.
98    NET_XMIT_POLICED     - dropped by the policer.
99      Expected action: back off or report an error to real-time applications.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns the qdisc to its initial state: purges all buffers, clears all
110    timers and counters (except statistics), etc.
111
112    ---init
113
114    initializes a newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during the lifetime of the qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
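/*
 * A minimal sketch (not part of this file; all "sketch_*" names and
 * SKETCH_LIMIT are hypothetical) of how a simple queue-type qdisc might
 * implement the enqueue/dequeue contract described above, using the
 * usual helpers from <net/sch_generic.h>:
 *
 *	static int sketch_enqueue(struct sk_buff *skb, struct Qdisc *sch)
 *	{
 *		if (likely(sch->q.qlen < SKETCH_LIMIT))
 *			return qdisc_enqueue_tail(skb, sch);	// NET_XMIT_SUCCESS
 *		return qdisc_drop(skb, sch);			// NET_XMIT_DROP
 *	}
 *
 *	static struct sk_buff *sketch_dequeue(struct Qdisc *sch)
 *	{
 *		// NULL may simply mean "nothing to send right now";
 *		// q->q.qlen is the authoritative emptiness test.
 *		return qdisc_dequeue_head(sch);
 *	}
 */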
124
125 /* Protects the list of registered TC modules. It is a pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
138 /* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
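/*
 * Sketch of the usual caller of the two functions above: a qdisc module
 * (every "example_*" name is hypothetical) registers its Qdisc_ops when it
 * is loaded and unregisters it on unload.
 *
 *	static struct Qdisc_ops example_qdisc_ops __read_mostly = {
 *		.id		= "example",
 *		.priv_size	= sizeof(struct example_sched_data),
 *		.enqueue	= example_enqueue,
 *		.dequeue	= example_dequeue,
 *		.peek		= qdisc_peek_dequeued,
 *		.init		= example_init,
 *		.reset		= qdisc_reset_queue,
 *		.dump		= example_dump,
 *		.owner		= THIS_MODULE,
 *	};
 *
 *	static int __init example_module_init(void)
 *	{
 *		return register_qdisc(&example_qdisc_ops);
 *	}
 *
 *	static void __exit example_module_exit(void)
 *	{
 *		unregister_qdisc(&example_qdisc_ops);
 *	}
 *	module_init(example_module_init);
 *	module_exit(example_module_exit);
 *	MODULE_LICENSE("GPL");
 */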
202
203 /* We know the handle. Find the qdisc among all qdiscs attached to the device
204    (root qdisc, all its children, children of children, etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list;
325              (rtab = *rtabp) != NULL;
326              rtabp = &rtab->next) {
327                 if (rtab == tab) {
328                         *rtabp = rtab->next;
329                         kfree(rtab);
330                         return;
331                 }
332         }
333 }
334 EXPORT_SYMBOL(qdisc_put_rtab);
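/*
 * Sketch of typical rate table usage by a shaping qdisc (the attribute and
 * variable names are hypothetical; compare e.g. sch_tbf): the table is
 * looked up from ->init()/->change() and released again when the qdisc is
 * reconfigured or destroyed.
 *
 *	struct qdisc_rate_table *rtab;
 *
 *	rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_EXAMPLE_RTAB]);
 *	if (rtab == NULL)
 *		return -EINVAL;
 *	...
 *	qdisc_put_rtab(rtab);
 */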
335
336 static LIST_HEAD(qdisc_stab_list);
337 static DEFINE_SPINLOCK(qdisc_stab_lock);
338
339 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
340         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
341         [TCA_STAB_DATA] = { .type = NLA_BINARY },
342 };
343
344 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
345 {
346         struct nlattr *tb[TCA_STAB_MAX + 1];
347         struct qdisc_size_table *stab;
348         struct tc_sizespec *s;
349         unsigned int tsize = 0;
350         u16 *tab = NULL;
351         int err;
352
353         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
354         if (err < 0)
355                 return ERR_PTR(err);
356         if (!tb[TCA_STAB_BASE])
357                 return ERR_PTR(-EINVAL);
358
359         s = nla_data(tb[TCA_STAB_BASE]);
360
361         if (s->tsize > 0) {
362                 if (!tb[TCA_STAB_DATA])
363                         return ERR_PTR(-EINVAL);
364                 tab = nla_data(tb[TCA_STAB_DATA]);
365                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
366         }
367
368         if (tsize != s->tsize || (!tab && tsize > 0))
369                 return ERR_PTR(-EINVAL);
370
371         spin_lock(&qdisc_stab_lock);
372
373         list_for_each_entry(stab, &qdisc_stab_list, list) {
374                 if (memcmp(&stab->szopts, s, sizeof(*s)))
375                         continue;
376                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
377                         continue;
378                 stab->refcnt++;
379                 spin_unlock(&qdisc_stab_lock);
380                 return stab;
381         }
382
383         spin_unlock(&qdisc_stab_lock);
384
385         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
386         if (!stab)
387                 return ERR_PTR(-ENOMEM);
388
389         stab->refcnt = 1;
390         stab->szopts = *s;
391         if (tsize > 0)
392                 memcpy(stab->data, tab, tsize * sizeof(u16));
393
394         spin_lock(&qdisc_stab_lock);
395         list_add_tail(&stab->list, &qdisc_stab_list);
396         spin_unlock(&qdisc_stab_lock);
397
398         return stab;
399 }
400
401 static void stab_kfree_rcu(struct rcu_head *head)
402 {
403         kfree(container_of(head, struct qdisc_size_table, rcu));
404 }
405
406 void qdisc_put_stab(struct qdisc_size_table *tab)
407 {
408         if (!tab)
409                 return;
410
411         spin_lock(&qdisc_stab_lock);
412
413         if (--tab->refcnt == 0) {
414                 list_del(&tab->list);
415                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
416         }
417
418         spin_unlock(&qdisc_stab_lock);
419 }
420 EXPORT_SYMBOL(qdisc_put_stab);
421
422 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
423 {
424         struct nlattr *nest;
425
426         nest = nla_nest_start(skb, TCA_STAB);
427         if (nest == NULL)
428                 goto nla_put_failure;
429         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
430                 goto nla_put_failure;
431         nla_nest_end(skb, nest);
432
433         return skb->len;
434
435 nla_put_failure:
436         return -1;
437 }
438
439 void __qdisc_calculate_pkt_len(struct sk_buff *skb, const struct qdisc_size_table *stab)
440 {
441         int pkt_len, slot;
442
443         pkt_len = skb->len + stab->szopts.overhead;
444         if (unlikely(!stab->szopts.tsize))
445                 goto out;
446
447         slot = pkt_len + stab->szopts.cell_align;
448         if (unlikely(slot < 0))
449                 slot = 0;
450
451         slot >>= stab->szopts.cell_log;
452         if (likely(slot < stab->szopts.tsize))
453                 pkt_len = stab->data[slot];
454         else
455                 pkt_len = stab->data[stab->szopts.tsize - 1] *
456                                 (slot / stab->szopts.tsize) +
457                                 stab->data[slot % stab->szopts.tsize];
458
459         pkt_len <<= stab->szopts.size_log;
460 out:
461         if (unlikely(pkt_len < 1))
462                 pkt_len = 1;
463         qdisc_skb_cb(skb)->pkt_len = pkt_len;
464 }
465 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
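/*
 * Worked example for the calculation above (the table parameters are
 * hypothetical): with overhead = 24, cell_align = 0, cell_log = 6,
 * size_log = 0 and tsize = 512, a 100 byte skb gives pkt_len = 124 and
 * slot = 124 >> 6 = 1, so qdisc_skb_cb(skb)->pkt_len becomes stab->data[1].
 * Slots past the end of the table are extrapolated from the last entry,
 * and the result never drops below 1.
 */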
466
467 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
468 {
469         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
470                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
471                         txt, qdisc->ops->id, qdisc->handle >> 16);
472                 qdisc->flags |= TCQ_F_WARN_NONWC;
473         }
474 }
475 EXPORT_SYMBOL(qdisc_warn_nonwc);
476
477 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
478 {
479         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
480                                                  timer);
481
482         qdisc_unthrottled(wd->qdisc);
483         __netif_schedule(qdisc_root(wd->qdisc));
484
485         return HRTIMER_NORESTART;
486 }
487
488 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
489 {
490         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
491         wd->timer.function = qdisc_watchdog;
492         wd->qdisc = qdisc;
493 }
494 EXPORT_SYMBOL(qdisc_watchdog_init);
495
496 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
497 {
498         if (test_bit(__QDISC_STATE_DEACTIVATED,
499                      &qdisc_root_sleeping(wd->qdisc)->state))
500                 return;
501
502         qdisc_throttled(wd->qdisc);
503
504         hrtimer_start(&wd->timer,
505                       ns_to_ktime(expires),
506                       HRTIMER_MODE_ABS);
507 }
508 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
509
510 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
511 {
512         hrtimer_cancel(&wd->timer);
513         qdisc_unthrottled(wd->qdisc);
514 }
515 EXPORT_SYMBOL(qdisc_watchdog_cancel);
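/*
 * Sketch of how a non-work-conserving qdisc typically drives the watchdog
 * above (the q->watchdog field name is hypothetical; compare e.g. sch_tbf):
 *
 *	qdisc_watchdog_init(&q->watchdog, sch);		from ->init()
 *	qdisc_watchdog_schedule_ns(&q->watchdog, when);	from ->dequeue() when no
 *							packet may be sent before
 *							the time "when"
 *	qdisc_watchdog_cancel(&q->watchdog);		from ->reset()/->destroy()
 */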
516
517 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
518 {
519         unsigned int size = n * sizeof(struct hlist_head), i;
520         struct hlist_head *h;
521
522         if (size <= PAGE_SIZE)
523                 h = kmalloc(size, GFP_KERNEL);
524         else
525                 h = (struct hlist_head *)
526                         __get_free_pages(GFP_KERNEL, get_order(size));
527
528         if (h != NULL) {
529                 for (i = 0; i < n; i++)
530                         INIT_HLIST_HEAD(&h[i]);
531         }
532         return h;
533 }
534
535 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
536 {
537         unsigned int size = n * sizeof(struct hlist_head);
538
539         if (size <= PAGE_SIZE)
540                 kfree(h);
541         else
542                 free_pages((unsigned long)h, get_order(size));
543 }
544
545 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
546 {
547         struct Qdisc_class_common *cl;
548         struct hlist_node *n, *next;
549         struct hlist_head *nhash, *ohash;
550         unsigned int nsize, nmask, osize;
551         unsigned int i, h;
552
553         /* Rehash when load factor exceeds 0.75 */
554         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
555                 return;
556         nsize = clhash->hashsize * 2;
557         nmask = nsize - 1;
558         nhash = qdisc_class_hash_alloc(nsize);
559         if (nhash == NULL)
560                 return;
561
562         ohash = clhash->hash;
563         osize = clhash->hashsize;
564
565         sch_tree_lock(sch);
566         for (i = 0; i < osize; i++) {
567                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
568                         h = qdisc_class_hash(cl->classid, nmask);
569                         hlist_add_head(&cl->hnode, &nhash[h]);
570                 }
571         }
572         clhash->hash     = nhash;
573         clhash->hashsize = nsize;
574         clhash->hashmask = nmask;
575         sch_tree_unlock(sch);
576
577         qdisc_class_hash_free(ohash, osize);
578 }
579 EXPORT_SYMBOL(qdisc_class_hash_grow);
580
581 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
582 {
583         unsigned int size = 4;
584
585         clhash->hash = qdisc_class_hash_alloc(size);
586         if (clhash->hash == NULL)
587                 return -ENOMEM;
588         clhash->hashsize  = size;
589         clhash->hashmask  = size - 1;
590         clhash->hashelems = 0;
591         return 0;
592 }
593 EXPORT_SYMBOL(qdisc_class_hash_init);
594
595 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
596 {
597         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
598 }
599 EXPORT_SYMBOL(qdisc_class_hash_destroy);
600
601 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
602                              struct Qdisc_class_common *cl)
603 {
604         unsigned int h;
605
606         INIT_HLIST_NODE(&cl->hnode);
607         h = qdisc_class_hash(cl->classid, clhash->hashmask);
608         hlist_add_head(&cl->hnode, &clhash->hash[h]);
609         clhash->hashelems++;
610 }
611 EXPORT_SYMBOL(qdisc_class_hash_insert);
612
613 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
614                              struct Qdisc_class_common *cl)
615 {
616         hlist_del(&cl->hnode);
617         clhash->hashelems--;
618 }
619 EXPORT_SYMBOL(qdisc_class_hash_remove);
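/*
 * Sketch of the usual life cycle of a class hash inside a classful qdisc
 * (q->clhash and cl->common are hypothetical field names; compare e.g.
 * sch_htb):
 *
 *	err = qdisc_class_hash_init(&q->clhash);		from ->init()
 *	qdisc_class_hash_insert(&q->clhash, &cl->common);	when a class is created
 *	qdisc_class_hash_grow(sch, &q->clhash);			afterwards, outside
 *								sch_tree_lock()
 *	qdisc_class_hash_remove(&q->clhash, &cl->common);	when a class is deleted
 *	qdisc_class_hash_destroy(&q->clhash);			from ->destroy()
 */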
620
621 /* Allocate a unique handle from the space managed by the kernel.
622  * Possible range is [8000-FFFF]:0000 (0x8000 values)
623  */
624 static u32 qdisc_alloc_handle(struct net_device *dev)
625 {
626         int i = 0x8000;
627         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
628
629         do {
630                 autohandle += TC_H_MAKE(0x10000U, 0);
631                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
632                         autohandle = TC_H_MAKE(0x80000000U, 0);
633                 if (!qdisc_lookup(dev, autohandle))
634                         return autohandle;
635                 cond_resched();
636         } while (--i > 0);
637
638         return 0;
639 }
640
641 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
642 {
643         const struct Qdisc_class_ops *cops;
644         unsigned long cl;
645         u32 parentid;
646
647         if (n == 0)
648                 return;
649         while ((parentid = sch->parent)) {
650                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
651                         return;
652
653                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
654                 if (sch == NULL) {
655                         WARN_ON(parentid != TC_H_ROOT);
656                         return;
657                 }
658                 cops = sch->ops->cl_ops;
659                 if (cops->qlen_notify) {
660                         cl = cops->get(sch, parentid);
661                         cops->qlen_notify(sch, cl);
662                         cops->put(sch, cl);
663                 }
664                 sch->q.qlen -= n;
665         }
666 }
667 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
668
669 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
670                                struct nlmsghdr *n, u32 clid,
671                                struct Qdisc *old, struct Qdisc *new)
672 {
673         if (new || old)
674                 qdisc_notify(net, skb, n, clid, old, new);
675
676         if (old)
677                 qdisc_destroy(old);
678 }
679
680 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
681  * to device "dev".
682  *
683  * When appropriate, send a netlink notification using 'skb'
684  * and 'n'.
685  *
686  * On success, destroy old qdisc.
687  */
688
689 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
690                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
691                        struct Qdisc *new, struct Qdisc *old)
692 {
693         struct Qdisc *q = old;
694         struct net *net = dev_net(dev);
695         int err = 0;
696
697         if (parent == NULL) {
698                 unsigned int i, num_q, ingress;
699
700                 ingress = 0;
701                 num_q = dev->num_tx_queues;
702                 if ((q && q->flags & TCQ_F_INGRESS) ||
703                     (new && new->flags & TCQ_F_INGRESS)) {
704                         num_q = 1;
705                         ingress = 1;
706                         if (!dev_ingress_queue(dev))
707                                 return -ENOENT;
708                 }
709
710                 if (dev->flags & IFF_UP)
711                         dev_deactivate(dev);
712
713                 if (new && new->ops->attach) {
714                         new->ops->attach(new);
715                         num_q = 0;
716                 }
717
718                 for (i = 0; i < num_q; i++) {
719                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
720
721                         if (!ingress)
722                                 dev_queue = netdev_get_tx_queue(dev, i);
723
724                         old = dev_graft_qdisc(dev_queue, new);
725                         if (new && i > 0)
726                                 atomic_inc(&new->refcnt);
727
728                         if (!ingress)
729                                 qdisc_destroy(old);
730                 }
731
732                 if (!ingress) {
733                         notify_and_destroy(net, skb, n, classid,
734                                            dev->qdisc, new);
735                         if (new && !new->ops->attach)
736                                 atomic_inc(&new->refcnt);
737                         dev->qdisc = new ? : &noop_qdisc;
738                 } else {
739                         notify_and_destroy(net, skb, n, classid, old, new);
740                 }
741
742                 if (dev->flags & IFF_UP)
743                         dev_activate(dev);
744         } else {
745                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
746
747                 err = -EOPNOTSUPP;
748                 if (cops && cops->graft) {
749                         unsigned long cl = cops->get(parent, classid);
750                         if (cl) {
751                                 err = cops->graft(parent, cl, new, &old);
752                                 cops->put(parent, cl);
753                         } else
754                                 err = -ENOENT;
755                 }
756                 if (!err)
757                         notify_and_destroy(net, skb, n, classid, old, new);
758         }
759         return err;
760 }
761
762 /* lockdep annotation is needed for ingress; egress gets it only for name */
763 static struct lock_class_key qdisc_tx_lock;
764 static struct lock_class_key qdisc_rx_lock;
765
766 /*
767    Allocate and initialize a new qdisc.
768
769    Parameters are passed via opt.
770  */
771
772 static struct Qdisc *
773 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
774              struct Qdisc *p, u32 parent, u32 handle,
775              struct nlattr **tca, int *errp)
776 {
777         int err;
778         struct nlattr *kind = tca[TCA_KIND];
779         struct Qdisc *sch;
780         struct Qdisc_ops *ops;
781         struct qdisc_size_table *stab;
782
783         ops = qdisc_lookup_ops(kind);
784 #ifdef CONFIG_MODULES
785         if (ops == NULL && kind != NULL) {
786                 char name[IFNAMSIZ];
787                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
788                         /* We dropped the RTNL semaphore in order to
789                          * perform the module load.  So, even if we
790                          * succeeded in loading the module we have to
791                          * tell the caller to replay the request.  We
792                          * indicate this using -EAGAIN.
793                          * We replay the request because the device may
794                          * go away in the mean time.
795                          */
796                         rtnl_unlock();
797                         request_module("sch_%s", name);
798                         rtnl_lock();
799                         ops = qdisc_lookup_ops(kind);
800                         if (ops != NULL) {
801                                 /* We will call qdisc_lookup_ops() again on replay,
802                                  * so don't keep a reference.
803                                  */
804                                 module_put(ops->owner);
805                                 err = -EAGAIN;
806                                 goto err_out;
807                         }
808                 }
809         }
810 #endif
811
812         err = -ENOENT;
813         if (ops == NULL)
814                 goto err_out;
815
816         sch = qdisc_alloc(dev_queue, ops);
817         if (IS_ERR(sch)) {
818                 err = PTR_ERR(sch);
819                 goto err_out2;
820         }
821
822         sch->parent = parent;
823
824         if (handle == TC_H_INGRESS) {
825                 sch->flags |= TCQ_F_INGRESS;
826                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
827                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
828         } else {
829                 if (handle == 0) {
830                         handle = qdisc_alloc_handle(dev);
831                         err = -ENOMEM;
832                         if (handle == 0)
833                                 goto err_out3;
834                 }
835                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
836                 if (!netif_is_multiqueue(dev))
837                         sch->flags |= TCQ_F_ONETXQUEUE;
838         }
839
840         sch->handle = handle;
841
842         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
843                 if (tca[TCA_STAB]) {
844                         stab = qdisc_get_stab(tca[TCA_STAB]);
845                         if (IS_ERR(stab)) {
846                                 err = PTR_ERR(stab);
847                                 goto err_out4;
848                         }
849                         rcu_assign_pointer(sch->stab, stab);
850                 }
851                 if (tca[TCA_RATE]) {
852                         spinlock_t *root_lock;
853
854                         err = -EOPNOTSUPP;
855                         if (sch->flags & TCQ_F_MQROOT)
856                                 goto err_out4;
857
858                         if ((sch->parent != TC_H_ROOT) &&
859                             !(sch->flags & TCQ_F_INGRESS) &&
860                             (!p || !(p->flags & TCQ_F_MQROOT)))
861                                 root_lock = qdisc_root_sleeping_lock(sch);
862                         else
863                                 root_lock = qdisc_lock(sch);
864
865                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
866                                                 root_lock, tca[TCA_RATE]);
867                         if (err)
868                                 goto err_out4;
869                 }
870
871                 qdisc_list_add(sch);
872
873                 return sch;
874         }
875 err_out3:
876         dev_put(dev);
877         kfree((char *) sch - sch->padded);
878 err_out2:
879         module_put(ops->owner);
880 err_out:
881         *errp = err;
882         return NULL;
883
884 err_out4:
885         /*
886          * Any broken qdiscs that would require an ops->reset() here?
887          * The qdisc was never in action so it shouldn't be necessary.
888          */
889         qdisc_put_stab(rtnl_dereference(sch->stab));
890         if (ops->destroy)
891                 ops->destroy(sch);
892         goto err_out3;
893 }
894
895 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
896 {
897         struct qdisc_size_table *ostab, *stab = NULL;
898         int err = 0;
899
900         if (tca[TCA_OPTIONS]) {
901                 if (sch->ops->change == NULL)
902                         return -EINVAL;
903                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
904                 if (err)
905                         return err;
906         }
907
908         if (tca[TCA_STAB]) {
909                 stab = qdisc_get_stab(tca[TCA_STAB]);
910                 if (IS_ERR(stab))
911                         return PTR_ERR(stab);
912         }
913
914         ostab = rtnl_dereference(sch->stab);
915         rcu_assign_pointer(sch->stab, stab);
916         qdisc_put_stab(ostab);
917
918         if (tca[TCA_RATE]) {
919                 /* NB: ignores errors from replace_estimator
920                    because change can't be undone. */
921                 if (sch->flags & TCQ_F_MQROOT)
922                         goto out;
923                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
924                                             qdisc_root_sleeping_lock(sch),
925                                             tca[TCA_RATE]);
926         }
927 out:
928         return 0;
929 }
930
931 struct check_loop_arg {
932         struct qdisc_walker     w;
933         struct Qdisc            *p;
934         int                     depth;
935 };
936
937 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
938
939 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
940 {
941         struct check_loop_arg   arg;
942
943         if (q->ops->cl_ops == NULL)
944                 return 0;
945
946         arg.w.stop = arg.w.skip = arg.w.count = 0;
947         arg.w.fn = check_loop_fn;
948         arg.depth = depth;
949         arg.p = p;
950         q->ops->cl_ops->walk(q, &arg.w);
951         return arg.w.stop ? -ELOOP : 0;
952 }
953
954 static int
955 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
956 {
957         struct Qdisc *leaf;
958         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
959         struct check_loop_arg *arg = (struct check_loop_arg *)w;
960
961         leaf = cops->leaf(q, cl);
962         if (leaf) {
963                 if (leaf == arg->p || arg->depth > 7)
964                         return -ELOOP;
965                 return check_loop(leaf, arg->p, arg->depth + 1);
966         }
967         return 0;
968 }
969
970 /*
971  * Delete/get qdisc.
972  */
973
974 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
975 {
976         struct net *net = sock_net(skb->sk);
977         struct tcmsg *tcm = nlmsg_data(n);
978         struct nlattr *tca[TCA_MAX + 1];
979         struct net_device *dev;
980         u32 clid = tcm->tcm_parent;
981         struct Qdisc *q = NULL;
982         struct Qdisc *p = NULL;
983         int err;
984
985         if ((n->nlmsg_type != RTM_GETQDISC) && !capable(CAP_NET_ADMIN))
986                 return -EPERM;
987
988         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
989         if (!dev)
990                 return -ENODEV;
991
992         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
993         if (err < 0)
994                 return err;
995
996         if (clid) {
997                 if (clid != TC_H_ROOT) {
998                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
999                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1000                                 if (!p)
1001                                         return -ENOENT;
1002                                 q = qdisc_leaf(p, clid);
1003                         } else if (dev_ingress_queue(dev)) {
1004                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1005                         }
1006                 } else {
1007                         q = dev->qdisc;
1008                 }
1009                 if (!q)
1010                         return -ENOENT;
1011
1012                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
1013                         return -EINVAL;
1014         } else {
1015                 q = qdisc_lookup(dev, tcm->tcm_handle);
1016                 if (!q)
1017                         return -ENOENT;
1018         }
1019
1020         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1021                 return -EINVAL;
1022
1023         if (n->nlmsg_type == RTM_DELQDISC) {
1024                 if (!clid)
1025                         return -EINVAL;
1026                 if (q->handle == 0)
1027                         return -ENOENT;
1028                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
1029                 if (err != 0)
1030                         return err;
1031         } else {
1032                 qdisc_notify(net, skb, n, clid, NULL, q);
1033         }
1034         return 0;
1035 }
1036
1037 /*
1038  * Create/change qdisc.
1039  */
1040
1041 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1042 {
1043         struct net *net = sock_net(skb->sk);
1044         struct tcmsg *tcm;
1045         struct nlattr *tca[TCA_MAX + 1];
1046         struct net_device *dev;
1047         u32 clid;
1048         struct Qdisc *q, *p;
1049         int err;
1050
1051         if (!capable(CAP_NET_ADMIN))
1052                 return -EPERM;
1053
1054 replay:
1055         /* Reinit, just in case something touches this. */
1056         tcm = nlmsg_data(n);
1057         clid = tcm->tcm_parent;
1058         q = p = NULL;
1059
1060         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1061         if (!dev)
1062                 return -ENODEV;
1063
1064         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1065         if (err < 0)
1066                 return err;
1067
1068         if (clid) {
1069                 if (clid != TC_H_ROOT) {
1070                         if (clid != TC_H_INGRESS) {
1071                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1072                                 if (!p)
1073                                         return -ENOENT;
1074                                 q = qdisc_leaf(p, clid);
1075                         } else if (dev_ingress_queue_create(dev)) {
1076                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1077                         }
1078                 } else {
1079                         q = dev->qdisc;
1080                 }
1081
1082                 /* It may be default qdisc, ignore it */
1083                 if (q && q->handle == 0)
1084                         q = NULL;
1085
1086                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1087                         if (tcm->tcm_handle) {
1088                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
1089                                         return -EEXIST;
1090                                 if (TC_H_MIN(tcm->tcm_handle))
1091                                         return -EINVAL;
1092                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1093                                 if (!q)
1094                                         goto create_n_graft;
1095                                 if (n->nlmsg_flags & NLM_F_EXCL)
1096                                         return -EEXIST;
1097                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1098                                         return -EINVAL;
1099                                 if (q == p ||
1100                                     (p && check_loop(q, p, 0)))
1101                                         return -ELOOP;
1102                                 atomic_inc(&q->refcnt);
1103                                 goto graft;
1104                         } else {
1105                                 if (!q)
1106                                         goto create_n_graft;
1107
1108                                 /* This magic test requires explanation.
1109                                  *
1110                                  *   We know that some child q is already
1111                                  *   attached to this parent and we have a choice:
1112                                  *   either to change it or to create/graft a new one.
1113                                  *
1114                                  *   1. We are allowed to create/graft only
1115                                  *   if both the CREATE and REPLACE flags are set.
1116                                  *
1117                                  *   2. If EXCL is set, the requestor meant that the
1118                                  *   qdisc with tcm_handle is not expected
1119                                  *   to exist, so we choose create/graft too.
1120                                  *
1121                                  *   3. The last case is when no flags are set.
1122                                  *   Alas, it is a sort of hole in the API; we
1123                                  *   cannot decide what to do unambiguously.
1124                                  *   For now we select create/graft if the
1125                                  *   user gave a KIND that does not match the existing one.
1126                                  */
1127                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1128                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1129                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1130                                      (tca[TCA_KIND] &&
1131                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1132                                         goto create_n_graft;
1133                         }
1134                 }
1135         } else {
1136                 if (!tcm->tcm_handle)
1137                         return -EINVAL;
1138                 q = qdisc_lookup(dev, tcm->tcm_handle);
1139         }
1140
1141         /* Change qdisc parameters */
1142         if (q == NULL)
1143                 return -ENOENT;
1144         if (n->nlmsg_flags & NLM_F_EXCL)
1145                 return -EEXIST;
1146         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1147                 return -EINVAL;
1148         err = qdisc_change(q, tca);
1149         if (err == 0)
1150                 qdisc_notify(net, skb, n, clid, NULL, q);
1151         return err;
1152
1153 create_n_graft:
1154         if (!(n->nlmsg_flags & NLM_F_CREATE))
1155                 return -ENOENT;
1156         if (clid == TC_H_INGRESS) {
1157                 if (dev_ingress_queue(dev))
1158                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1159                                          tcm->tcm_parent, tcm->tcm_parent,
1160                                          tca, &err);
1161                 else
1162                         err = -ENOENT;
1163         } else {
1164                 struct netdev_queue *dev_queue;
1165
1166                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1167                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1168                 else if (p)
1169                         dev_queue = p->dev_queue;
1170                 else
1171                         dev_queue = netdev_get_tx_queue(dev, 0);
1172
1173                 q = qdisc_create(dev, dev_queue, p,
1174                                  tcm->tcm_parent, tcm->tcm_handle,
1175                                  tca, &err);
1176         }
1177         if (q == NULL) {
1178                 if (err == -EAGAIN)
1179                         goto replay;
1180                 return err;
1181         }
1182
1183 graft:
1184         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1185         if (err) {
1186                 if (q)
1187                         qdisc_destroy(q);
1188                 return err;
1189         }
1190
1191         return 0;
1192 }
1193
1194 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1195                          u32 portid, u32 seq, u16 flags, int event)
1196 {
1197         struct tcmsg *tcm;
1198         struct nlmsghdr  *nlh;
1199         unsigned char *b = skb_tail_pointer(skb);
1200         struct gnet_dump d;
1201         struct qdisc_size_table *stab;
1202
1203         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1204         if (!nlh)
1205                 goto out_nlmsg_trim;
1206         tcm = nlmsg_data(nlh);
1207         tcm->tcm_family = AF_UNSPEC;
1208         tcm->tcm__pad1 = 0;
1209         tcm->tcm__pad2 = 0;
1210         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1211         tcm->tcm_parent = clid;
1212         tcm->tcm_handle = q->handle;
1213         tcm->tcm_info = atomic_read(&q->refcnt);
1214         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1215                 goto nla_put_failure;
1216         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1217                 goto nla_put_failure;
1218         q->qstats.qlen = q->q.qlen;
1219
1220         stab = rtnl_dereference(q->stab);
1221         if (stab && qdisc_dump_stab(skb, stab) < 0)
1222                 goto nla_put_failure;
1223
1224         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1225                                          qdisc_root_sleeping_lock(q), &d) < 0)
1226                 goto nla_put_failure;
1227
1228         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1229                 goto nla_put_failure;
1230
1231         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1232             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1233             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1234                 goto nla_put_failure;
1235
1236         if (gnet_stats_finish_copy(&d) < 0)
1237                 goto nla_put_failure;
1238
1239         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1240         return skb->len;
1241
1242 out_nlmsg_trim:
1243 nla_put_failure:
1244         nlmsg_trim(skb, b);
1245         return -1;
1246 }
1247
1248 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1249 {
1250         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1251 }
1252
1253 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1254                         struct nlmsghdr *n, u32 clid,
1255                         struct Qdisc *old, struct Qdisc *new)
1256 {
1257         struct sk_buff *skb;
1258         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1259
1260         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1261         if (!skb)
1262                 return -ENOBUFS;
1263
1264         if (old && !tc_qdisc_dump_ignore(old)) {
1265                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1266                                   0, RTM_DELQDISC) < 0)
1267                         goto err_out;
1268         }
1269         if (new && !tc_qdisc_dump_ignore(new)) {
1270                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1271                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1272                         goto err_out;
1273         }
1274
1275         if (skb->len)
1276                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1277                                       n->nlmsg_flags & NLM_F_ECHO);
1278
1279 err_out:
1280         kfree_skb(skb);
1281         return -EINVAL;
1282 }
1283
1284 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1285                               struct netlink_callback *cb,
1286                               int *q_idx_p, int s_q_idx)
1287 {
1288         int ret = 0, q_idx = *q_idx_p;
1289         struct Qdisc *q;
1290
1291         if (!root)
1292                 return 0;
1293
1294         q = root;
1295         if (q_idx < s_q_idx) {
1296                 q_idx++;
1297         } else {
1298                 if (!tc_qdisc_dump_ignore(q) &&
1299                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1300                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1301                         goto done;
1302                 q_idx++;
1303         }
1304         list_for_each_entry(q, &root->list, list) {
1305                 if (q_idx < s_q_idx) {
1306                         q_idx++;
1307                         continue;
1308                 }
1309                 if (!tc_qdisc_dump_ignore(q) &&
1310                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1311                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1312                         goto done;
1313                 q_idx++;
1314         }
1315
1316 out:
1317         *q_idx_p = q_idx;
1318         return ret;
1319 done:
1320         ret = -1;
1321         goto out;
1322 }
1323
1324 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1325 {
1326         struct net *net = sock_net(skb->sk);
1327         int idx, q_idx;
1328         int s_idx, s_q_idx;
1329         struct net_device *dev;
1330
1331         s_idx = cb->args[0];
1332         s_q_idx = q_idx = cb->args[1];
1333
1334         rcu_read_lock();
1335         idx = 0;
1336         for_each_netdev_rcu(net, dev) {
1337                 struct netdev_queue *dev_queue;
1338
1339                 if (idx < s_idx)
1340                         goto cont;
1341                 if (idx > s_idx)
1342                         s_q_idx = 0;
1343                 q_idx = 0;
1344
1345                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1346                         goto done;
1347
1348                 dev_queue = dev_ingress_queue(dev);
1349                 if (dev_queue &&
1350                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1351                                        &q_idx, s_q_idx) < 0)
1352                         goto done;
1353
1354 cont:
1355                 idx++;
1356         }
1357
1358 done:
1359         rcu_read_unlock();
1360
1361         cb->args[0] = idx;
1362         cb->args[1] = q_idx;
1363
1364         return skb->len;
1365 }
1366
1367
1368
1369 /************************************************
1370  *      Traffic classes manipulation.           *
1371  ************************************************/
1372
1373
1374
1375 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1376 {
1377         struct net *net = sock_net(skb->sk);
1378         struct tcmsg *tcm = nlmsg_data(n);
1379         struct nlattr *tca[TCA_MAX + 1];
1380         struct net_device *dev;
1381         struct Qdisc *q = NULL;
1382         const struct Qdisc_class_ops *cops;
1383         unsigned long cl = 0;
1384         unsigned long new_cl;
1385         u32 portid = tcm->tcm_parent;
1386         u32 clid = tcm->tcm_handle;
1387         u32 qid = TC_H_MAJ(clid);
1388         int err;
1389
1390         if ((n->nlmsg_type != RTM_GETTCLASS) && !capable(CAP_NET_ADMIN))
1391                 return -EPERM;
1392
1393         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1394         if (!dev)
1395                 return -ENODEV;
1396
1397         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1398         if (err < 0)
1399                 return err;
1400
1401         /*
1402            parent == TC_H_UNSPEC - unspecified parent.
1403            parent == TC_H_ROOT   - class is root, which has no parent.
1404            parent == X:0         - parent is root class.
1405            parent == X:Y         - parent is a node in hierarchy.
1406            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1407
1408            handle == 0:0         - generate handle from kernel pool.
1409            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1410            handle == X:Y         - class is exactly X:Y.
1411            handle == X:0         - root class.
1412          */
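        /*
         * Illustrative example (values are hypothetical): for a class 1:a
         * created directly under root qdisc 1:, user space sends
         * tcm_parent = 0x00010000 (or TC_H_ROOT) and tcm_handle = 0x0001000a;
         * TC_H_MAJ() keeps the upper 16 bits (0x00010000) and TC_H_MIN()
         * the lower 16 bits.
         */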
1413
1414         /* Step 1. Determine qdisc handle X:0 */
1415
1416         if (portid != TC_H_ROOT) {
1417                 u32 qid1 = TC_H_MAJ(portid);
1418
1419                 if (qid && qid1) {
1420                         /* If both majors are known, they must be identical. */
1421                         if (qid != qid1)
1422                                 return -EINVAL;
1423                 } else if (qid1) {
1424                         qid = qid1;
1425                 } else if (qid == 0)
1426                         qid = dev->qdisc->handle;
1427
1428                 /* Now qid is genuine qdisc handle consistent
1429                  * both with parent and child.
1430                  *
1431                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
1432                  */
1433                 if (portid)
1434                         portid = TC_H_MAKE(qid, portid);
1435         } else {
1436                 if (qid == 0)
1437                         qid = dev->qdisc->handle;
1438         }
1439
1440         /* OK. Locate qdisc */
1441         q = qdisc_lookup(dev, qid);
1442         if (!q)
1443                 return -ENOENT;
1444
1445         /* And check that it supports classes */
1446         cops = q->ops->cl_ops;
1447         if (cops == NULL)
1448                 return -EINVAL;
1449
1450         /* Now try to get class */
1451         if (clid == 0) {
1452                 if (portid == TC_H_ROOT)
1453                         clid = qid;
1454         } else
1455                 clid = TC_H_MAKE(qid, clid);
1456
1457         if (clid)
1458                 cl = cops->get(q, clid);
1459
1460         if (cl == 0) {
1461                 err = -ENOENT;
1462                 if (n->nlmsg_type != RTM_NEWTCLASS ||
1463                     !(n->nlmsg_flags & NLM_F_CREATE))
1464                         goto out;
1465         } else {
1466                 switch (n->nlmsg_type) {
1467                 case RTM_NEWTCLASS:
1468                         err = -EEXIST;
1469                         if (n->nlmsg_flags & NLM_F_EXCL)
1470                                 goto out;
1471                         break;
1472                 case RTM_DELTCLASS:
1473                         err = -EOPNOTSUPP;
1474                         if (cops->delete)
1475                                 err = cops->delete(q, cl);
1476                         if (err == 0)
1477                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1478                         goto out;
1479                 case RTM_GETTCLASS:
1480                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1481                         goto out;
1482                 default:
1483                         err = -EINVAL;
1484                         goto out;
1485                 }
1486         }
1487
1488         new_cl = cl;
1489         err = -EOPNOTSUPP;
1490         if (cops->change)
1491                 err = cops->change(q, clid, portid, tca, &new_cl);
1492         if (err == 0)
1493                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1494
1495 out:
1496         if (cl)
1497                 cops->put(q, cl);
1498
1499         return err;
1500 }
1501
1502
1503 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1504                           unsigned long cl,
1505                           u32 portid, u32 seq, u16 flags, int event)
1506 {
1507         struct tcmsg *tcm;
1508         struct nlmsghdr  *nlh;
1509         unsigned char *b = skb_tail_pointer(skb);
1510         struct gnet_dump d;
1511         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1512
1513         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1514         if (!nlh)
1515                 goto out_nlmsg_trim;
1516         tcm = nlmsg_data(nlh);
1517         tcm->tcm_family = AF_UNSPEC;
1518         tcm->tcm__pad1 = 0;
1519         tcm->tcm__pad2 = 0;
1520         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1521         tcm->tcm_parent = q->handle;
1522         tcm->tcm_handle = q->handle;
1523         tcm->tcm_info = 0;
1524         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1525                 goto nla_put_failure;
1526         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1527                 goto nla_put_failure;
1528
1529         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1530                                          qdisc_root_sleeping_lock(q), &d) < 0)
1531                 goto nla_put_failure;
1532
1533         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1534                 goto nla_put_failure;
1535
1536         if (gnet_stats_finish_copy(&d) < 0)
1537                 goto nla_put_failure;
1538
1539         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1540         return skb->len;
1541
1542 out_nlmsg_trim:
1543 nla_put_failure:
1544         nlmsg_trim(skb, b);
1545         return -1;
1546 }
1547
1548 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1549                          struct nlmsghdr *n, struct Qdisc *q,
1550                          unsigned long cl, int event)
1551 {
1552         struct sk_buff *skb;
1553         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1554
1555         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1556         if (!skb)
1557                 return -ENOBUFS;
1558
1559         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1560                 kfree_skb(skb);
1561                 return -EINVAL;
1562         }
1563
1564         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1565                               n->nlmsg_flags & NLM_F_ECHO);
1566 }
1567
1568 struct qdisc_dump_args {
1569         struct qdisc_walker     w;
1570         struct sk_buff          *skb;
1571         struct netlink_callback *cb;
1572 };
1573
1574 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1575 {
1576         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1577
1578         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1579                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1580 }
1581
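/* Dump the classes of one qdisc.  Qdiscs that are invisible to dumps, have
 * no class ops, or do not match the requested parent are skipped (but still
 * counted).  cb->args[1] carries the per-qdisc skip offset between dump
 * calls.
 */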
1582 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1583                                 struct tcmsg *tcm, struct netlink_callback *cb,
1584                                 int *t_p, int s_t)
1585 {
1586         struct qdisc_dump_args arg;
1587
1588         if (tc_qdisc_dump_ignore(q) ||
1589             *t_p < s_t || !q->ops->cl_ops ||
1590             (tcm->tcm_parent &&
1591              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1592                 (*t_p)++;
1593                 return 0;
1594         }
1595         if (*t_p > s_t)
1596                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1597         arg.w.fn = qdisc_class_dump;
1598         arg.skb = skb;
1599         arg.cb = cb;
1600         arg.w.stop  = 0;
1601         arg.w.skip = cb->args[1];
1602         arg.w.count = 0;
1603         q->ops->cl_ops->walk(q, &arg.w);
1604         cb->args[1] = arg.w.count;
1605         if (arg.w.stop)
1606                 return -1;
1607         (*t_p)++;
1608         return 0;
1609 }
1610
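/* Dump the classes of @root and of every other qdisc linked on root->list. */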
1611 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1612                                struct tcmsg *tcm, struct netlink_callback *cb,
1613                                int *t_p, int s_t)
1614 {
1615         struct Qdisc *q;
1616
1617         if (!root)
1618                 return 0;
1619
1620         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1621                 return -1;
1622
1623         list_for_each_entry(q, &root->list, list) {
1624                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1625                         return -1;
1626         }
1627
1628         return 0;
1629 }
1630
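/* RTM_GETTCLASS dump handler: walk both the device's regular qdisc tree and
 * its ingress qdisc, resuming from the qdisc index saved in cb->args[0].
 */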
1631 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1632 {
1633         struct tcmsg *tcm = nlmsg_data(cb->nlh);
1634         struct net *net = sock_net(skb->sk);
1635         struct netdev_queue *dev_queue;
1636         struct net_device *dev;
1637         int t, s_t;
1638
1639         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1640                 return 0;
1641         dev = dev_get_by_index(net, tcm->tcm_ifindex);
1642         if (!dev)
1643                 return 0;
1644
1645         s_t = cb->args[0];
1646         t = 0;
1647
1648         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1649                 goto done;
1650
1651         dev_queue = dev_ingress_queue(dev);
1652         if (dev_queue &&
1653             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1654                                 &t, s_t) < 0)
1655                 goto done;
1656
1657 done:
1658         cb->args[0] = t;
1659
1660         dev_put(dev);
1661         return skb->len;
1662 }
1663
1664 /* Main classifier routine: scans the classifier chain attached
1665  * to this qdisc, skips filters whose protocol does not match
1666  * (ETH_P_ALL matches anything) and asks each classifier in turn.
1667  */
1668 int tc_classify_compat(struct sk_buff *skb, const struct tcf_proto *tp,
1669                        struct tcf_result *res)
1670 {
1671         __be16 protocol = skb->protocol;
1672         int err;
1673
1674         for (; tp; tp = tp->next) {
1675                 if (tp->protocol != protocol &&
1676                     tp->protocol != htons(ETH_P_ALL))
1677                         continue;
1678                 err = tp->classify(skb, tp, res);
1679
1680                 if (err >= 0) {
1681 #ifdef CONFIG_NET_CLS_ACT
1682                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1683                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1684 #endif
1685                         return err;
1686                 }
1687         }
1688         return -1;
1689 }
1690 EXPORT_SYMBOL(tc_classify_compat);
1691
1692 int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
1693                 struct tcf_result *res)
1694 {
1695         int err = 0;
1696 #ifdef CONFIG_NET_CLS_ACT
1697         const struct tcf_proto *otp = tp;
1698 reclassify:
1699 #endif
1700
1701         err = tc_classify_compat(skb, tp, res);
1702 #ifdef CONFIG_NET_CLS_ACT
1703         if (err == TC_ACT_RECLASSIFY) {
1704                 u32 verd = G_TC_VERD(skb->tc_verd);
1705                 tp = otp;
1706
1707                 if (verd++ >= MAX_REC_LOOP) {
1708                         net_notice_ratelimited("%s: packet reclassify loop rule prio %u protocol %02x\n",
1709                                                tp->q->ops->id,
1710                                                tp->prio & 0xffff,
1711                                                ntohs(tp->protocol));
1712                         return TC_ACT_SHOT;
1713                 }
1714                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1715                 goto reclassify;
1716         }
1717 #endif
1718         return err;
1719 }
1720 EXPORT_SYMBOL(tc_classify);
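/* Illustrative sketch only (not part of this file): a classful qdisc's
 * ->enqueue() typically classifies a packet roughly like this, where
 * "q->filter_list" stands for whatever filter chain the qdisc keeps:
 *
 *	struct tcf_result res;
 *	int result;
 *
 *	result = tc_classify(skb, q->filter_list, &res);
 *	if (result >= 0)
 *		... map res.classid to one of the qdisc's classes ...
 *
 * How the individual TC_ACT_* verdicts are handled is up to each qdisc.
 */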
1721
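/* tcf_destroy()/tcf_destroy_chain() free one filter or an entire filter
 * chain; qdiscs typically call tcf_destroy_chain() on their filter list
 * from their ->destroy() callback.
 */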
1722 void tcf_destroy(struct tcf_proto *tp)
1723 {
1724         tp->ops->destroy(tp);
1725         module_put(tp->ops->owner);
1726         kfree(tp);
1727 }
1728
1729 void tcf_destroy_chain(struct tcf_proto **fl)
1730 {
1731         struct tcf_proto *tp;
1732
1733         while ((tp = *fl) != NULL) {
1734                 *fl = tp->next;
1735                 tcf_destroy(tp);
1736         }
1737 }
1738 EXPORT_SYMBOL(tcf_destroy_chain);
1739
1740 #ifdef CONFIG_PROC_FS
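/* /proc/net/psched reports four 32-bit hex words describing the packet
 * scheduler clock: nanoseconds per microsecond, nanoseconds per psched
 * tick, the constant 1000000, and the hrtimer resolution expressed as a
 * frequency in Hz.  Userspace (presumably iproute2's tc) reads these to
 * convert between its time units and kernel ticks.
 */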
1741 static int psched_show(struct seq_file *seq, void *v)
1742 {
1743         struct timespec ts;
1744
1745         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1746         seq_printf(seq, "%08x %08x %08x %08x\n",
1747                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1748                    1000000,
1749                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1750
1751         return 0;
1752 }
1753
1754 static int psched_open(struct inode *inode, struct file *file)
1755 {
1756         return single_open(file, psched_show, NULL);
1757 }
1758
1759 static const struct file_operations psched_fops = {
1760         .owner = THIS_MODULE,
1761         .open = psched_open,
1762         .read  = seq_read,
1763         .llseek = seq_lseek,
1764         .release = single_release,
1765 };
1766
1767 static int __net_init psched_net_init(struct net *net)
1768 {
1769         struct proc_dir_entry *e;
1770
1771         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1772         if (e == NULL)
1773                 return -ENOMEM;
1774
1775         return 0;
1776 }
1777
1778 static void __net_exit psched_net_exit(struct net *net)
1779 {
1780         proc_net_remove(net, "psched");
1781 }
1782 #else
1783 static int __net_init psched_net_init(struct net *net)
1784 {
1785         return 0;
1786 }
1787
1788 static void __net_exit psched_net_exit(struct net *net)
1789 {
1790 }
1791 #endif
1792
1793 static struct pernet_operations psched_net_ops = {
1794         .init = psched_net_init,
1795         .exit = psched_net_exit,
1796 };
1797
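/* Subsystem initcall: register the per-netns /proc/net/psched entry, the
 * built-in fifo/mq qdiscs and the rtnetlink handlers for qdisc and traffic
 * class messages.
 */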
1798 static int __init pktsched_init(void)
1799 {
1800         int err;
1801
1802         err = register_pernet_subsys(&psched_net_ops);
1803         if (err) {
1804                 pr_err("pktsched_init: "
1805                        "cannot initialize per netns operations\n");
1806                 return err;
1807         }
1808
1809         register_qdisc(&pfifo_qdisc_ops);
1810         register_qdisc(&bfifo_qdisc_ops);
1811         register_qdisc(&pfifo_head_drop_qdisc_ops);
1812         register_qdisc(&mq_qdisc_ops);
1813
1814         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
1815         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
1816         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc, NULL);
1817         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
1818         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
1819         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass, NULL);
1820
1821         return 0;
1822 }
1823
1824 subsys_initcall(pktsched_init);