/*
 * net/sched/estimator.c	Simple rate estimator.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <asm/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/proc_fs.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/*
   This code is NOT intended to be used for statistics collection,
   its purpose is to provide a base for statistical multiplexing
   for controlled load service.
   If you need only statistics, run a user level daemon which
   periodically reads byte counters.

   Unfortunately, rate estimation is not a very easy task.
   For example, I did not find a simple way to estimate the current peak
   rate, and even failed to formulate the problem 8)

   So I preferred not to build an estimator into the scheduler,
   but to run this task separately.
   Ideally, it should be kernel thread(s), but for now it runs
   from timers. This puts an apparent upper bound on the number of rated
   flows, but has minimal overhead when that number is small, and it is
   enough to handle controlled load service and sets of aggregates.

   We measure rate over A=(1<<interval) seconds and evaluate EWMA:

   avrate = avrate*(1-W) + rate*W

   where W is chosen as a negative power of 2: W = 2^(-ewma_log)

   The resulting time constant is:

   T = A/(-ln(1-W))

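   For example (illustrative numbers, not from the original text): with
   interval=0, i.e. A = 1 sec, and ewma_log=3, i.e. W = 1/8,

   T = 1/(-ln(7/8)) ~= 7.5 sec,

   close to the small-W approximation T ~= A*2^ewma_log = 8 sec.
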

   NOTES.

   * The stored value for avbps is scaled by 2^5, so that the maximal
     representable rate is ~1Gbit (2^32 >> 5 = 2^27 bytes/sec, i.e. about
     1.07 Gbit/sec); avpps is scaled by 2^10. A worked example of this
     fixed point scheme follows this comment.

   * The minimal interval is HZ/4 = 250 msec (a period that is a whole
     number of jiffies for both HZ=100 and HZ=1024 8)), the maximal interval
     is (HZ*2^EST_MAX_INTERVAL)/4 = 8 sec. Shorter intervals
     are too expensive, longer ones can be implemented
     at user level painlessly.
 */
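
/*
 * Worked example of the scaling above (illustrative numbers, not part of
 * the original comment). The period for list index idx is 2^(idx-2) sec,
 * so bytes/sec = delta >> (idx - 2); keeping the extra 2^5 scale turns the
 * net shift into << (7 - idx), which is what est_timer() computes. With
 * idx=2 (a 1 sec period) and 12500000 bytes (100 Mbit) since the last run:
 *
 *	rate   = (nbytes - last_bytes) << (7 - idx)
 *	       = 12500000 << 5 = 400000000	(bytes/sec, scaled by 2^5)
 *	avbps += ((long)rate - (long)avbps) >> ewma_log;
 *
 * The last line is avbps = avbps*(1-W) + rate*W with W = 2^(-ewma_log) in
 * pure integer arithmetic; the published value is (avbps + 0xF) >> 5.
 * The pps path is identical with scale 2^10, i.e. << (12 - idx).
 */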

#define EST_MAX_INTERVAL	5

struct qdisc_estimator
{
	struct qdisc_estimator	*next;		/* next estimator on the same interval list */
	struct tc_stats		*stats;		/* counters of the flow being rated */
	unsigned		interval;	/* index into elist[], 0..EST_MAX_INTERVAL */
	int			ewma_log;	/* W = 2^(-ewma_log) */
	u64			last_bytes;	/* byte counter at the previous sample */
	u32			last_packets;	/* packet counter at the previous sample */
	u32			avpps;		/* EWMA packet rate, scaled by 2^10 */
	u32			avbps;		/* EWMA byte rate, scaled by 2^5 */
};

struct qdisc_estimator_head
{
	struct timer_list	timer;		/* fires every (HZ<<idx)/4 jiffies */
	struct qdisc_estimator	*list;		/* estimators sampled at this interval */
};

static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];

/* Estimator list lock: est_timer() takes it for reading, the configuration
   paths take it for writing with BHs disabled, so a timer never runs
   against a half-updated list. */
static rwlock_t est_lock = RW_LOCK_UNLOCKED;

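/* Per-interval timer handler: samples the counters of every estimator on
 * list 'arg', feeds the deltas into the EWMA, publishes the descaled
 * results in tc_stats and re-arms itself. */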
static void est_timer(unsigned long arg)
{
	int idx = (int)arg;
	struct qdisc_estimator *e;

	read_lock(&est_lock);
	for (e = elist[idx].list; e; e = e->next) {
		struct tc_stats *st = e->stats;
		u64 nbytes;
		u32 npackets;
		u32 rate;

		spin_lock(st->lock);
		nbytes = st->bytes;
		npackets = st->packets;

		/* Bytes/sec scaled by 2^5; the period is 2^(idx-2) sec,
		   hence the net shift of (7 - idx). */
		rate = (nbytes - e->last_bytes)<<(7 - idx);
		e->last_bytes = nbytes;
		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
		st->bps = (e->avbps+0xF)>>5;

		/* Packets/sec, same scheme scaled by 2^10. */
		rate = (npackets - e->last_packets)<<(12 - idx);
		e->last_packets = npackets;
		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
		e->stats->pps = (e->avpps+0x1FF)>>10;
		spin_unlock(st->lock);
	}

	/* Re-arm for the next period of this interval. */
	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
	read_unlock(&est_lock);
}

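/* Creates an estimator for 'stats' from a tc_estimator rtnetlink attribute,
 * links it onto the list for its sampling interval, and starts that
 * interval's timer if the list was empty. Returns 0 or a negative errno. */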
int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt)
{
	struct qdisc_estimator *est;
	struct tc_estimator *parm = RTA_DATA(opt);

	if (RTA_PAYLOAD(opt) < sizeof(*parm))
		return -EINVAL;

	/* User intervals -2..3 map to list indices 0..5,
	   i.e. periods of 250 msec up to 8 sec. */
	if (parm->interval < -2 || parm->interval > 3)
		return -EINVAL;

	est = kmalloc(sizeof(*est), GFP_KERNEL);
	if (est == NULL)
		return -ENOBUFS;

	memset(est, 0, sizeof(*est));
	est->interval = parm->interval + 2;
	est->stats = stats;
	est->ewma_log = parm->ewma_log;
	est->last_bytes = stats->bytes;
	est->avbps = stats->bps<<5;
	est->last_packets = stats->packets;
	est->avpps = stats->pps<<10;

	est->next = elist[est->interval].list;
	if (est->next == NULL) {
		/* First estimator on this interval: start its timer. */
		init_timer(&elist[est->interval].timer);
		elist[est->interval].timer.data = est->interval;
		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
		elist[est->interval].timer.function = est_timer;
		add_timer(&elist[est->interval].timer);
	}
	write_lock_bh(&est_lock);
	elist[est->interval].list = est;
	write_unlock_bh(&est_lock);
	return 0;
}
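
/*
 * A sketch of a typical call site (hypothetical: the qdisc and attribute
 * names follow the kernel's traffic control API of this generation, and the
 * error handling is illustrative only):
 *
 *	static int my_qdisc_init(struct Qdisc *sch, struct rtattr **tca)
 *	{
 *		...
 *		if (tca[TCA_RATE-1]) {
 *			int err = qdisc_new_estimator(&sch->stats,
 *						      tca[TCA_RATE-1]);
 *			if (err)
 *				return err;
 *		}
 *		...
 *	}
 *
 * with a matching qdisc_kill_estimator(&sch->stats) in the destroy path.
 */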
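/* Unlinks and frees every estimator attached to 'stats', and stops an
 * interval's timer once its list becomes empty. */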
void qdisc_kill_estimator(struct tc_stats *stats)
{
	int idx;
	struct qdisc_estimator *est, **pest;

	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
		int killed = 0;
		pest = &elist[idx].list;
		while ((est=*pest) != NULL) {
			if (est->stats != stats) {
				pest = &est->next;
				continue;
			}

			/* Unlink under the write lock so a concurrently
			   running est_timer() never follows a freed pointer. */
			write_lock_bh(&est_lock);
			*pest = est->next;
			write_unlock_bh(&est_lock);

			kfree(est);
			killed++;
		}
		if (killed && elist[idx].list == NULL)
			del_timer(&elist[idx].timer);
	}
}