1 /*
2  * net/sched/estimator.c	Simple rate estimator.
3  *
4  *		This program is free software; you can redistribute it and/or
5  *		modify it under the terms of the GNU General Public License
6  *		as published by the Free Software Foundation; either version
7  *		2 of the License, or (at your option) any later version.
8  *
9  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  */
11 
12 #include <asm/uaccess.h>
13 #include <asm/system.h>
14 #include <asm/bitops.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/sched.h>
18 #include <linux/string.h>
19 #include <linux/mm.h>
20 #include <linux/socket.h>
21 #include <linux/sockios.h>
22 #include <linux/in.h>
23 #include <linux/errno.h>
24 #include <linux/interrupt.h>
25 #include <linux/netdevice.h>
26 #include <linux/skbuff.h>
27 #include <linux/rtnetlink.h>
28 #include <linux/init.h>
29 #include <linux/proc_fs.h>
30 #include <net/sock.h>
31 #include <net/pkt_sched.h>
32 
33 /*
34    This code is NOT intended to be used for statistics collection,
35    its purpose is to provide a base for statistical multiplexing
36    for controlled load service.
37    If you need only statistics, run a user level daemon which
38    periodically reads byte counters.
39 
40    Unfortunately, rate estimation is not a very easy task.
41    F.e. I did not find a simple way to estimate the current peak rate
42    and even failed to formulate the problem 8)8)
43 
   So I preferred not to build an estimator into the scheduler,
   but to run this task separately.
   Ideally, it should be a kernel thread (or threads), but for now it runs
   from timers, which puts an apparent upper bound on the number of rated
   flows, has minimal overhead when that number is small, and is enough
   to handle controlled load service and sets of aggregates.
50 
51    We measure rate over A=(1<<interval) seconds and evaluate EWMA:
52 
53    avrate = avrate*(1-W) + rate*W
54 
55    where W is chosen as negative power of 2: W = 2^(-ewma_log)
56 
57    The resulting time constant is:
58 
59    T = A/(-ln(1-W))
60 
61 
62    NOTES.
63 
64    * The stored value for avbps is scaled by 2^5, so that maximal
65      rate is ~1Gbit, avpps is scaled by 2^10.
66 
67    * Minimal interval is HZ/4=250msec (it is the greatest common divisor
68      for HZ=100 and HZ=1024 8)), maximal interval
69      is (HZ*2^EST_MAX_INTERVAL)/4 = 8sec. Shorter intervals
70      are too expensive, longer ones can be implemented
71      at user level painlessly.
72  */
73 
74 #define EST_MAX_INTERVAL	5
75 
/*
 * One rate estimator: keeps an EWMA of the byte and packet rates of the
 * tc_stats block it is attached to (see est_timer() for the update step).
 */
struct qdisc_estimator
{
	struct qdisc_estimator	*next;		/* next entry on the per-interval list */
	struct tc_stats		*stats;		/* counters sampled; bps/pps written back */
	unsigned		interval;	/* index into elist[]; period is (1<<interval)/4 sec */
	int			ewma_log;	/* smoothing weight W = 2^(-ewma_log) */
	u64			last_bytes;	/* stats->bytes at the previous sample */
	u32			last_packets;	/* stats->packets at the previous sample */
	u32			avpps;		/* average packets/sec, scaled by 2^10 */
	u32			avbps;		/* average bytes/sec, scaled by 2^5 */
};
87 
/* Per-interval bucket: one timer drives every estimator of that period. */
struct qdisc_estimator_head
{
	struct timer_list	timer;	/* fires every (HZ<<interval)/4 jiffies */
	struct qdisc_estimator	*list;	/* estimators serviced by this timer */
};
93 
/* One bucket per supported interval class, 0..EST_MAX_INTERVAL. */
static struct qdisc_estimator_head elist[EST_MAX_INTERVAL+1];

/* Estimator array lock: est_timer() is the reader, add/kill paths write. */
static rwlock_t est_lock = RW_LOCK_UNLOCKED;
98 
/*
 * est_timer - periodic EWMA update for all estimators of one interval class
 * @arg: index into elist[] (the interval class, 0..EST_MAX_INTERVAL)
 *
 * Runs from a timer (BH context).  Holds est_lock for reading so the
 * add/kill paths cannot reshape the list underneath the walk, and takes
 * each stats block's spinlock while sampling its counters.
 */
static void est_timer(unsigned long arg)
{
	int idx = (int)arg;
	struct qdisc_estimator *e;

	read_lock(&est_lock);
	for (e = elist[idx].list; e; e = e->next) {
		struct tc_stats *st = e->stats;
		u64 nbytes;
		u32 npackets;
		u32 rate;

		spin_lock(st->lock);
		nbytes = st->bytes;
		npackets = st->packets;
		/* bytes/sec scaled by 2^5 over a (1<<idx)/4 sec window:
		 * delta * 4 * 32 >> idx == delta << (7 - idx) */
		rate = (nbytes - e->last_bytes)<<(7 - idx);
		e->last_bytes = nbytes;
		/* EWMA step with W = 2^(-ewma_log): av += (rate - av) * W */
		e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log;
		st->bps = (e->avbps+0xF)>>5;	/* strip 2^5 scaling, with rounding */

		/* packets/sec scaled by 2^10: delta * 4 * 1024 >> idx */
		rate = (npackets - e->last_packets)<<(12 - idx);
		e->last_packets = npackets;
		e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log;
		e->stats->pps = (e->avpps+0x1FF)>>10;	/* strip 2^10 scaling, with rounding */
		spin_unlock(st->lock);
	}

	/* Re-arm before dropping the lock; period is (HZ<<idx)/4 jiffies. */
	mod_timer(&elist[idx].timer, jiffies + ((HZ<<idx)/4));
	read_unlock(&est_lock);
}
129 
qdisc_new_estimator(struct tc_stats * stats,struct rtattr * opt)130 int qdisc_new_estimator(struct tc_stats *stats, struct rtattr *opt)
131 {
132 	struct qdisc_estimator *est;
133 	struct tc_estimator *parm = RTA_DATA(opt);
134 
135 	if (RTA_PAYLOAD(opt) < sizeof(*parm))
136 		return -EINVAL;
137 
138 	if (parm->interval < -2 || parm->interval > 3)
139 		return -EINVAL;
140 
141 	est = kmalloc(sizeof(*est), GFP_KERNEL);
142 	if (est == NULL)
143 		return -ENOBUFS;
144 
145 	memset(est, 0, sizeof(*est));
146 	est->interval = parm->interval + 2;
147 	est->stats = stats;
148 	est->ewma_log = parm->ewma_log;
149 	est->last_bytes = stats->bytes;
150 	est->avbps = stats->bps<<5;
151 	est->last_packets = stats->packets;
152 	est->avpps = stats->pps<<10;
153 
154 	est->next = elist[est->interval].list;
155 	if (est->next == NULL) {
156 		init_timer(&elist[est->interval].timer);
157 		elist[est->interval].timer.data = est->interval;
158 		elist[est->interval].timer.expires = jiffies + ((HZ<<est->interval)/4);
159 		elist[est->interval].timer.function = est_timer;
160 		add_timer(&elist[est->interval].timer);
161 	}
162 	write_lock_bh(&est_lock);
163 	elist[est->interval].list = est;
164 	write_unlock_bh(&est_lock);
165 	return 0;
166 }
167 
/*
 * qdisc_kill_estimator - detach and free every estimator feeding @stats
 * @stats: stats block whose estimators are to be removed
 *
 * Walks all interval lists; each unlink is performed under the writer lock
 * so a concurrent est_timer() never follows a freed ->next pointer, and the
 * kfree() happens only after the lock is dropped.  When the last estimator
 * of an interval class is gone, its timer is stopped.
 *
 * NOTE(review): del_timer() does not wait for a handler already running on
 * another CPU, and est_timer() unconditionally re-arms itself; a racing run
 * could therefore leave the timer pending on an empty list — presumably
 * benign (the walk is a no-op), but worth confirming against the callers.
 */
void qdisc_kill_estimator(struct tc_stats *stats)
{
	int idx;
	struct qdisc_estimator *est, **pest;

	for (idx=0; idx <= EST_MAX_INTERVAL; idx++) {
		int killed = 0;
		pest = &elist[idx].list;
		while ((est=*pest) != NULL) {
			/* keep entries that feed other stats blocks */
			if (est->stats != stats) {
				pest = &est->next;
				continue;
			}

			/* unlink under the lock; free outside it */
			write_lock_bh(&est_lock);
			*pest = est->next;
			write_unlock_bh(&est_lock);

			kfree(est);
			killed++;
		}
		if (killed && elist[idx].list == NULL)
			del_timer(&elist[idx].timer);
	}
}
193 
194