1#!/bin/bash
2# SPDX-License-Identifier: GPL-2.0
3#
4# Copyright (c) 2019 Facebook
5#
6# This program is free software; you can redistribute it and/or
7# modify it under the terms of version 2 of the GNU General Public
8# License as published by the Free Software Foundation.
9
#######################################
# Print the help text for this script and terminate the shell.
# Globals:   name (read) - script name shown in the synopsis line
# Outputs:   help text on stdout
# Returns:   does not return; exits with the status of the last echo
#######################################
Usage() {
  # Unquoted delimiter on purpose: $name must be expanded in the synopsis.
  cat <<EOF
Script for testing HBM (Host Bandwidth Manager) framework.
It creates a cgroup to use for testing and loads a BPF program to limit
egress or ingress bandwidth. It then uses iperf3 or netperf to create
loads. The output is the goodput in Mbps (unless -D was used).

USAGE: $name [out] [-b=<prog>|--bpf=<prog>] [-c=<cc>|--cc=<cc>]
             [-D] [-d=<delay>|--delay=<delay>] [--debug] [-E] [--edt]
             [-f=<#flows>|--flows=<#flows>] [-h] [-i=<id>|--id=<id>]
             [-l] [-N] [--no_cn] [-p=<port>|--port=<port>] [-P]
             [-q=<qdisc>] [-r=<rate>|--rate=<rate>] [-R]
             [-s=<server>|--server=<server>] [-S|--stats]
             [-t=<time>|--time=<time>] [-w] [cubic|dctcp]
  Where:
    out               egress (default)
    -b or --bpf       BPF program filename to load and attach.
                      Default is hbm_out_kern.o for egress.
    -c or --cc        TCP congestion control (cubic or dctcp)
    --debug           print BPF trace buffer
    -d or --delay     add a delay in ms using netem
    -D                In addition to the goodput in Mbps, it also outputs
                      other detailed information. This information is
                      test dependent (i.e. iperf3 or netperf).
    -E                enable ECN (not required for dctcp)
    --edt             use fq's Earliest Departure Time (requires fq)
    -f or --flows     number of concurrent flows (default=1)
    -i or --id        cgroup id (an integer, default is 1)
    -N                use netperf instead of iperf3
    --no_cn           Do not return CN notifications
    -l                do not limit flows using loopback
    -h                Help
    -p or --port      iperf3 port (default is 5201)
    -P                use an iperf3 instance for each flow
    -q                use the specified qdisc
    -r or --rate      rate in Mbps (default is 1Gbps)
    -R                Use TCP_RR for netperf. 1st flow has req
                      size of 10KB, rest of 1MB. Reply in all
                      cases is 1 byte.
                      More detailed output for each flow can be found
                      in the files netperf.<cg>.<flow>, where <cg> is the
                      cgroup id as specified with the -i flag, and <flow>
                      is the flow id starting at 1 and increasing by 1 for
                      each flow (as specified by -f).
    -s or --server    hostname of netperf server. Used to create netperf
                      test traffic between two hosts (default is within host)
                      netserver must be running on the host.
    -S or --stats     whether to update hbm stats (default is yes).
    -t or --time      duration of iperf3 in seconds (default=5)
    -w                Work conserving flag. cgroup can increase its
                      bandwidth beyond the rate limit specified
                      while there is available bandwidth. Current
                      implementation assumes there is only one NIC
                      (eth0), but can be extended to support multiple
                       NICs.
    cubic or dctcp    specify which TCP CC to use
EOF
  echo " "
  exit
}
67
68#set -x
69
# --- Invocation -------------------------------------------------------
# All command-line words joined into one space-separated string;
# processArgs later splits it again on whitespace.
args="$*"
name="$0"

# --- hbm loader options -----------------------------------------------
dir='-o'            # direction flag handed to hbm
dir_name='out'      # suffix of the hbm.<id>.<dir_name> stats file
id=1                # cgroup id
rate=1000           # rate limit in Mbps
dur=5               # test duration in seconds
prog=''             # BPF program file (empty: none passed to hbm)
flags=''            # extra flags accumulated for hbm
do_stats=0          # 1 when -S/--stats was given
debug_flag=0        # 1 when --debug was given

# --- traffic generator options ----------------------------------------
use_netperf=0       # 0: iperf3, 1: netperf
multi_iperf=0       # 1: one iperf3 instance per flow
rr=0                # 1: netperf TCP_RR
flows=1             # number of concurrent flows
flow_cnt=1          # loop counter over flows
port=5201           # iperf3 port
server=''           # remote netperf server (empty: local host)
details=0           # 1: print detailed output (-D)

# --- network tuning ---------------------------------------------------
cc=x                # congestion control; "x" means leave unchanged
ecn=0               # 1: enable ECN for the test
netem=0             # netem delay in ms; 0 disables netem
qdisc=''            # qdisc to install (empty: none)
93
# Mount point used for the BPF filesystem.
BPFFS=/sys/fs/bpf

#######################################
# Ensure something is mounted at $BPFFS, mounting a bpf filesystem there
# if the path does not show up in the output of `mount`.
# Globals:   BPFFS (read)
# Outputs:   a one-line status message on stdout
# Returns:   0 if already mounted, otherwise the status of `mount`
#######################################
config_bpffs() {
	if mount | grep "$BPFFS" > /dev/null; then
		echo "bpffs already mounted"
		return
	fi

	echo "bpffs not mounted. Mounting..."
	mount -t bpf none "$BPFFS"
}
103
#######################################
# Launch ./hbm in the background and report its PID.
# hbm.out is recreated with the command line on its first line, a " "
# line after it, and then everything hbm writes to stdout/stderr.
# Globals:   dir, id, rate, dur, flags, dbg, prog (read)
#            NOTE(review): dbg is not assigned anywhere in the visible
#            script, so it expands to nothing unless set by the caller.
# Outputs:   PID of the background hbm process on stdout
#######################################
start_hbm() {
  local hbm_cmd="./hbm $dir -n $id -r $rate -t $dur $flags $dbg $prog"

  rm -f hbm.out
  {
    echo "$hbm_cmd"
    echo " "
  } > hbm.out

  # Intentionally unquoted: the string must be split back into words.
  $hbm_cmd >> hbm.out 2>&1  &
  echo $!
}
111
#######################################
# Parse the command-line words stored in $args and set the matching
# script-level option variables.
# Globals:   args (read); dir, dir_name, prog, cc, flags, debug_flag,
#            netem, details, ecn, qdisc, flows, id, use_netperf, port,
#            multi_iperf, rate, rr, server, do_stats, dur (written)
# Outputs:   "Unknown arg:<word>" on stdout for an unrecognised word
# Returns:   does not return for -h or an unknown word: Usage is called,
#            which prints the help text and exits
#######################################
processArgs () {
  local arg

  # $args is deliberately unquoted so it is split back into words.
  for arg in $args ; do
    case "$arg" in
    # Support for upcoming ingress rate limiting
    #in)
    #  dir="-i"
    #  dir_name="in"
    #  ;;
    out)
      dir="-o"
      dir_name="out"
      ;;
    -b=*|--bpf=*)
      prog="${arg#*=}"
      ;;
    -c=*|--cc=*)
      cc="${arg#*=}"
      ;;
    --no_cn)
      flags="$flags --no_cn"
      ;;
    --debug)
      flags="$flags -d"
      debug_flag=1
      ;;
    -d=*|--delay=*)
      netem="${arg#*=}"
      ;;
    -D)
      details=1
      ;;
    -E)
      ecn=1
      ;;
    --edt)
      flags="$flags --edt"
      qdisc="fq"
      ;;
    -f=*|--flows=*)
      flows="${arg#*=}"
      ;;
    -h)
      # Documented in the help text; show it without an error message.
      Usage
      ;;
    -i=*|--id=*)
      id="${arg#*=}"
      ;;
    -l)
      flags="$flags -l"
      ;;
    -N)
      use_netperf=1
      ;;
    -p=*|--port=*)
      port="${arg#*=}"
      ;;
    -P)
      multi_iperf=1
      ;;
    -q=*)
      qdisc="${arg#*=}"
      ;;
    -r=*|--rate=*)
      rate="${arg#*=}"
      ;;
    -R)
      rr=1
      ;;
    -s=*|--server=*)
      server="${arg#*=}"
      ;;
    -S|--stats)
      flags="$flags -s"
      do_stats=1
      ;;
    -t=*|--time=*)
      dur="${arg#*=}"
      ;;
    -w)
      flags="$flags -w"
      ;;
    cubic)
      cc=cubic
      ;;
    dctcp)
      cc=dctcp
      ;;
    *)
      echo "Unknown arg:$arg"
      Usage
      ;;
    esac
  done
}
203
# ---- Test setup --------------------------------------------------------
# Parse the options held in $args and make sure bpffs is mounted before
# hbm is started.
processArgs
config_bpffs

if [ "$debug_flag" -eq 1 ] ; then
  rm -f hbm_out.log
fi

# start_hbm prints the PID of the background hbm process.
hbm_pid=$(start_hbm)
# 100 ms pause after launching hbm (presumably so it can finish setting
# up before this shell joins the cgroup below - TODO confirm).
# `sleep 0.1` replaces `usleep 100000`, which many distros do not ship.
sleep 0.1

host=$(hostname)
cg_base_dir=/sys/fs/cgroup/unified
cg_dir="$cg_base_dir/cgroup-test-work-dir/hbm$id"

# Move this shell into the test cgroup.
echo $$ >> "$cg_dir/cgroup.procs"

ulimit -l unlimited

# Remove results left over from earlier runs.
rm -f ss.out
rm -f hbm.[0-9]*."$dir_name"

if [ "$ecn" -ne 0 ] ; then
  # Remember the pre-test ECN setting and restore it when the script
  # exits. The EXIT trap runs last, so it overrides the unconditional
  # reset to 0 done near the end of the script.
  cur_ecn=$(sysctl -n net.ipv4.tcp_ecn)
  if [ -n "$cur_ecn" ] ; then
    trap 'sysctl -w -q -n net.ipv4.tcp_ecn="$cur_ecn"' EXIT
  fi
  sysctl -w -q -n net.ipv4.tcp_ecn=1
fi

if [ "$use_netperf" -eq 0 ] ; then
  # Saved so the congestion control can be restored after the test.
  cur_cc=$(sysctl -n net.ipv4.tcp_congestion_control)
  if [ "$cc" != "x" ] ; then
    sysctl -w -q -n net.ipv4.tcp_congestion_control="$cc"
  fi
fi

if [ "$netem" -ne "0" ] ; then
  if [ "$qdisc" != "" ] ; then
    echo "WARNING: Ignoring -q options because -d option used"
  fi
  tc qdisc del dev lo root > /dev/null 2>&1
  tc qdisc add dev lo root netem delay "${netem}ms" > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
  tc qdisc del dev eth0 root > /dev/null 2>&1
  tc qdisc add dev eth0 root "$qdisc" > /dev/null 2>&1
fi

n=0
# Ping count: 5 probes per second of test at the 0.2 s interval below.
m=$((dur * 5))
hn="::1"
if [ "$use_netperf" -ne 0 ] && [ "$server" != "" ] ; then
  hn=$server
fi

# Background ping; ping.out is parsed later for the average delay.
( ping6 -i 0.2 -c "$m" "$hn" > ping.out 2>&1 ) &
# ---- Traffic generation and reporting ----------------------------------

# Print the "Details for HBM" header and, when stats were requested and
# the file exists, the contents of hbm.<id>.<dir_name>.
print_hbm_details() {
  echo ""
  echo "Details for HBM in cgroup $id"
  if [ "$do_stats" -eq 1 ] && [ -e "hbm.$id.$dir_name" ] ; then
    cat "hbm.$id.$dir_name"
  fi
}

# Print the average ping delay parsed from ping.out and the aggregate
# goodput held in $rate.
print_delay_and_goodput() {
  local delay
  delay=$(grep "avg" ping.out | grep -o "= [0-9.]*/[0-9.]*" \
            | grep -o "[0-9.]*$")
  echo "PING AVG DELAY:$delay"
  echo "AGGREGATE_GOODPUT:$rate"
}

if [ "$use_netperf" -ne 0 ] ; then
  # Remember whether a netserver was already running so that only one
  # started here is a candidate for being killed at the end.
  begNetserverPid=$(pgrep netserver)
  if [ "$begNetserverPid" == "" ] && [ "$server" == "" ] ; then
    ( ./netserver > /dev/null 2>&1) &
    sleep 0.1
  fi

  if [ "$server" == "" ] ; then
    np_server=$host
  else
    np_server=$server
  fi
  # np_cc is expanded unquoted below so it yields two words (or none).
  if [ "$cc" == "x" ] ; then
    np_cc=""
  else
    np_cc="-K $cc,$cc"
  fi

  replySize=1
  flow_cnt=1
  while [ "$flow_cnt" -le "$flows" ] ; do
    if [ "$rr" -ne 0 ] ; then
      # First flow uses 10K requests, the remaining flows 1M.
      reqSize=1M
      if [ "$flow_cnt" -eq 1 ] ; then
        reqSize=10K
      fi
      if [ "$dir" == "-i" ] ; then
        replySize=$reqSize
        reqSize=1
      fi
      ( ./netperf -H "$np_server" -l "$dur" -f m -j -t TCP_RR \
          -- -r "$reqSize,$replySize" $np_cc \
          -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,REMOTE_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,LOCAL_RECV_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE \
          > "netperf.$id.$flow_cnt" ) &
    elif [ "$dir" == "-i" ] ; then
      ( ./netperf -H "$np_server" -l "$dur" -f m -j -t TCP_RR \
          -- -r 1,10M $np_cc \
          -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REMOTE_TRANSPORT_RETRANS,REMOTE_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE \
          > "netperf.$id.$flow_cnt" ) &
    else
      ( ./netperf -H "$np_server" -l "$dur" -f m -j -t TCP_STREAM \
          -- $np_cc \
          -k P50_LATENCY,P90_LATENCY,LOCAL_TRANSPORT_RETRANS,LOCAL_SEND_THROUGHPUT,REQUEST_SIZE,RESPONSE_SIZE \
          > "netperf.$id.$flow_cnt" ) &
    fi
    flow_cnt=$((flow_cnt + 1))
  done

  # sleep for duration of test (plus some buffer)
  n=$((dur + 2))
  sleep "$n"

  # force graceful termination of netperf
  pids=$(pgrep netperf)
  for p in $pids ; do
    kill -SIGALRM "$p"
  done

  flow_cnt=1
  rate=0
  if [ "$details" -ne 0 ] ; then
    print_hbm_details
  fi
  while [ "$flow_cnt" -le "$flows" ] ; do
    # Integer part of the per-flow throughput reported by netperf.
    if [ "$dir" == "-i" ] ; then
      r=$(grep -o "REMOTE_SEND_THROUGHPUT=[0-9]*" "netperf.$id.$flow_cnt" \
            | grep -o "[0-9]*")
    else
      r=$(grep -o "LOCAL_SEND_THROUGHPUT=[0-9]*" "netperf.$id.$flow_cnt" \
            | grep -o "[0-9]*")
    fi
    echo "rate for flow $flow_cnt: $r"
    rate=$((rate + r))
    if [ "$details" -ne 0 ] ; then
      echo "-----"
      echo "Details for cgroup $id, flow $flow_cnt"
      cat "netperf.$id.$flow_cnt"
    fi
    flow_cnt=$((flow_cnt + 1))
  done
  if [ "$details" -ne 0 ] ; then
    echo ""
    print_delay_and_goodput
  else
    echo "$rate"
  fi
elif [ "$multi_iperf" -eq 0 ] ; then
  # Single iperf3 client driving all flows against a one-shot server.
  (iperf3 -s -p "$port" -1 > /dev/null 2>&1) &
  sleep 0.1
  iperf3 -c "$host" -p "$port" -i 0 -P "$flows" -f m -t "$dur" > "iperf.$id"
  rates=$(grep receiver "iperf.$id" | grep -o "[0-9.]* Mbits" \
            | grep -o "^[0-9]*")
  # $rates is unquoted on purpose: joining the lines lets the last number
  # on the resulting single line be picked.
  rate=$(echo $rates | grep -o "[0-9]*$")

  if [ "$details" -ne 0 ] ; then
    print_hbm_details
    print_delay_and_goodput
  else
    echo "$rate"
  fi
else
  # One iperf3 server/client pair per flow, each on its own port.
  flow_cnt=1
  while [ "$flow_cnt" -le "$flows" ] ; do
    (iperf3 -s -p "$port" -1 > /dev/null 2>&1) &
    ( iperf3 -c "$host" -p "$port" -i 0 -P 1 -f m -t "$dur" \
        | grep receiver | grep -o "[0-9.]* Mbits" | grep -o "^[0-9]*" \
        | grep -o "[0-9]*$" > "iperf3.$id.$flow_cnt" ) &
    port=$((port + 1))
    flow_cnt=$((flow_cnt + 1))
  done
  n=$((dur + 1))
  sleep "$n"

  flow_cnt=1
  rate=0
  if [ "$details" -ne 0 ] ; then
    print_hbm_details
  fi

  while [ "$flow_cnt" -le "$flows" ] ; do
    r=$(cat "iperf3.$id.$flow_cnt")
    if [ "$details" -ne 0 ] ; then
      echo "Rate for cgroup $id, flow $flow_cnt LOCAL_SEND_THROUGHPUT=$r"
    fi
    rate=$((rate + r))
    flow_cnt=$((flow_cnt + 1))
  done
  if [ "$details" -ne 0 ] ; then
    print_delay_and_goodput
  else
    echo "$rate"
  fi
fi
403
# ---- Restore system settings and clean up ------------------------------
if [ "$use_netperf" -eq 0 ] ; then
  # Put back the congestion control that was saved before the test.
  sysctl -w -q -n net.ipv4.tcp_congestion_control="$cur_cc"
fi
if [ "$ecn" -ne 0 ] ; then
  # NOTE(review): this writes 0 unconditionally; the pre-test value of
  # tcp_ecn is not known at this point.
  sysctl -w -q -n net.ipv4.tcp_ecn=0
fi
# Remove only the qdisc this script installed: netem on lo when a delay
# was requested, otherwise the -q qdisc on eth0. With -d the -q option is
# ignored, so eth0 must be left untouched in that case.
if [ "$netem" -ne "0" ] ; then
  tc qdisc del dev lo root > /dev/null 2>&1
elif [ "$qdisc" != "" ] ; then
  tc qdisc del dev eth0 root > /dev/null 2>&1
fi
sleep 2

# Stop our hbm instance if it is still running. The PID is checked
# against its command name rather than against a `ps | grep` listing, so
# this also works while other hbm instances are running on the host.
if [ -n "$hbm_pid" ] \
   && [ "$(ps -o comm= -p "$hbm_pid" 2> /dev/null)" == "hbm" ] ; then
  kill "$hbm_pid"
fi

sleep 1

# Detach any pinned BPF programs that may have lingered
# (${BPFFS:?} aborts instead of expanding to /hbm* if BPFFS is empty).
rm -rf "${BPFFS:?}"/hbm*

# Kill netserver only if none was running before the test and the test
# ran against the local host.
if [ "$use_netperf" -ne 0 ] && [ "$server" == "" ] \
   && [ "$begNetserverPid" == "" ] ; then
  netserverPid=$(pgrep netserver)
  if [ "$netserverPid" != "" ] ; then
    # Unquoted on purpose: may hold more than one PID.
    kill $netserverPid
  fi
fi
exit
439