util: Add a README file for the m5 utility.
[gem5.git] / util / dist / gem5-dist.sh
1 #! /bin/bash
2
3 #
4 # Copyright (c) 2015 ARM Limited
5 # All rights reserved
6 #
7 # The license below extends only to copyright in the software and shall
8 # not be construed as granting a license to any other intellectual
9 # property including but not limited to intellectual property relating
10 # to a hardware implementation of the functionality of the software
11 # licensed hereunder. You may use the software subject to the license
12 # terms below provided that you ensure that this notice is replicated
13 # unmodified and in its entirety in all distributions of the software,
14 # modified or unmodified, in source code or in binary form.
15 #
16 # Copyright (c) 2015 University of Illinois Urbana Champaign
17 # All rights reserved
18 #
19 # Redistribution and use in source and binary forms, with or without
20 # modification, are permitted provided that the following conditions are
21 # met: redistributions of source code must retain the above copyright
22 # notice, this list of conditions and the following disclaimer;
23 # redistributions in binary form must reproduce the above copyright
24 # notice, this list of conditions and the following disclaimer in the
25 # documentation and/or other materials provided with the distribution;
26 # neither the name of the copyright holders nor the names of its
27 # contributors may be used to endorse or promote products derived from
28 # this software without specific prior written permission.
29 #
30 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
31 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
32 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
33 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
34 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
35 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
36 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
37 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
38 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
39 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
40 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
41
42
43 # This is a wrapper script to run dist-gem5 simulations.
44 # See the usage_func() below for hints on how to use it. Also,
45 # there are some examples in the util/dist directory (e.g.
46 # see util/dist/test-2nodes-AArch64.sh)
47 #
48 #
49 # Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS
50 # environment variable (which is what LSF does by default).
51 # E.g. LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots
52 # allocated to launch the gem5 processes, 2 of them are on host hname1
53 # and 4 of them are on host hname2.
54 # If LSB_MCPU_HOSTS environment variable is not defined then we launch all
55 # processes on the localhost.
56 #
57 # Each gem5 process is passed a unique rank ID [0..N-1] via the kernel
58 # boot params. The total number of gem5 processes is also passed in.
59 # These values can be used in the boot script to configure the MAC/IP
60 # addresses - among other things (see util/dist/bootscript.rcS).
61 #
62 # Each gem5 process will create an m5out.$GEM5_RANK directory for
63 # the usual output files. Furthermore, there will be a separate log file
64 # for each ssh session (we use ssh to start gem5 processes) and one for
65 # the server. These are called log.$GEM5_RANK and log.switch.
66 #
67
68
# Print the command line help on stdout.
# Globals:   none
# Arguments: none
# The option names shown here are the ones the option parser of this script
# accepts (-s for the switch config and -f for the fullsystem config).
usage_func ()
{
    echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe "
    echo " -debug : debug mode (start gem5 in gdb)"
    echo " nnodes : number of gem5 processes"
    echo " rundir : run simulation under this path. If not specified, current dir will be used"
    echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used"

    echo " fullsystem: fullsystem config file"
    echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..."
    echo " port : switch listen port"
    echo " switch : switch config file"
    echo " sw_args : switch config specific argument list: arg1 arg2 ..."
    echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..."
    echo " gem5_exe : gem5 executable (full path required)"
    echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..."
    echo "Note: if no LSF slots allocation is found all processes are launched on the localhost."
}
88
# Process (optional) command line options

# Argument lists collected by the --*-args options. Each of them is a single
# string of space separated words.
FS_ARGS=" "
SW_ARGS=" "
CF_ARGS=" "
M5_ARGS=" "

# Abort with the usage text if an option that takes a value is the last word
# on the command line. Without this check "shift 2" would fail without
# shifting anything and the parsing loop would never terminate.
#   $1 : the option being processed
#   $2 : number of words left on the command line (including the option)
require_value_func ()
{
    (($2 >= 2)) || { echo "Missing value for option: $1"; usage_func; exit 1; }
}

# Parse the command line words given as arguments.
# Globals written: GEM5_DEBUG NNODES RUN_DIR CKPT_DIR SW_PORT SW_CONFIG
#                  FS_CONFIG GEM5_EXE CUR_ARGS FS_ARGS SW_ARGS CF_ARGS M5_ARGS
# Exits (after printing the usage) on an unknown or misplaced argument.
parse_cmdline_func ()
{
    while (($# > 0))
    do
        case "x$1" in
            x-debug)
                GEM5_DEBUG="-debug"
                shift 1
                ;;
            x-n|x-nodes)
                require_value_func "$1" "$#"
                NNODES=$2
                shift 2
                ;;
            x-r|x-rundir)
                require_value_func "$1" "$#"
                RUN_DIR=$2
                shift 2
                ;;
            x-c|x-ckptdir)
                require_value_func "$1" "$#"
                CKPT_DIR=$2
                shift 2
                ;;
            x-p|x-port)
                require_value_func "$1" "$#"
                SW_PORT=$2
                shift 2
                ;;
            x-s|x-switch)
                require_value_func "$1" "$#"
                SW_CONFIG=$2
                shift 2
                ;;
            x--sw-args)
                # the following unrecognised -options go to SW_ARGS
                CUR_ARGS="SW_ARGS"
                shift 1
                ;;
            x-f|x-fullsystem)
                require_value_func "$1" "$#"
                FS_CONFIG=$2
                shift 2
                ;;
            x--fs-args)
                CUR_ARGS="FS_ARGS"
                shift 1
                ;;
            x--cf-args)
                CUR_ARGS="CF_ARGS"
                shift 1
                ;;
            x--m5-args)
                CUR_ARGS="M5_ARGS"
                shift 1
                ;;
            x-x)
                require_value_func "$1" "$#"
                GEM5_EXE=$2
                shift 2
                ;;
            x-*)
                # An option we do not know: it belongs to the argument list
                # selected by the most recent --*-args option (if any).
                [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; }
                # printf -v assigns through the variable name held in
                # CUR_ARGS without re-evaluating the argument text.
                case "x$2" in
                    x-*|x)
                        # next word is another option (or there is none):
                        # this option has no value
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1"
                        shift 1
                        ;;
                    *)
                        # next word is the value of this option
                        printf -v "$CUR_ARGS" '%s' "${!CUR_ARGS} $1 $2"
                        shift 2
                        ;;
                esac
                ;;
            *)
                echo "Unknown arg: $1"
                usage_func
                exit 1
                ;;
        esac
    done
}

parse_cmdline_func "$@"
165
# Default values to use (in case they are not defined as command line options)
DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py
DEFAULT_SW_CONFIG=$M5_PATH/configs/dist/sw.py
DEFAULT_SW_PORT=2200

# ${VAR:=value} assigns the value only if VAR is unset or empty
: "${FS_CONFIG:=$DEFAULT_FS_CONFIG}"
: "${SW_CONFIG:=$DEFAULT_SW_CONFIG}"
: "${SW_PORT:=$DEFAULT_SW_PORT}"
: "${NNODES:=2}"
: "${RUN_DIR:=$(pwd)}"
: "${CKPT_DIR:=$(pwd)}"

# Check if all the executables we need exist
[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; }
[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; }
[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; }
# make sure that RUN_DIR exists; the per process log files are written there,
# so there is no point in going on if it cannot be created
mkdir -p "$RUN_DIR" > /dev/null 2>&1 \
    || { echo "Cannot create run directory ${RUN_DIR}"; exit 1; }
184
declare -a SSH_PIDS
declare -a HOSTS
declare -a NCORES

# Find out which cluster hosts/slots are allocated or
# use localhost if there is no LSF allocation.
# We assume that allocated slots are listed in the LSB_MCPU_HOSTS
# environment variable in the form:
# host1 nslots1 host2 nslots2 ...
# (This is what LSF does by default.)
#
# Globals read:    LSB_MCPU_HOSTS (set to "127.0.0.1 $NNODES" if empty), NNODES
# Globals written: HOSTS  - host names, in the order they are listed
#                  NCORES - slot count of the host at the same index in HOSTS
#                  NH     - sum of all slot counts
# Exits if the sum of the slot counts is not equal to NNODES.
discover_hosts_func ()
{
    local word
    local cur_host=""

    NH=0
    HOSTS=()
    NCORES=()
    [ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES"
    # The expansion is left unquoted on purpose: the list has to be split
    # into words, which alternate between a host name and its slot count.
    for word in $LSB_MCPU_HOSTS
    do
        if [ "x$cur_host" == "x" ]
        then
            cur_host=$word
            HOSTS+=("$word")
        else
            NCORES+=("$word")
            ((NH+=word))
            cur_host=""
        fi
    done
    ((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; }
}

discover_hosts_func
211
# function to clean up and abort if something goes wrong
# Globals read: SW_PID, GEM5_EXE, HOSTS, WATCHDOG_PID
# Does not return: it always terminates the calling shell with "exit -1".
abort_func ()
{
    local bname
    local h

    echo
    echo "KILLED $(date)"
    # Try to Kill the server first. That should trigger an exit for all connected
    # gem5 processes.
    [ "x$SW_PID" != "x" ] && kill "$SW_PID" 2>/dev/null
    sleep 20
    # (try to) kill gem5 processes - just in case something went wrong with the
    # server triggered exit
    bname=$(basename "$GEM5_EXE")
    killall -q -s SIGKILL "$bname"
    for h in "${HOSTS[@]}"
    do
        ssh "$h" killall -q -s SIGKILL "$bname"
    done
    sleep 5
    # kill the watchdog
    [ "x$WATCHDOG_PID" != "x" ] && kill "$WATCHDOG_PID" 2>/dev/null
    exit -1
}
234
# We need a watchdog to trigger full clean up if a gem5 process dies
# Every 30 seconds it probes (with "kill -0", which sends no signal) each PID
# in SSH_PIDS and the switch PID in SW_PID. If any of the probes fails it
# waits for another 60 seconds and then calls abort_func.
# Globals read: SSH_PIDS, SW_PID
watchdog_func ()
{
    local NDEAD
    local p

    while true
    do
        sleep 30
        NDEAD=0
        for p in "${SSH_PIDS[@]}"
        do
            kill -0 "$p" 2>/dev/null || ((NDEAD+=1))
        done
        # probe the switch the same (silent) way as the other processes
        kill -0 "$SW_PID" 2>/dev/null || ((NDEAD+=1))
        if ((NDEAD>0))
        then
            # we may be in the middle of an orderly termination,
            # give it some time to complete before reporting abort
            sleep 60
            echo -n "(I) (some) gem5 process(es) exited"
            abort_func
        fi
    done
}
257
# This function launches the gem5 processes. The only purpose is to enable
# launching gem5 processes under gdb control for debugging
#   $1   : tag of the process; the output goes to $RUN_DIR/log.$1
#   $2   : host to start the process on (via ssh)
#   $3   : environment settings to put in front of the remote command
#   $4.. : the command to run and its arguments
# Globals read: GEM5_DEBUG, RUN_DIR
# The process is started in the background, so the caller can pick up its PID
# from $! right after the call.
start_func ()
{
    local N=$1
    local HOST=$2
    local ENV_ARGS=$3
    local gdb_cmd
    shift 3
    if [ "x$GEM5_DEBUG" != "x" ]
    then
        echo "DEBUG starting terminal..."
        # debug mode: the command is run under gdb in a local xterm
        # (host and environment settings are not used in this case)
        gdb_cmd="gdb --args $*"
        xterm -e "$gdb_cmd" &
    else
        # NOTE(review): ssh is expected to join the words after the host name
        # with spaces to build the remote command line, so ENV_ARGS can be
        # passed as a single word.
        ssh "$HOST" "$ENV_ARGS" "$@" &> "$RUN_DIR/log.$N" &
    fi
}
275
# block till the gem5 process starts
#   $1 : log file of the process
#   $2 : pattern (as understood by grep) that shows up in the log file once
#        the process is up
#   $3 : name of the process (used in the messages only)
#   $4 : PID to watch while waiting
# Polls every 2 seconds. Returns once the pattern is found in the log file;
# exits the script if the watched PID is gone before that happens.
connected ()
{
    local log_file=$1
    local pattern=$2
    local label=$3
    local pid=$4

    echo -n "waiting for $label to start "
    while :
    do
        # "kill -0" sends no signal, it only checks the PID
        kill -0 "$pid" 2>/dev/null || { echo "Failed to start $label"; exit -1; }
        if [[ -f "$log_file" ]] && grep -q -- "$pattern" "$log_file"
        then
            echo -e "\nnode #$label started"
            break
        fi

        sleep 2
        echo -n "."
    done
}
294
# Trigger full clean up in case we are being killed by external signal
trap 'abort_func' INT TERM

# env args to be passed explicitly to gem5 processes started via ssh
ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH"

#cleanup log files before starting gem5 processes
rm -f "$RUN_DIR/log.switch" > /dev/null 2>&1

# make sure that the checkpoint directory of the switch exists
mkdir -p "$CKPT_DIR/m5out.switch" > /dev/null 2>&1
# launch switch gem5 (on the first allocated host)
SW_HOST=${HOSTS[0]}
echo "launch switch gem5 process on $SW_HOST ..."
# M5_ARGS, SW_ARGS and CF_ARGS are space separated argument lists: they are
# left unquoted on purpose so that each argument becomes a separate word.
# shellcheck disable=SC2086
start_func "switch" "$SW_HOST" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.switch" \
    $M5_ARGS \
    "$SW_CONFIG" \
    $SW_ARGS \
    $CF_ARGS \
    --checkpoint-dir="$CKPT_DIR/m5out.switch" \
    --is-switch \
    --dist-size="$NNODES" \
    --dist-server-port="$SW_PORT"
SW_PID=$!

# block here till switch process starts
connected "$RUN_DIR/log.switch" "tcp_iface listening on port" "switch" "$SW_PID"
# first log line that reports the listen port
LINE=$(grep -m 1 "tcp_iface listening on port" "$RUN_DIR/log.switch")

# actual port that switch is listening on may be different
# from what we specified if the port was busy
# The port is the number that follows the "listening on port" text.
SW_PORT_RE='tcp_iface listening on port[[:space:]]+([0-9]+)'
if [[ "$LINE" =~ $SW_PORT_RE ]]
then
    SW_PORT=${BASH_REMATCH[1]}
else
    echo "(W) Cannot find the switch port in $RUN_DIR/log.switch, using $SW_PORT"
fi
328
# Now launch all the gem5 processes with ssh.
# NCORES[i] processes are started on HOSTS[i]; the rank (n) is increased by
# one for each process started, and the PID of the background job started for
# rank n is saved in SSH_PIDS[n].
echo "START $(date)"
n=0
for ((i=0; i < ${#HOSTS[@]}; i++))
do
    h=${HOSTS[$i]}
    # NCORES[i] is referenced by name inside the arithmetic expression so
    # that a missing slot count counts as zero instead of a syntax error
    for ((j=0; j < NCORES[i]; j++))
    do
        #cleanup log files before starting gem5 processes
        rm -f "$RUN_DIR/log.$n" > /dev/null 2>&1
        # make sure that the checkpoint directory of this rank exists
        mkdir -p "$CKPT_DIR/m5out.$n" > /dev/null 2>&1
        echo "starting gem5 on $h ..."
        # M5_ARGS, FS_ARGS and CF_ARGS are space separated argument lists:
        # they are left unquoted on purpose so that each argument becomes a
        # separate word.
        # shellcheck disable=SC2086
        start_func "$n" "$h" "$ENV_ARGS" "$GEM5_EXE" -d "$RUN_DIR/m5out.$n" \
            $M5_ARGS \
            "$FS_CONFIG" \
            $FS_ARGS \
            $CF_ARGS \
            --checkpoint-dir="$CKPT_DIR/m5out.$n" \
            --dist \
            --dist-rank="$n" \
            --dist-size="$NNODES" \
            --dist-server-name="${HOSTS[0]}" \
            --dist-server-port="$SW_PORT"
        SSH_PIDS[$n]=$!
        ((n+=1))
    done
done
357
# In a debug session there is nothing to supervise: just wait for the switch
# job to finish and leave.
if [ "x$GEM5_DEBUG" != "x" ]
then
    echo "DEBUG session"
    wait "$SW_PID"
    exit -1
fi

# From here on a background watchdog keeps an eye on the gem5 processes; it
# triggers a complete abort (after a grace period) if any of them dies.
watchdog_func &
WATCHDOG_PID=$!

# Collect the exit statuses: first of the ssh sessions, then of the switch.
# NFAIL counts the ones that are not zero.
NFAIL=0
for p in "${SSH_PIDS[@]}"
do
    if ! wait "$p"
    then
        NFAIL=$((NFAIL + 1))
    fi
done
if ! wait "$SW_PID"
then
    NFAIL=$((NFAIL + 1))
fi

# everything has finished, the watchdog is not needed any more
kill "$WATCHDOG_PID" 2>/dev/null

# NOTE(review): only the message tells a failed run from a good one, the exit
# status of the script is zero in both cases - confirm this is intended.
case "$NFAIL" in
    0)
        echo "EXIT $(date)"
        ;;
    *)
        echo "ABORT $(date)"
        ;;
esac