From 64ca31976fe91eedd91b2d703c6e3e62328f8e1d Mon Sep 17 00:00:00 2001 From: Gabor Dozsa Date: Thu, 7 Jan 2016 16:33:47 -0600 Subject: [PATCH] config: Updates for distributed gem5 simulations --- configs/common/FSConfig.py | 36 ++ configs/common/Options.py | 31 ++ configs/example/fs.py | 12 + util/dist/gem5-dist.sh | 385 ++++++++++++++++++ .../test/simple_bootscript.rcS} | 73 ++-- util/dist/test/test-2nodes-AArch64.sh | 82 ++++ util/multi/gem5-multi.sh | 275 ------------- 7 files changed, 575 insertions(+), 319 deletions(-) create mode 100755 util/dist/gem5-dist.sh rename util/{multi/bootscript.rcS => dist/test/simple_bootscript.rcS} (65%) create mode 100644 util/dist/test/test-2nodes-AArch64.sh delete mode 100755 util/multi/gem5-multi.sh diff --git a/configs/common/FSConfig.py b/configs/common/FSConfig.py index 0f63ec9e7..004d06d55 100644 --- a/configs/common/FSConfig.py +++ b/configs/common/FSConfig.py @@ -654,3 +654,39 @@ def makeDualRoot(full_system, testSystem, driveSystem, dumpfile): self.etherlink.dump = Parent.etherdump return self + + +def makeDistRoot(testSystem, + rank, + size, + server_name, + server_port, + sync_repeat, + sync_start, + linkspeed, + linkdelay, + dumpfile): + self = Root(full_system = True) + self.testsys = testSystem + + self.etherlink = DistEtherLink(speed = linkspeed, + delay = linkdelay, + dist_rank = rank, + dist_size = size, + server_name = server_name, + server_port = server_port, + sync_start = sync_start, + sync_repeat = sync_repeat) + + if hasattr(testSystem, 'realview'): + self.etherlink.int0 = Parent.testsys.realview.ethernet.interface + elif hasattr(testSystem, 'tsunami'): + self.etherlink.int0 = Parent.testsys.tsunami.ethernet.interface + else: + fatal("Don't know how to connect DistEtherLink to this system") + + if dumpfile: + self.etherdump = EtherDump(file=dumpfile) + self.etherlink.dump = Parent.etherdump + + return self diff --git a/configs/common/Options.py b/configs/common/Options.py index 45be8e2f8..d5671f311 100644 
--- a/configs/common/Options.py +++ b/configs/common/Options.py @@ -297,10 +297,41 @@ def addFSOptions(parser): # Benchmark options parser.add_option("--dual", action="store_true", help="Simulate two systems attached with an ethernet link") + parser.add_option("--dist", action="store_true", + help="Parallel distributed gem5 simulation.") + parser.add_option("--is-switch", action="store_true", + help="Select the network switch simulator process for a"\ + "distributed gem5 run") + parser.add_option("--dist-rank", default=0, action="store", type="int", + help="Rank of this system within the dist gem5 run.") + parser.add_option("--dist-size", default=0, action="store", type="int", + help="Number of gem5 processes within the dist gem5 run.") + parser.add_option("--dist-server-name", + default="127.0.0.1", + action="store", type="string", + help="Name of the message server host\nDEFAULT: localhost") + parser.add_option("--dist-server-port", + default=2200, + action="store", type="int", + help="Message server listen port\nDEFAULT: 2200") + parser.add_option("--dist-sync-repeat", + default="0us", + action="store", type="string", + help="Repeat interval for synchronisation barriers among dist-gem5 processes\nDEFAULT: --ethernet-linkdelay") + parser.add_option("--dist-sync-start", + default="5200000000000t", + action="store", type="string", + help="Time to schedule the first dist synchronisation barrier\nDEFAULT:5200000000000t") parser.add_option("-b", "--benchmark", action="store", type="string", dest="benchmark", help="Specify the benchmark to run. 
Available benchmarks: %s"\ % DefinedBenchmarks) + parser.add_option("--ethernet-linkspeed", default="10Gbps", + action="store", type="string", + help="Link speed in bps\nDEFAULT: 10Gbps") + parser.add_option("--ethernet-linkdelay", default="10us", + action="store", type="string", + help="Link delay in seconds\nDEFAULT: 10us") # Metafile options parser.add_option("--etherdump", action="store", type="string", dest="etherdump", diff --git a/configs/example/fs.py b/configs/example/fs.py index dddb2ea3c..6ee969a6e 100644 --- a/configs/example/fs.py +++ b/configs/example/fs.py @@ -340,6 +340,18 @@ test_sys = build_test_system(np) if len(bm) == 2: drive_sys = build_drive_system(np) root = makeDualRoot(True, test_sys, drive_sys, options.etherdump) +elif len(bm) == 1 and options.dist: + # This system is part of a dist-gem5 simulation + root = makeDistRoot(test_sys, + options.dist_rank, + options.dist_size, + options.dist_server_name, + options.dist_server_port, + options.dist_sync_repeat, + options.dist_sync_start, + options.ethernet_linkspeed, + options.ethernet_linkdelay, + options.etherdump); elif len(bm) == 1: root = Root(full_system=True, system=test_sys) else: diff --git a/util/dist/gem5-dist.sh b/util/dist/gem5-dist.sh new file mode 100755 index 000000000..8fa799acc --- /dev/null +++ b/util/dist/gem5-dist.sh @@ -0,0 +1,385 @@ +#! /bin/bash + +# +# Copyright (c) 2015 ARM Limited +# All rights reserved +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. 
+# +# Copyright (c) 2015 University of Illinois Urbana Champaign +# All rights reserved +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# +# Authors: Gabor Dozsa +# Mohammad Alian + + +# This is a wrapper script to run dist-gem5 simulations. +# See the usage_func() below for hints on how to use it. Also, +# there are some examples in the util/dist/test directory (e.g. +# see util/dist/test/test-2nodes-AArch64.sh) +# +# +# Allocated hosts/cores are assumed to be listed in the LSB_MCPU_HOSTS +# environment variable (which is what LSF does by default). +# E.g. 
LSB_MCPU_HOSTS=\"hname1 2 hname2 4\" means we have altogether 6 slots +# allocated to launch the gem5 processes, 2 of them are on host hname1 +# and 4 of them are on host hname2. +# If LSB_MCPU_HOSTS environment variable is not defined then we launch all +# processes on the localhost. +# +# Each gem5 process is passed a unique rank ID [0..N-1] (--dist-rank) and +# the total number of gem5 processes (--dist-size). The boot script can read +# these values via 'm5 initparam' to configure the MAC/IP addresses - among +# other things (see util/dist/test/simple_bootscript.rcS). +# +# Each gem5 process will create an m5out.$GEM5_RANK directory for +# the usual output files. Furthermore, there will be a separate log file +# for each ssh session (we use ssh to start gem5 processes) and one for +# the server. These are called log.$GEM5_RANK and log.switch. +# + + +# print help (option names must match the 'case' patterns in the parser below) +usage_func () +{ + echo "Usage:$0 [-debug] [-n nnodes] [-r rundir] [-c ckptdir] [-p port] [-s switch] [--sw-args sw_args] [-f fullsystem] [--fs-args fs_args] [--cf-args conf_args] [--m5-args m5_args] -x gem5_exe " + echo " -debug : debug mode (start gem5 in gdb)" + echo " nnodes : number of gem5 processes" + echo " rundir : run simulation under this path. If not specified, current dir will be used" + echo " ckptdir : dump/restore checkpoints to/from this path. If not specified, current dir will be used" + + echo " fullsystem: fullsystem config file" + echo " fs_args : fullsystem config specific argument list: arg1 arg2 ..." + echo " port : switch listen port" + echo " switch : switch config file" + echo " sw_args : switch config specific argument list: arg1 arg2 ..." + echo " conf_args : common (for both fullsystem and switch) config argument list: arg1 arg2 ..." + echo " gem5_exe : gem5 executable (full path required)" + echo " m5_args : common m5 argument list (e.g. debug flags): arg1 arg2 ..." + echo "Note: if no LSF slots allocation is found all processes are launched on the localhost." 
+} + +# Process (optional) command line options +FS_ARGS=" " +SW_ARGS=" " +CF_ARGS=" " +M5_ARGS=" " +while (($# > 0)) +do + case "x$1" in + x-debug) + GEM5_DEBUG="-debug" + shift 1 + ;; + x-n|x-nodes) + NNODES=$2 + shift 2 + ;; + x-r|x-rundir) + RUN_DIR=$2 + shift 2 + ;; + x-c|x-ckptdir) + CKPT_DIR=$2 + shift 2 + ;; + x-p|x-port) + SW_PORT=$2 + shift 2 + ;; + x-s|x-switch) + SW_CONFIG=$2 + shift 2 + ;; + x--sw-args) + CUR_ARGS="SW_ARGS" + shift 1 + ;; + x-f|x-fullsystem) + FS_CONFIG=$2 + shift 2 + ;; + x--fs-args) + CUR_ARGS="FS_ARGS" + shift 1 + ;; + x--cf-args) + CUR_ARGS="CF_ARGS" + shift 1 + ;; + x--m5-args) + CUR_ARGS="M5_ARGS" + shift 1 + ;; + x-x) + GEM5_EXE=$2 + shift 2 + ;; + x-*) + [ -n "$CUR_ARGS" ] || { echo "Unexpected arg: $1"; usage_func; exit -1; } + case "x$2" in + x-*|x) + eval $CUR_ARGS=\"${!CUR_ARGS} $1\" + shift 1 + ;; + *) + eval $CUR_ARGS=\"${!CUR_ARGS} $1 $2\" + shift 2 + ;; + esac + ;; + *) + echo "Unknown arg: $1" + usage_func + exit 1 + ;; + esac +done + +# Default values to use (in case they are not defined as command line options) +DEFAULT_FS_CONFIG=$M5_PATH/configs/example/fs.py +DEFAULT_SW_CONFIG=$M5_PATH/configs/example/sw.py +DEFAULT_SW_PORT=2200 + +[ -z "$FS_CONFIG" ] && FS_CONFIG=$DEFAULT_FS_CONFIG +[ -z "$SW_CONFIG" ] && SW_CONFIG=$DEFAULT_SW_CONFIG +[ -z "$SW_PORT" ] && SW_PORT=$DEFAULT_SW_PORT +[ -z "$NNODES" ] && NNODES=2 +[ -z "$RUN_DIR" ] && RUN_DIR=$(pwd) +[ -z "$CKPT_DIR" ] && CKPT_DIR=$(pwd) + +# Check if all the executables we need exist +[ -f "$FS_CONFIG" ] || { echo "FS config ${FS_CONFIG} not found"; exit 1; } +[ -f "$SW_CONFIG" ] || { echo "Switch config ${SW_CONFIG} not found"; exit 1; } +[ -x "$GEM5_EXE" ] || { echo "Executable ${GEM5_EXE} not found"; exit 1; } +# make sure that RUN_DIR exists +mkdir -p $RUN_DIR > /dev/null 2>&1 + +declare -a SSH_PIDS +declare -a HOSTS +declare -a NCORES + +# Find out which cluster hosts/slots are allocated or +# use localhost if there is no LSF allocation. 
+# We assume that allocated slots are listed in the LSB_MCPU_HOSTS +# environment variable in the form: +# host1 nslots1 host2 nslots2 ... +# (This is what LSF does by default.) +NH=0 +[ "x$LSB_MCPU_HOSTS" != "x" ] || LSB_MCPU_HOSTS="127.0.0.1 $NNODES" +host="" +for hc in $LSB_MCPU_HOSTS +do + if [ "x$host" == "x" ] + then + host=$hc + HOSTS+=($hc) + else + NCORES+=($hc) + ((NH+=hc)) + host="" + fi +done +((NNODES==NH)) || { echo "(E) Number of cluster slots ($NH) and gem5 instances ($NNODES) differ"; exit -1; } + +# function to clean up and abort if something goes wrong +abort_func () +{ + echo + echo "KILLED $(date)" + # Try to Kill the server first. That should trigger an exit for all connected + # gem5 processes. + [ "x$SW_PID" != "x" ] && kill $SW_PID 2>/dev/null + sleep 20 + # (try to) kill gem5 processes - just in case something went wrong with the + # server triggered exit + bname=$(basename $GEM5_EXE) + killall -q -s SIGKILL $bname + for h in ${HOSTS[@]} + do + ssh $h killall -q -s SIGKILL $bname + done + sleep 5 + # kill the watchdog + [ "x$WATCHDOG_PID" != "x" ] && kill $WATCHDOG_PID 2>/dev/null + exit -1 +} + +# We need a watchdog to trigger full clean up if a gem5 process dies +watchdog_func () +{ + while true + do + sleep 30 + ((NDEAD=0)) + for p in ${SSH_PIDS[*]} + do + kill -0 $p 2>/dev/null || ((NDEAD+=1)) + done + kill -0 $SW_PID || ((NDEAD+=1)) + if ((NDEAD>0)) + then + # we may be in the middle of an orderly termination, + # give it some time to complete before reporting abort + sleep 60 + echo -n "(I) (some) gem5 process(es) exited" + abort_func + fi + done +} + +# This function launches the gem5 processes. The only purpose is to enable +# launching gem5 processes under gdb control for debugging +start_func () +{ + local N=$1 + local HOST=$2 + local ENV_ARGS=$3 + shift 3 + if [ "x$GEM5_DEBUG" != "x" ] + then + echo "DEBUG starting terminal..." 
+ MY_ARGS="$@" + xterm -e "gdb --args $MY_ARGS" & + else + ssh $HOST $ENV_ARGS "$@" &> $RUN_DIR/log.$N & + fi +} + +# block till the gem5 process starts +connected () +{ + FILE=$1 + STRING=$2 + echo -n "waiting for $3 to start " + while : ; + do + kill -0 $4 || { echo "Failed to start $3"; exit -1; } + [[ -f "$FILE" ]] && \ + grep -q "$STRING" "$FILE" && \ + echo -e "\nnode #$3 started" && \ + break + + sleep 2 + echo -n "." + done +} + +# Trigger full clean up in case we are being killed by external signal +trap 'abort_func' INT TERM + +# env args to be passed explicitly to gem5 processes started via ssh +ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH" + +#cleanup log files before starting gem5 processes +rm $RUN_DIR/log.switch > /dev/null 2>&1 + +# make sure that CKPT_DIR exists +mkdir -p $CKPT_DIR/m5out.switch > /dev/null 2>&1 +# launch switch gem5 +SW_HOST=${HOSTS[0]} +echo "launch switch gem5 process on $SW_HOST ..." +start_func "switch" $SW_HOST "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.switch \ + $M5_ARGS \ + $SW_CONFIG \ + $SW_ARGS \ + $CF_ARGS \ + --checkpoint-dir=$CKPT_DIR/m5out.switch \ + --is-switch \ + --dist-size=$NNODES \ + --dist-server-port=$SW_PORT +SW_PID=$! + +# block here till switch process starts +connected $RUN_DIR/log.switch "tcp_iface listening on port" "switch" $SW_PID +LINE=$(grep -r "tcp_iface listening on port" $RUN_DIR/log.switch) + +IFS=' ' read -ra ADDR <<< "$LINE" +# actual port that switch is listening on may be different +# from what we specified if the port was busy +SW_PORT=${ADDR[5]} + +# Now launch all the gem5 processes with ssh. +echo "START $(date)" +n=0 +for ((i=0; i < ${#HOSTS[@]}; i++)) +do + h=${HOSTS[$i]} + for ((j=0; j < ${NCORES[i]}; j++)) + do + #cleanup log files before starting gem5 processes + rm $RUN_DIR/log.$n > /dev/null 2>&1 + # make sure that CKPT_DIR exists + mkdir -p $CKPT_DIR/m5out.$n > /dev/null 2>&1 + echo "starting gem5 on $h ..." 
+ start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $RUN_DIR/m5out.$n \ + $M5_ARGS \ + $FS_CONFIG \ + $FS_ARGS \ + $CF_ARGS \ + --checkpoint-dir=$CKPT_DIR/m5out.$n \ + --dist \ + --dist-rank=$n \ + --dist-size=$NNODES \ + --dist-server-name=${HOSTS[0]} \ + --dist-server-port=$SW_PORT + SSH_PIDS[$n]=$! + ((n+=1)) + done +done + +# Wait here if it is a debug session +[ "x$GEM5_DEBUG" == "x" ] || { echo "DEBUG session"; wait $SW_PID; exit -1; } + +# start watchdog to trigger complete abort (after a grace period) if any +# gem5 process dies +watchdog_func & +WATCHDOG_PID=$! + +# wait for exit statuses +((NFAIL=0)) +for p in ${SSH_PIDS[*]} +do + wait $p || ((NFAIL+=1)) +done +wait $SW_PID || ((NFAIL+=1)) + +# all done, let's terminate the watchdog +kill $WATCHDOG_PID 2>/dev/null + +if ((NFAIL==0)) +then + echo "EXIT $(date)" +else + echo "ABORT $(date)" +fi diff --git a/util/multi/bootscript.rcS b/util/dist/test/simple_bootscript.rcS similarity index 65% rename from util/multi/bootscript.rcS rename to util/dist/test/simple_bootscript.rcS index 95736f4b7..7c9b75538 100644 --- a/util/multi/bootscript.rcS +++ b/util/dist/test/simple_bootscript.rcS @@ -40,33 +40,28 @@ # Authors: Gabor Dozsa # # -# This is an example boot script to use for muti gem5 runs. The important -# task here is to extract the rank and size information from the kernel -# boot args and use those to configure MAC/IP addresses and hostname. -# Then we can kick off our (parallel) workload ... +# This is an example boot script to use for dist-gem5 runs. The important +# task here is to extract the rank and size information through the m5 +# initparam utility and use those to configure MAC/IP addresses and hostname. # -# You are expected to costumize this scipt for your needs (e.g. change +# You are expected to customize this scipt for your needs (e.g. change # the command at the end of the scipt to run your tests/workloads. 
source /root/.bashrc echo "bootscript.rcS is running" -m='GEM5\_RANK=([0-9]+) GEM5\_SIZE=([0-9]+)' -if [[ $(cat /proc/cmdline) =~ $m ]] -then - MY_RANK=${BASH_REMATCH[1]} - MY_SIZE=${BASH_REMATCH[2]} -else - echo "(E) GEM5_RANK/GEM5_SIZE was not defined in bootargs, exiting ..." - /sbin/m5 abort -fi +# Retrieve dist-gem5 rank and size parameters using the 'm5' utility +MY_RANK=$(/sbin/m5 initparam dist-rank) +[ $? = 0 ] || { echo "m5 initparam failed"; exit -1; } +MY_SIZE=$(/sbin/m5 initparam dist-size) +[ $? = 0 ] || { echo "m5 initparam failed"; exit -1; } /bin/hostname node${MY_RANK} # Keep MAC address assignment simple for now ... -(($MY_RANK>97)) && { echo "(E) Rank must be less than 98"; /sbin/m5 abort; } -((MY_ADDR=MY_RANK+2)) -if (($MY_ADDR<10)) +(($MY_RANK > 97)) && { echo "(E) Rank must be less than 98"; /sbin/m5 abort; } +((MY_ADDR = MY_RANK + 2)) +if (($MY_ADDR < 10)) then MY_ADDR_PADDED=0${MY_ADDR} else @@ -78,45 +73,35 @@ fi /sbin/ifconfig -a -# Prepare host lists for mpirun -MY_MPI_HOSTS="192.168.0.2" -for ((i=1; i/dev/null - [ "x$WATCHDOG_PID" != "x" ] && kill $WATCHDOG_PID 2>/dev/null - exit -1 -} - - -# We need a watchdog to trigger full clean up if a gem5 process dies -watchdog_func () -{ - while true - do - sleep 30 - ((NDEAD=0)) - for p in ${SSH_PIDS[*]} - do - kill -0 $p 2>/dev/null || ((NDEAD+=1)) - done - kill -0 $SERVER_PID || ((NDEAD+=1)) - if ((NDEAD>0)) - then - # we may be in the middle of an orderly termination, - # give it some time to complete before reporting abort - sleep 60 - echo -n "(I) (some) gem5 process(es) exited" - abort_func - fi - done -} - -# This function launches the gem5 processes. 
We use it only to allow launching -# gem5 processes under gdb control (in the foreground) for debugging -start_func () -{ - local N=$1 - local HOST=$2 - local ENV_ARGS=$3 - shift 3 - if [ "x$GEM5_DEBUG" != "x" ] - then - gdb --args "$@" - else - ssh $HOST $ENV_ARGS "$@" &>log.$N & - fi -} - - -# Trigger full clean up in case we are being killed by external signal -trap 'abort_func' INT TERM - -# env args to be passed explicitly to gem5 processes started via ssh -ENV_ARGS="LD_LIBRARY_PATH=$LD_LIBRARY_PATH M5_PATH=$M5_PATH" - -# launch the mesage server and check if it has started okay -$TCP_SERVER $GEM5_DEBUG $NNODES $SERVER_PORT &>log.server & -SERVER_PID=$! -sleep 2 -kill -0 $SERVER_PID || { echo "Failed to start message server"; exit -1; } - -# Now launch all the gem5 processes with ssh. -echo "START $(date)" -n=0 -for ((i=0; i < ${#HOSTS[@]}; i++)) -do - h=${HOSTS[$i]} - for ((j=0; j < ${NCORES[i]}; j++)) - do - echo "starting gem5 on $h ..." - start_func $n $h "$ENV_ARGS" $GEM5_EXE -d $(pwd)/m5out.$n $GEM5_ARGS \ - --multi \ - --multi-rank=$n \ - --multi-server-name=${HOSTS[0]} \ - --multi-server-port=$SERVER_PORT \ - --testsys-toplevel-LinuxArmSystem.boot_osflags="\"GEM5_RANK=$n GEM5_SIZE=$NNODES\"" - SSH_PIDS[$n]=$! - ((n+=1)) - done -done - -[ "x$GEM5_DEBUG" == "x" ] || { kill $SERVER_PID; echo "DEBUG exit"; exit -1; } - -# start watchdog to trigger complete abort (after a grace period) if any -# gem5 process dies -watchdog_func & -WATCHDOG_PID=$! - -# wait for exit statuses -((NFAIL=0)) -for p in ${SSH_PIDS[*]} -do - wait $p || ((NFAIL+=1)) -done -wait $SERVER_PID || ((NFAIL+=1)) - -# all done, let's terminate the watchdog -kill $WATCHDOG_PID 2>/dev/null - -if ((NFAIL==0)) -then - echo "EXIT $(date)" -else - echo "ABORT $(date)" -fi -- 2.30.2