dev-hsa: enable interruptible hsa signal support
[gem5.git] / configs / example / apu_se.py
# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import optparse, os, re, getpass
import math
import glob
import inspect
import sys    # needed for sys.exit() at the end of the script

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig
########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value
    return getattr(parser.values, opt.dest)

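# These helpers read or override parsed option values by flag name; e.g.,
# setOption(parser, "--access-backing-store") is used later in this script
# to force on the Ruby backing store.
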
# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="CPU-only mode. Used to work around problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")

# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC (icache, and thus "
                       "icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and '
                       'options.cu-per-scalar-cache')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
parser.add_option("--wf-size", type="int", default=64,
                  help="Wavefront size (in work items)")
parser.add_option("--sp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Single Precision ops")
parser.add_option("--dp-bypass-path-length", type="int", default=4,
                  help="Number of stages of bypass path in vector ALU for "
                       "Double Precision ops")
# issue period per SIMD unit: number of cycles before issuing another vector
parser.add_option("--issue-period", type="int", default=4,
                  help="Number of cycles per vector instruction issue period")
parser.add_option("--glbmem-wr-bus-width", type="int", default=32,
                  help="VGPR to Coalescer (Global Memory) data bus width "
                       "in bytes")
parser.add_option("--glbmem-rd-bus-width", type="int", default=32,
                  help="Coalescer to VGPR (Global Memory) data bus width "
                       "in bytes")
# Currently we only support 1 local memory pipe
parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Shared Memory pipelines per CU")
# Currently we only support 1 global memory pipe
parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1,
                  help="Number of Global Memory pipelines per CU")
parser.add_option("--wfs-per-simd", type="int", default=10,
                  help="Number of WF slots per SIMD")

parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical scalar registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option("--lds-size", type="int", default=65536,
                  help="Size of the LDS in bytes")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help='fast forward using kvm until the m5_switchcpu'
                       ' pseudo-op is encountered, then switch cpus. subsequent'
                       ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")
parser.add_option("--reg-alloc-policy", type="string", default="simple",
                  help="register allocation policy (simple/dynamic)")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the GPU model requires Ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the GPU model requires a timing or detailed CPU
if not (options.cpu_type == "TimingSimpleCPU" or
        options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the SQC (the GPU I-cache) is shared by multiple compute units
# (CUs). The protocol works just fine even if the SQC is not shared.
# Overriding this option here so that the user need not set it explicitly
# (assuming a shared SQC is the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
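# e.g., with the defaults (4 CUs, 4 CUs per SQC, 4 CUs per scalar cache):
# num_sqc = ceil(4 / 4) = 1 and num_scalar_cache = ceil(4 / 4) = 1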

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.gpu_clock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# The VIPER GPU protocol implements release consistency on the GPU side, so
# GPU writes must be made visible to global memory, and reads must come from
# global memory, at kernel boundaries. The pipeline initiates (or does not
# initiate) an acquire/release operation depending on the
# impl_kern_launch_acq and impl_kern_end_rel flags: a value of True means the
# pipeline initiates the acquire/release at kernel launch/end. The VIPER
# protocol is write-through based, and thus only impl_kern_launch_acq needs
# to be set.
if (buildEnv['PROTOCOL'] == 'GPU_VIPER'):
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wf_size = options.wf_size,
                                     spbypass_pipe_length =
                                         options.sp_bypass_path_length,
                                     dpbypass_pipe_length =
                                         options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width =
                                         options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width =
                                         options.glbmem_wr_bus_width,
                                     num_global_mem_pipes =
                                         options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes =
                                         options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier =
                                         options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = LdsState(
                                         banks = options.numLdsBanks,
                                         bankConflictPenalty =
                                             options.ldsBankConflictPenalty,
                                         size = options.lds_size)))
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))

        if options.reg_alloc_policy == "simple":
            vrf_pool_mgrs.append(
                SimplePoolManager(pool_size = options.vreg_file_size,
                                  min_alloc = options.vreg_min_alloc))
            srf_pool_mgrs.append(
                SimplePoolManager(pool_size = options.sreg_file_size,
                                  min_alloc = options.sreg_min_alloc))
        elif options.reg_alloc_policy == "dynamic":
            vrf_pool_mgrs.append(
                DynPoolManager(pool_size = options.vreg_file_size,
                               min_alloc = options.vreg_min_alloc))
            srf_pool_mgrs.append(
                DynPoolManager(pool_size = options.sreg_file_size,
                               min_alloc = options.sreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort

# Attach compute units to GPU
shader.CUs = compute_units
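# With the default options, the GPU comprises 4 CUs, each with 4 SIMD units,
# 10 wavefront slots per SIMD (40 Wavefront objects per CU), and one vector
# and one scalar register file (plus pool manager) per SIMD.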

########################## Creating the CPU system ########################
# The shader core will be whatever is after the CPU cores are accounted for
shader_idx = options.num_cpus

# The command processor will be whatever is after the shader is accounted for
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is
# specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

host_cpu = cpu_list[0]

hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
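# The HSAPP PIO aperture sits immediately above the top of physical memory
# (its paddr equals mem_size); hsapp_gpu_map_vaddr/size describe where the
# emulated driver is expected to map it in the process' address space.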

# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename="kfd")

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

# Create and assign the workload. Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test.
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)

if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               os.getenv('ROCM_PATH','/opt/rocm')+'/lib',
               os.getenv('HCC_HOME','/opt/rocm/hcc')+'/lib',
               os.getenv('HSA_PATH','/opt/rocm/hsa')+'/lib',
               os.getenv('HIP_PATH','/opt/rocm/hip')+'/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/libhsakmt/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopen/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopengemm/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/hipblas/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/rocblas/lib',
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME','/'),
           "HSA_ENABLE_INTERRUPT=1"]

process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode,
                workload = SEWorkload.init_compatible(executable))
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

for i, dma_device in enumerate(dma_list):
    getattr(system, 'dma_cntrl%d' % i).clk_domain = system.ruby.clk_domain

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].mmu.connectWalkerPorts(
                ruby_port.slave, ruby_port.slave)

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
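# Example with the defaults (4 CUs, 1 SQC, 1 scalar cache, no CPs): the GPU
# sequencers are the last 4 + 1 + 1 = 6 entries of _cpu_ports, so
# gpu_port_idx now points at the first CU's sequencer.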

# Connect token ports. For this we need to search through the list of all
# sequencers, since the TCP coalescers will not necessarily be first. Only
# TCP coalescers use a token port for back pressure.
token_port_idx = 0
for i in range(len(system.ruby._cpu_ports)):
    if isinstance(system.ruby._cpu_ports[i], VIPERCoalescer):
        system.cpu[shader_idx].CUs[token_port_idx].gmTokenPort = \
            system.ruby._cpu_ports[i].gmTokenPort
        token_port_idx += 1

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1
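# Each CP consumes two sequencer ports (one for its data cache, one for its
# instruction cache), which is why the CU port index computation above
# subtracts num_cp * 2.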

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify it of a pending task using this
# interface, and the GPU uses the same interface to notify the CPU of task
# completion. The communication happens through the emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu

########################## Start simulation ########################

redirect_paths = [RedirectPath(app_path = "/proc",
                               host_paths =
                               ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(app_path = "/sys",
                               host_paths =
                               ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(app_path = "/tmp",
                               host_paths =
                               ["%s/fs/tmp" % m5.options.outdir])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

hsaTopology.createHsaTopology(options)

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
        options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" %
          cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

sys.exit(exit_event.getCode())