# Copyright (c) 2015 Advanced Micro Devices, Inc.
# All rights reserved.
#
# For use for simulation and test purposes only
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from this
# software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

from __future__ import print_function
from __future__ import absolute_import

import optparse, os, re, getpass, sys
import math
import glob
import inspect

import m5
from m5.objects import *
from m5.util import addToPath

addToPath('../')

from ruby import Ruby

from common import Options
from common import Simulation
from common import GPUTLBOptions, GPUTLBConfig

import hsaTopology
from common import FileSystemConfig
########################## Script Options ########################
def setOption(parser, opt_str, value = 1):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # set the value; setattr works under both Python 2 and 3, whereas the
    # old exec-based assignment did not
    setattr(parser.values, opt.dest, value)

def getOption(parser, opt_str):
    # check to make sure the option actually exists
    if not parser.has_option(opt_str):
        raise Exception("cannot find %s in list of possible options" % opt_str)

    opt = parser.get_option(opt_str)
    # get the value; exec cannot bind a local that outlives it in Python 3,
    # so read the attribute directly
    return getattr(parser.values, opt.dest)
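
# Example usage of these helpers (option names taken from this script):
#   setOption(parser, "--access-backing-store")   # force a flag on
#   depth = getOption(parser, "--TLB-prefetch")   # read back a parsed value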

# Adding script options
parser = optparse.OptionParser()
Options.addCommonOptions(parser)
Options.addSEOptions(parser)

parser.add_option("--cpu-only-mode", action="store_true", default=False,
                  help="CPU-only mode. Used to work around problems in "
                       "Ruby.py while running APU protocols")
parser.add_option("-u", "--num-compute-units", type="int", default=4,
                  help="number of GPU compute units")
parser.add_option("--num-cp", type="int", default=0,
                  help="Number of GPU Command Processors (CP)")
parser.add_option("--benchmark-root", help="Root of benchmark directory tree")
# not super important now, but to avoid putting the number 4 everywhere, make
# it an option/knob
parser.add_option("--cu-per-sqc", type="int", default=4,
                  help="number of CUs sharing an SQC (icache, and thus "
                       "icache TLB)")
parser.add_option('--cu-per-scalar-cache', type='int', default=4,
                  help='Number of CUs sharing a scalar cache')
parser.add_option("--simds-per-cu", type="int", default=4,
                  help="SIMD units per CU")
parser.add_option('--cu-per-sa', type='int', default=4,
                  help='Number of CUs per shader array. This must be a '
                       'multiple of options.cu-per-sqc and '
                       'options.cu-per-scalar-cache')
parser.add_option('--sa-per-complex', type='int', default=1,
                  help='Number of shader arrays per complex')
parser.add_option('--num-gpu-complexes', type='int', default=1,
                  help='Number of GPU complexes')
104 parser.add_option("--wf-size", type="int", default=64,
105 help="Wavefront size(in workitems)")
106 parser.add_option("--sp-bypass-path-length", type="int", default=4, \
107 help="Number of stages of bypass path in vector ALU for "
108 "Single Precision ops")
109 parser.add_option("--dp-bypass-path-length", type="int", default=4, \
110 help="Number of stages of bypass path in vector ALU for "
111 "Double Precision ops")
112 # issue period per SIMD unit: number of cycles before issuing another vector
113 parser.add_option("--issue-period", type="int", default=4, \
114 help="Number of cycles per vector instruction issue period")
115 parser.add_option("--glbmem-wr-bus-width", type="int", default=32, \
116 help="VGPR to Coalescer (Global Memory) data bus width "
117 "in bytes")
118 parser.add_option("--glbmem-rd-bus-width", type="int", default=32, \
119 help="Coalescer to VGPR (Global Memory) data bus width in "
120 "bytes")
121 # Currently we only support 1 local memory pipe
122 parser.add_option("--shr-mem-pipes-per-cu", type="int", default=1, \
123 help="Number of Shared Memory pipelines per CU")
124 # Currently we only support 1 global memory pipe
125 parser.add_option("--glb-mem-pipes-per-cu", type="int", default=1, \
126 help="Number of Global Memory pipelines per CU")
127 parser.add_option("--wfs-per-simd", type="int", default=10, help="Number of " \
128 "WF slots per SIMD")

parser.add_option("--registerManagerPolicy", type="string", default="static",
                  help="Register manager policy")
parser.add_option("--vreg-file-size", type="int", default=2048,
                  help="number of physical vector registers per SIMD")
parser.add_option("--vreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the VRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--sreg-file-size", type="int", default=2048,
                  help="number of physical scalar registers per SIMD")
parser.add_option("--sreg-min-alloc", type="int", default=4,
                  help="Minimum number of registers that can be allocated "
                       "from the SRF. The total number of registers will be "
                       "aligned to this value.")

parser.add_option("--bw-scalor", type="int", default=0,
                  help="bandwidth scaling factor for scalability analysis")
parser.add_option("--CPUClock", type="string", default="2GHz",
                  help="CPU clock")
parser.add_option("--gpu-clock", type="string", default="1GHz",
                  help="GPU clock")
parser.add_option("--cpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""CPU voltage domain""")
parser.add_option("--gpu-voltage", action="store", type="string",
                  default='1.0V',
                  help="""GPU voltage domain""")
parser.add_option("--CUExecPolicy", type="string", default="OLDEST-FIRST",
                  help="WF exec policy (OLDEST-FIRST, ROUND-ROBIN)")
parser.add_option("--SegFaultDebug", action="store_true",
                  help="checks for GPU seg fault before TLB access")
parser.add_option("--FunctionalTLB", action="store_true",
                  help="Assumes TLB has no latency")
parser.add_option("--LocalMemBarrier", action="store_true",
                  help="Barrier does not wait for writethroughs to complete")
parser.add_option("--countPages", action="store_true",
                  help="Count Page Accesses and output in per-CU output files")
parser.add_option("--TLB-prefetch", type="int",
                  help="prefetch depth for TLBs")
parser.add_option("--pf-type", type="string",
                  help="type of prefetch: PF_CU, PF_WF, PF_PHASE, PF_STRIDE")
parser.add_option("--pf-stride", type="int", help="set prefetch stride")
parser.add_option("--numLdsBanks", type="int", default=32,
                  help="number of physical banks per LDS module")
parser.add_option("--ldsBankConflictPenalty", type="int", default=1,
                  help="number of cycles per LDS bank conflict")
parser.add_option('--fast-forward-pseudo-op', action='store_true',
                  help='fast forward using kvm until the m5_switchcpu'
                       ' pseudo-op is encountered, then switch cpus. subsequent'
                       ' m5_switchcpu pseudo-ops will toggle back and forth')
parser.add_option("--num-hw-queues", type="int", default=10,
                  help="number of hw queues in packet processor")

Ruby.define_options(parser)

# add TLB options to the parser
GPUTLBOptions.tlb_options(parser)

(options, args) = parser.parse_args()

# The GPU cache coherence protocols only work with the backing store
setOption(parser, "--access-backing-store")

# if benchmark root is specified explicitly, that overrides the search path
if options.benchmark_root:
    benchmark_path = [options.benchmark_root]
else:
    # Set default benchmark search path to current dir
    benchmark_path = ['.']

########################## Sanity Check ########################

# Currently the GPU model requires Ruby
if buildEnv['PROTOCOL'] == 'None':
    fatal("GPU model requires ruby")

# Currently the GPU model requires either the timing or detailed CPU
if not (options.cpu_type == "TimingSimpleCPU" or
        options.cpu_type == "DerivO3CPU"):
    fatal("GPU model requires TimingSimpleCPU or DerivO3CPU")

# This file can support multiple compute units
assert(options.num_compute_units >= 1)

# Currently, the sqc (I-Cache of GPU) is shared by multiple
# compute units (CUs). The protocol works just fine even if
# sqc is not shared. Overriding this option here so that the
# user need not set it explicitly (assuming a shared sqc is
# the common usage).
n_cu = options.num_compute_units
num_sqc = int(math.ceil(float(n_cu) / options.cu_per_sqc))
options.num_sqc = num_sqc # pass this to Ruby
num_scalar_cache = int(math.ceil(float(n_cu) / options.cu_per_scalar_cache))
options.num_scalar_cache = num_scalar_cache

print('Num SQC = ', num_sqc, 'Num scalar caches = ', num_scalar_cache,
      'Num CU = ', n_cu)
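
# For example (defaults assumed): 4 CUs with cu_per_sqc = 4 and
# cu_per_scalar_cache = 4 give ceil(4/4) = 1 SQC and 1 scalar cache;
# 5 CUs would give ceil(5/4) = 2 of each.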

########################## Creating the GPU system ########################
# shader is the GPU
shader = Shader(n_wf = options.wfs_per_simd,
                clk_domain = SrcClockDomain(
                    clock = options.gpu_clock,
                    voltage_domain = VoltageDomain(
                        voltage = options.gpu_voltage)))

# GPU_RfO (Read For Ownership) implements the SC/TSO memory model.
# Other GPU protocols implement release consistency at the GPU side.
# So, all GPU protocols other than GPU_RfO should make their writes
# visible to the global memory and should read from global memory
# at kernel boundaries. The pipeline initiates (or does not initiate)
# the acquire/release operation depending on the impl_kern_launch_acq
# and impl_kern_end_rel flags. A flag set to true means the pipeline
# initiates an acquire/release operation at kernel launch/end.
# VIPER protocols (GPU_VIPER, GPU_VIPER_Region and GPU_VIPER_Baseline)
# are write-through based, and thus only impl_kern_launch_acq needs to
# be set.
if buildEnv['PROTOCOL'] == 'GPU_RfO':
    shader.impl_kern_launch_acq = False
    shader.impl_kern_end_rel = False
elif (buildEnv['PROTOCOL'] == 'GPU_VIPER' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Region' or
      buildEnv['PROTOCOL'] == 'GPU_VIPER_Baseline'):
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = False
else:
    shader.impl_kern_launch_acq = True
    shader.impl_kern_end_rel = True
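
# Summary of the resulting flag settings:
#   GPU_RfO:                    launch acquire = False, end release = False
#   GPU_VIPER* (write-through): launch acquire = True,  end release = False
#   all other protocols:        launch acquire = True,  end release = True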

# Switching off per-lane TLB by default
per_lane = False
if options.TLB_config == "perLane":
    per_lane = True

# List of compute units; one GPU can have multiple compute units
compute_units = []
for i in range(n_cu):
    compute_units.append(ComputeUnit(cu_id = i, perLaneTLB = per_lane,
                                     num_SIMDs = options.simds_per_cu,
                                     wf_size = options.wf_size,
                                     spbypass_pipe_length = \
                                     options.sp_bypass_path_length,
                                     dpbypass_pipe_length = \
                                     options.dp_bypass_path_length,
                                     issue_period = options.issue_period,
                                     coalescer_to_vrf_bus_width = \
                                     options.glbmem_rd_bus_width,
                                     vrf_to_coalescer_bus_width = \
                                     options.glbmem_wr_bus_width,
                                     num_global_mem_pipes = \
                                     options.glb_mem_pipes_per_cu,
                                     num_shared_mem_pipes = \
                                     options.shr_mem_pipes_per_cu,
                                     n_wf = options.wfs_per_simd,
                                     execPolicy = options.CUExecPolicy,
                                     debugSegFault = options.SegFaultDebug,
                                     functionalTLB = options.FunctionalTLB,
                                     localMemBarrier = options.LocalMemBarrier,
                                     countPages = options.countPages,
                                     localDataStore = \
                                     LdsState(banks = options.numLdsBanks,
                                              bankConflictPenalty = \
                                              options.ldsBankConflictPenalty)))
    wavefronts = []
    vrfs = []
    vrf_pool_mgrs = []
    srfs = []
    srf_pool_mgrs = []
    for j in range(options.simds_per_cu):
        for k in range(shader.n_wf):
            wavefronts.append(Wavefront(simdId = j, wf_slot_id = k,
                                        wf_size = options.wf_size))
        vrf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.vreg_file_size,
                                               min_alloc = \
                                               options.vreg_min_alloc))

        vrfs.append(VectorRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.vreg_file_size))

        srf_pool_mgrs.append(SimplePoolManager(pool_size = \
                                               options.sreg_file_size,
                                               min_alloc = \
                                               options.sreg_min_alloc))
        srfs.append(ScalarRegisterFile(simd_id=j, wf_size=options.wf_size,
                                       num_regs=options.sreg_file_size))

    compute_units[-1].wavefronts = wavefronts
    compute_units[-1].vector_register_file = vrfs
    compute_units[-1].scalar_register_file = srfs
    compute_units[-1].register_manager = \
        RegisterManager(policy=options.registerManagerPolicy,
                        vrf_pool_managers=vrf_pool_mgrs,
                        srf_pool_managers=srf_pool_mgrs)
    if options.TLB_prefetch:
        compute_units[-1].prefetch_depth = options.TLB_prefetch
        compute_units[-1].prefetch_prev_type = options.pf_type

    # attach the LDS and the CU to the bus (actually a Bridge)
    compute_units[-1].ldsPort = compute_units[-1].ldsBus.slave
    compute_units[-1].ldsBus.master = compute_units[-1].localDataStore.cuPort
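    # (The Bridge gives LDS accesses a buffered hop with its own latency
    # rather than a direct CU-to-LdsState connection.)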

# Attach compute units to GPU
shader.CUs = compute_units

########################## Creating the CPU system ########################
# The shader core's index comes right after the CPU cores
shader_idx = options.num_cpus

# The command processors' indices come right after the shader
cp_idx = shader_idx + 1
cp_list = []

# List of CPUs
cpu_list = []

CpuClass, mem_mode = Simulation.getCPUClass(options.cpu_type)
if CpuClass == AtomicSimpleCPU:
    fatal("AtomicSimpleCPU is not supported")
if mem_mode != 'timing':
    fatal("Only the timing memory mode is supported")
shader.timing = True

if options.fast_forward and options.fast_forward_pseudo_op:
    fatal("Cannot fast-forward based both on the number of instructions and"
          " on pseudo-ops")
fast_forward = options.fast_forward or options.fast_forward_pseudo_op

if fast_forward:
    FutureCpuClass, future_mem_mode = CpuClass, mem_mode

    CpuClass = X86KvmCPU
    mem_mode = 'atomic_noncaching'
    # Leave shader.timing untouched, because its value only matters at the
    # start of the simulation and because we require switching cpus
    # *before* the first kernel launch.

    future_cpu_list = []

    # Initial CPUs to be used during fast-forwarding.
    for i in range(options.num_cpus):
        cpu = CpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
        cpu_list.append(cpu)

        if options.fast_forward:
            cpu.max_insts_any_thread = int(options.fast_forward)

if fast_forward:
    MainCpuClass = FutureCpuClass
else:
    MainCpuClass = CpuClass

# CPs to be used throughout the simulation.
for i in range(options.num_cp):
    cp = MainCpuClass(cpu_id = options.num_cpus + i,
                      clk_domain = SrcClockDomain(
                          clock = options.CPUClock,
                          voltage_domain = VoltageDomain(
                              voltage = options.cpu_voltage)))
    cp_list.append(cp)

# Main CPUs (to be used after fast-forwarding if fast-forwarding is specified).
for i in range(options.num_cpus):
    cpu = MainCpuClass(cpu_id = i,
                       clk_domain = SrcClockDomain(
                           clock = options.CPUClock,
                           voltage_domain = VoltageDomain(
                               voltage = options.cpu_voltage)))
    if fast_forward:
        cpu.switched_out = True
        future_cpu_list.append(cpu)
    else:
        cpu_list.append(cpu)

host_cpu = cpu_list[0]

hsapp_gpu_map_vaddr = 0x200000000
hsapp_gpu_map_size = 0x1000
hsapp_gpu_map_paddr = int(Addr(options.mem_size))
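# Note: int(Addr(options.mem_size)) places the 4KB HSAPP PIO aperture at the
# first physical address past the end of system memory, so it cannot overlap
# DRAM.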

# HSA kernel mode driver
gpu_driver = GPUComputeDriver(filename="kfd")
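# The filename makes the emulated driver reachable as /dev/kfd, the device
# file the ROCm user-level runtime expects to open.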

# Creating the GPU kernel launching components: that is the HSA
# packet processor (HSAPP), GPU command processor (CP), and the
# dispatcher.
gpu_hsapp = HSAPacketProcessor(pioAddr=hsapp_gpu_map_paddr,
                               numHWQueues=options.num_hw_queues)
dispatcher = GPUDispatcher()
gpu_cmd_proc = GPUCommandProcessor(hsapp=gpu_hsapp,
                                   dispatcher=dispatcher)
gpu_driver.device = gpu_cmd_proc
shader.dispatcher = dispatcher
shader.gpu_cmd_proc = gpu_cmd_proc

# Create and assign the workload. Check for rel_path in elements of
# base_list using test, returning the first full path that satisfies test.
def find_path(base_list, rel_path, test):
    for base in base_list:
        if not base:
            # base could be None if environment var not set
            continue
        full_path = os.path.join(base, rel_path)
        if test(full_path):
            return full_path
    fatal("%s not found in %s" % (rel_path, base_list))

def find_file(base_list, rel_path):
    return find_path(base_list, rel_path, os.path.isfile)

executable = find_path(benchmark_path, options.cmd, os.path.exists)
# It's common for a benchmark to be in a directory with the same
# name as the executable, so we handle that automatically
if os.path.isdir(executable):
    benchmark_path = [executable]
    executable = find_file(benchmark_path, options.cmd)
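# For example (hypothetical paths): with --benchmark-root=/bench and
# --cmd=square, if /bench/square is a directory, the executable resolves
# to /bench/square/square.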

if options.env:
    with open(options.env, 'r') as f:
        env = [line.rstrip() for line in f]
else:
    env = ['LD_LIBRARY_PATH=%s' % ':'.join([
               os.getenv('ROCM_PATH','/opt/rocm')+'/lib',
               os.getenv('HCC_HOME','/opt/rocm/hcc')+'/lib',
               os.getenv('HSA_PATH','/opt/rocm/hsa')+'/lib',
               os.getenv('HIP_PATH','/opt/rocm/hip')+'/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/libhsakmt/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopen/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/miopengemm/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/hipblas/lib',
               os.getenv('ROCM_PATH','/opt/rocm')+'/rocblas/lib',
               "/usr/lib/x86_64-linux-gnu"
           ]),
           'HOME=%s' % os.getenv('HOME','/'),
           "HSA_ENABLE_INTERRUPT=0"]

process = Process(executable = executable, cmd = [options.cmd]
                  + options.options.split(), drivers = [gpu_driver], env = env)

for cpu in cpu_list:
    cpu.createThreads()
    cpu.workload = process

for cp in cp_list:
    cp.workload = host_cpu.workload

if fast_forward:
    for i in range(len(future_cpu_list)):
        future_cpu_list[i].workload = cpu_list[i].workload
        future_cpu_list[i].createThreads()

########################## Create the overall system ########################
# List of CPUs that must be switched when moving between KVM and simulation
if fast_forward:
    switch_cpu_list = \
        [(cpu_list[i], future_cpu_list[i]) for i in range(options.num_cpus)]

# Full list of processing cores in the system.
cpu_list = cpu_list + [shader] + cp_list

# creating the overall system
# notice the cpu list is explicitly added as a parameter to System
system = System(cpu = cpu_list,
                mem_ranges = [AddrRange(options.mem_size)],
                cache_line_size = options.cacheline_size,
                mem_mode = mem_mode)
if fast_forward:
    system.future_cpu = future_cpu_list
system.voltage_domain = VoltageDomain(voltage = options.sys_voltage)
system.clk_domain = SrcClockDomain(clock = options.sys_clock,
                                   voltage_domain = system.voltage_domain)

if fast_forward:
    have_kvm_support = 'BaseKvmCPU' in globals()
    if have_kvm_support and buildEnv['TARGET_ISA'] == "x86":
        system.vm = KvmVM()
        for i in range(len(host_cpu.workload)):
            host_cpu.workload[i].useArchPT = True
            host_cpu.workload[i].kvmInSE = True
    else:
        fatal("KvmCPU can only be used in SE mode with x86")

# configure the TLB hierarchy
GPUTLBConfig.config_tlb_hierarchy(options, system, shader_idx)

# create Ruby system
system.piobus = IOXBar(width=32, response_latency=0,
                       frontend_latency=0, forward_latency=0)
dma_list = [gpu_hsapp, gpu_cmd_proc]
Ruby.create_system(options, None, system, None, dma_list, None)
system.ruby.clk_domain = SrcClockDomain(clock = options.ruby_clock,
                                        voltage_domain = system.voltage_domain)
gpu_cmd_proc.pio = system.piobus.master
gpu_hsapp.pio = system.piobus.master

for i, dma_device in enumerate(dma_list):
    getattr(system, 'dma_cntrl%d' % i).clk_domain = system.ruby.clk_domain

# attach the CPU ports to Ruby
for i in range(options.num_cpus):
    ruby_port = system.ruby._cpu_ports[i]

    # Create interrupt controller
    system.cpu[i].createInterruptController()

    # Connect cache ports to Ruby
    system.cpu[i].icache_port = ruby_port.slave
    system.cpu[i].dcache_port = ruby_port.slave

    ruby_port.mem_master_port = system.piobus.slave
    if buildEnv['TARGET_ISA'] == "x86":
        system.cpu[i].interrupts[0].pio = system.piobus.master
        system.cpu[i].interrupts[0].int_master = system.piobus.slave
        system.cpu[i].interrupts[0].int_slave = system.piobus.master
        if fast_forward:
            system.cpu[i].itb.walker.port = ruby_port.slave
            system.cpu[i].dtb.walker.port = ruby_port.slave

# attach CU ports to Ruby
# Because of the peculiarities of the CP core, you may have 1 CPU but 2
# sequencers and thus 2 _cpu_ports created. Your GPUs shouldn't be
# hooked up until after the CP. To make this script generic, figure out
# the index as below, but note that this assumes there is one sequencer
# per compute unit and one sequencer per SQC for the math to work out
# correctly.
gpu_port_idx = len(system.ruby._cpu_ports) \
               - options.num_compute_units - options.num_sqc \
               - options.num_scalar_cache
gpu_port_idx = gpu_port_idx - options.num_cp * 2
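# Worked example (assumed values): 1 CPU contributing 1 sequencer, 4 CUs,
# 1 SQC, 1 scalar cache, and 0 CPs give len(_cpu_ports) = 7, so
# gpu_port_idx = 7 - 4 - 1 - 1 - 0 = 1, i.e. the GPU ports start right
# after the CPU's port.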

wavefront_size = options.wf_size
for i in range(n_cu):
    # The pipeline issues wavefront_size number of uncoalesced requests
    # in one GPU issue cycle. Hence wavefront_size mem ports.
    for j in range(wavefront_size):
        system.cpu[shader_idx].CUs[i].memory_port[j] = \
            system.ruby._cpu_ports[gpu_port_idx].slave[j]
    gpu_port_idx += 1
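# With the default --wf-size of 64, each CU therefore exposes 64 memory
# ports, one per lane.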

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_sqc:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].sqc_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

for i in range(n_cu):
    if i > 0 and not i % options.cu_per_scalar_cache:
        print("incrementing idx on ", i)
        gpu_port_idx += 1
    system.cpu[shader_idx].CUs[i].scalar_port = \
        system.ruby._cpu_ports[gpu_port_idx].slave
gpu_port_idx = gpu_port_idx + 1

# attach CP ports to Ruby
for i in range(options.num_cp):
    system.cpu[cp_idx].createInterruptController()
    system.cpu[cp_idx].dcache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2].slave
    system.cpu[cp_idx].icache_port = \
        system.ruby._cpu_ports[gpu_port_idx + i * 2 + 1].slave
    system.cpu[cp_idx].interrupts[0].pio = system.piobus.master
    system.cpu[cp_idx].interrupts[0].int_master = system.piobus.slave
    system.cpu[cp_idx].interrupts[0].int_slave = system.piobus.master
    cp_idx = cp_idx + 1

################# Connect the CPU and GPU via GPU Dispatcher ##################
# The CPU rings the GPU doorbell to notify it of a pending task through
# this interface, and the GPU uses it to notify the CPU of task completion.
# The communication happens through the emulated driver.

# Note this implicit setting of the cpu_pointer, shader_pointer and tlb array
# parameters must be after the explicit setting of the System cpu list
if fast_forward:
    shader.cpu_pointer = future_cpu_list[0]
else:
    shader.cpu_pointer = host_cpu

########################## Start simulation ########################

redirect_paths = [RedirectPath(app_path = "/proc",
                               host_paths =
                               ["%s/fs/proc" % m5.options.outdir]),
                  RedirectPath(app_path = "/sys",
                               host_paths =
                               ["%s/fs/sys" % m5.options.outdir]),
                  RedirectPath(app_path = "/tmp",
                               host_paths =
                               ["%s/fs/tmp" % m5.options.outdir])]

system.redirect_paths = redirect_paths

root = Root(system=system, full_system=False)

hsaTopology.createHsaTopology(options)
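# createHsaTopology populates the redirected /sys tree above with the kfd
# topology nodes the ROCm user-level stack reads to discover the GPU.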

m5.ticks.setGlobalFrequency('1THz')
if options.abs_max_tick:
    maxtick = options.abs_max_tick
else:
    maxtick = m5.MaxTick

# Benchmarks support work item annotations
Simulation.setWorkCountOptions(system, options)

# Checkpointing is not supported by the APU model
if (options.checkpoint_dir != None or
    options.checkpoint_restore != None):
    fatal("Checkpointing not supported by apu model")

checkpoint_dir = None
m5.instantiate(checkpoint_dir)

# Map workload to this address space
host_cpu.workload[0].map(0x10000000, 0x200000000, 4096)

if options.fast_forward:
    print("Switch at instruction count: %d" % cpu_list[0].max_insts_any_thread)

exit_event = m5.simulate(maxtick)

if options.fast_forward:
    if exit_event.getCause() == "a thread reached the max instruction count":
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        exit_event = m5.simulate(maxtick - m5.curTick())
elif options.fast_forward_pseudo_op:
    while exit_event.getCause() == "switchcpu":
        # If we are switching *to* kvm, then the current stats are meaningful
        # Note that we don't do any warmup by default
        if type(switch_cpu_list[0][0]) == FutureCpuClass:
            print("Dumping stats...")
            m5.stats.dump()
        m5.switchCpus(system, switch_cpu_list)
        print("Switched CPUS @ tick %s" % (m5.curTick()))
        m5.stats.reset()
        # This lets us switch back and forth without keeping a counter
        switch_cpu_list = [(x[1], x[0]) for x in switch_cpu_list]
        exit_event = m5.simulate(maxtick - m5.curTick())

print("Ticks:", m5.curTick())
print('Exiting because ', exit_event.getCause())

sys.exit(exit_event.getCode())